/*
 * =====================================================================================
 *
 *       Filename:  hello.cu
 *
 *    Description:  CUDA test
 *
 *        Version:  1.0
 *        Created:  02/06/2012 03:54:42 PM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  Milan Kabat (), kabat@ics.muni.cz
 *        Company:  FI MUNI
 *
 * =====================================================================================
 */

#include <stdio.h>      /* printf */
#include <stdlib.h>     /* malloc, calloc, free */

#include "timer-util.h"
#include "gpu.cuh"

#define INTS_PER_THREAD 4

void make_compact_pcm ( char* pc_matrix, int* pcm, struct coding_params params );

void gpu_encode ( char* source_data, char* parity, int* pc_matrix,
                  struct coding_params );
void gpu_decode ( char* received, int* error_vec, char* pc_matrix,
                  struct coding_params );

__global__ void
encode ( int* data, int* parity, int* pcm, struct coding_params params )
{
    // partial result, one INTS_PER_THREAD-wide slot per thread
    extern __shared__ int shared_mem[];

    int num_packets = params.max_row_weight;
    int p_size = params.packet_size/4;    // packet size in ints

    // shared memory is not zero-initialised, so clear this thread's slot
    // before accumulating into it
    for ( int j = 0; j < INTS_PER_THREAD; j++ )
        shared_mem [ threadIdx.x*INTS_PER_THREAD + j ] = 0;

    // in each iteration, we take the corresponding source packet and XOR its
    // content into the partial result held in shared memory
    for ( int i = 0; i < num_packets; i++ )
    {
        int index = pcm [ blockIdx.x*(num_packets+2) + i ];
        if ( index > -1 && index < params.k )
        {
            for ( int j = 0; j < INTS_PER_THREAD; j++ )
            {
                shared_mem [ threadIdx.x*INTS_PER_THREAD + j ] ^=
                    data [ index*p_size + threadIdx.x*INTS_PER_THREAD + j ];
            }
        }
    }

    // write the accumulated parity packet back to global memory; each thread
    // only ever touches its own slot, so no synchronisation is needed
    for ( int j = 0; j < INTS_PER_THREAD; j++ )
        parity [ blockIdx.x*p_size + threadIdx.x*INTS_PER_THREAD + j ] =
            shared_mem [ threadIdx.x*INTS_PER_THREAD + j ];
}

__global__ void
decode ( int* received, int* error_vec, int* pcm, struct coding_params params )
{
/*  extern __shared__ int shared_mem[];
 *
 *  int num_neighbours = params.max_row_weight + 2;
 *
 *  int p_size = params.packet_size/sizeof(int);
 *  int num_threads = p_size/4;
 *
 *  //load the neighbouring packets into shared memory
 *  int idx = blockIdx.x*blockDim.x + threadIdx.x;
 *  for ( int i = 0; i < num_neighbours; i++ )
 *  {
 *      if ( threadIdx.x == 0 )
 *          shared_mem[i] = pcm [ blockIdx.x*num_neighbours + i ];
 *
 *      __syncthreads();
 *
 *      if ( shared_mem[i] != -1 )
 *      {
 * //       shared_mem [ num_neighbours + threadIdx.x*4 + i ] =
 *      }
 *  }
 *
 *  __syncthreads();
 *  for ( int i = 0; i < (params.packet_size/sizeof(int))/num_threads; i++ )
 *      received [ (params.k + blockIdx.x)*p_size + threadIdx.x*4 + i ] =
 *          pkts [ idx + i ];
 *
 *  __syncthreads();
 */
//  for ( int i = 0; i < params.packet_size/sizeof(int); i++ )
//      received [ params.k*p_size + blockIdx.x*p_size + idx + i ] = pkts [ idx + i ];
}

void
gpu_encode ( char* source_data, char* parity, int* pcm, struct coding_params params )
{
    int* src_data_d;
    int* parity_data_d;
    int* pcm_d;

    short show_info = 0;

    cudaError_t cuda_error;

    struct cudaDeviceProp dev_prop;
    cudaGetDeviceProperties ( &dev_prop, 0 );

    if ( show_info )
    {
        if ( !dev_prop.canMapHostMemory )
            printf ( "Cannot map host memory.\n" );
        printf ( "name: %s\n", dev_prop.name );
        printf ( "totalGlobalMem: %d MB\n",
                 (unsigned int)dev_prop.totalGlobalMem/(1024*1024) );
        printf ( "sharedMemPerBlock: %d kB\n",
                 (unsigned int)dev_prop.sharedMemPerBlock/1024 );
        printf ( "maxThreadsPerBlock: %d\n", dev_prop.maxThreadsPerBlock );
        printf ( "maxThreadsDim: %d\n", dev_prop.maxThreadsDim[0] );
        printf ( "maxThreadsDim: %d\n", dev_prop.maxThreadsDim[1] );
        printf ( "maxThreadsDim: %d\n", dev_prop.maxThreadsDim[2] );
        printf ( "maxGridSize: %d\n", dev_prop.maxGridSize[0] );
    }

//  pcm = (int*) malloc ( params.m*(params.max_row_weight+2)*sizeof(int) );
//  make_compact_pcm ( pc_matrix, pcm, params );
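    /*
     * Added sanity check (an assumption made explicit, not part of the
     * original flow): the encode kernel gives each thread INTS_PER_THREAD
     * ints of a packet, so packet_size must be a whole number of such chunks.
     */
    if ( params.packet_size % (INTS_PER_THREAD * sizeof(int)) != 0 )
        printf ( "packet_size %d is not a multiple of %d bytes\n",
                 params.packet_size, (int)(INTS_PER_THREAD * sizeof(int)) );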
    cuda_error = cudaMalloc ( (void**) &src_data_d, params.k*params.packet_size );
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMalloc returned %d\n", cuda_error );

    cuda_error = cudaMalloc ( (void**) &parity_data_d, params.m*params.packet_size );
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMalloc returned %d\n", cuda_error );

    cuda_error = cudaMemset ( parity_data_d, 0, params.m*params.packet_size );
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMemset returned %d\n", cuda_error );

    cuda_error = cudaMemcpy ( src_data_d, source_data, params.k*params.packet_size,
                              cudaMemcpyHostToDevice );
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMemcpy returned %d\n", cuda_error );

    cuda_error = cudaMalloc ( (void**) &pcm_d,
                              params.m*(params.max_row_weight+2)*sizeof(int) );
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMalloc returned %d\n", cuda_error );

    cuda_error = cudaMemcpy ( pcm_d, pcm, sizeof(int)*params.m*(params.max_row_weight+2),
                              cudaMemcpyHostToDevice );
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMemcpy returned %d\n", cuda_error );

    cuda_error = cudaDeviceSynchronize();
    if ( cuda_error != cudaSuccess )
        printf ( "cudaDeviceSynchronize returned %d\n", cuda_error );

    // one block per parity packet; each thread handles INTS_PER_THREAD ints
    int block_size = (params.packet_size / sizeof(int)) / INTS_PER_THREAD;
    int block_count = params.m;
    int num_bytes_shared = params.packet_size;

//  for ( int i = 0; i < 1000; i++ )
    encode <<< block_count, block_size, num_bytes_shared >>>
        ( src_data_d, parity_data_d, pcm_d, params );

    cuda_error = cudaGetLastError();
    if ( cuda_error != cudaSuccess )
        printf ( "kernel execution returned %d\n", cuda_error );

    cudaDeviceSynchronize();

    cuda_error = cudaMemcpy ( parity, parity_data_d, params.m*params.packet_size,
                              cudaMemcpyDeviceToHost );
    if ( cuda_error != cudaSuccess )
        printf ( "cudaMemcpy from device returned %d\n", cuda_error );

    cudaFree ( src_data_d );
    cudaFree ( parity_data_d );
    cudaFree ( pcm_d );
}
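/*
 * Minimal host-side usage sketch for gpu_encode, added for illustration. It
 * assumes struct coding_params (from gpu.cuh) has at least the fields used
 * above (k, m, packet_size, max_row_weight) and that a compact pcm row lists
 * source-packet indices padded with -1; the numbers are hypothetical.
 */
static void
example_gpu_encode ( void )
{
    struct coding_params params;
    params.k = 4;                  // source packets
    params.m = 2;                  // parity packets
    params.packet_size = 1024;     // bytes; multiple of INTS_PER_THREAD*sizeof(int)
    params.max_row_weight = 3;

    int columns = params.max_row_weight + 2;

    char* source = (char*) calloc ( params.k, params.packet_size );
    char* parity = (char*) malloc ( params.m * params.packet_size );
    int*  pcm    = (int*)  malloc ( params.m * columns * sizeof(int) );

    // start with empty rows; the kernel skips -1 entries
    for ( int i = 0; i < params.m * columns; i++ )
        pcm[i] = -1;

    // parity 0 = packet 0 XOR packet 2, parity 1 = packet 1 XOR packet 3
    pcm[0*columns + 0] = 0;  pcm[0*columns + 1] = 2;
    pcm[1*columns + 0] = 1;  pcm[1*columns + 1] = 3;

    gpu_encode ( source, parity, pcm, params );

    free ( source );
    free ( parity );
    free ( pcm );
}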
void
gpu_decode ( char* received, int* error_vec, char* pc_matrix, struct coding_params params )
{
/*  int* received_d;
 *  int* pcm_d;
 *  int* error_vec_d;
 *  cudaError_t cuda_error;
 *
 *  int k = params.k;
 *  int m = params.m;
 *  int packet_size = params.packet_size;
 *
 *  int* pcm = (int*) malloc ( m*(params.max_row_weight+2)*sizeof(int) );
 *  make_compact_pcm ( pc_matrix, pcm, params );
 *
 *  //allocate space and copy data to device
 *  cuda_error = cudaMalloc ( (void**) &received_d, (k+m)*packet_size );
 *  if ( cuda_error != cudaSuccess )
 *      printf ( "cudaMalloc returned %d\n", cuda_error );
 *
 *  cuda_error = cudaMemcpy ( received_d, received, (k+m)*packet_size,
 *                            cudaMemcpyHostToDevice );
 *  if ( cuda_error != cudaSuccess )
 *      printf ( "cudaMemcpy returned %d\n", cuda_error );
 *
 *  cuda_error = cudaMalloc ( (void**) &pcm_d, m*params.max_row_weight*sizeof(int) );
 *  if ( cuda_error != cudaSuccess )
 *      printf ( "cudaMalloc returned %d\n", cuda_error );
 *
 *  cuda_error = cudaMemcpy ( pcm_d, pcm, sizeof(int)*m*params.max_row_weight,
 *                            cudaMemcpyHostToDevice );
 *  if ( cuda_error != cudaSuccess )
 *      printf ( "cudaMemcpy returned %d\n", cuda_error );
 *
 *  cuda_error = cudaMalloc ( (void**) &error_vec_d, params.num_lost*sizeof(int) );
 *  if ( cuda_error != cudaSuccess )
 *      printf ( "cudaMalloc returned %d\n", cuda_error );
 *
 *  cuda_error = cudaMemcpy ( error_vec_d, error_vec, params.num_lost*sizeof(int),
 *                            cudaMemcpyHostToDevice );
 *  if ( cuda_error != cudaSuccess )
 *      printf ( "cudaMemcpy error_vec returned %d\n", cuda_error );
 *
 *  int block_size = (packet_size/sizeof(int)) / 4;
 *  int block_count = m;
 *  int shared_mem_size = (packet_size + sizeof(int))*(params.max_row_weight+2);
 *
 *  decode <<< block_count, block_size, shared_mem_size >>>
 *      ( received_d, error_vec_d, pcm_d, params );
 *
 *  cuda_error = cudaMemcpy ( received, received_d, (k+m)*packet_size,
 *                            cudaMemcpyDeviceToHost );
 *  if ( cuda_error != cudaSuccess )
 *      printf ( "cudaMemcpy from device returned %d\n", cuda_error );
 *
 *  cudaFree ( received_d );
 */
}

/* void make_compact_pcm ( char* pc_matrix, int* pcm, struct coding_params params )
 * {
 *  //we need to create a compact representation of the sparse pc_matrix
 *
 *  int counter = 0;
 *  int columns = params.max_row_weight + 2;
 *
 *  for ( int i = 0; i < params.m; i++ ) {
 *      for ( int j = 0; j < params.k; j++ )
 *          if ( pc_matrix[i*params.k + j] )
 *          {
 *              pcm[i*columns + counter] = j;
 *              counter++;
 *          }
 *      //add indices from staircase matrix
 *      pcm[i*columns + counter] = params.k + i;
 *      counter++;
 *
 *      if ( i > 0 )
 *      {
 *          pcm[i*columns + counter] = params.k + i - 1;
 *          counter++;
 *      }
 *
 *      if ( counter < columns )
 *          for ( int j = counter; j < columns; j++ )
 *              pcm[i*columns + j] = -1;
 *      counter = 0;
 *  }
 *
 *  for ( int i = 0; i < params.m; i++ )
 *  {
 *      for ( int j = 0; j < columns; j++ )
 *          printf ( "%d, ", pcm[i*columns + j] );
 *      printf ( "\n" );
 *  }
 * }
 */
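/*
 * A compilable version of make_compact_pcm, reconstructed from the commented
 * draft above: each row of the compact matrix holds the column indices of the
 * nonzero entries of that parity-check row, then the staircase indices k+i
 * (and k+i-1 for i > 0), padded with -1 up to max_row_weight+2 entries. This
 * is a sketch following that comment, without the debug printout.
 */
void
make_compact_pcm ( char* pc_matrix, int* pcm, struct coding_params params )
{
    int columns = params.max_row_weight + 2;

    for ( int i = 0; i < params.m; i++ )
    {
        int counter = 0;

        // record the column index of every nonzero entry in row i
        for ( int j = 0; j < params.k; j++ )
            if ( pc_matrix[i*params.k + j] )
                pcm[i*columns + counter++] = j;

        // add indices from the staircase part of the matrix
        pcm[i*columns + counter++] = params.k + i;
        if ( i > 0 )
            pcm[i*columns + counter++] = params.k + i - 1;

        // pad the rest of the row with -1 so the kernels can skip unused slots
        for ( int j = counter; j < columns; j++ )
            pcm[i*columns + j] = -1;
    }
}

/*
 * Worked example (hypothetical numbers): with k = 4, m = 2, max_row_weight = 3
 * and a pc_matrix whose row 0 has ones in columns 0 and 2 and whose row 1 has
 * ones in columns 1 and 3, the compact matrix comes out as
 *
 *     row 0:  0, 2, 4, -1, -1
 *     row 1:  1, 3, 5,  4, -1
 */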