/* * ===================================================================================== * * Filename: hello.cu * * Description: CUDA test * * Version: 1.0 * Created: 02/06/2012 03:54:42 PM * Revision: none * Compiler: gcc * * Authors: Milan Kabat (kabat@ics.muni.cz), Vojtech David (374572@mail.muni.cz) * Company: FI MUNI * * ===================================================================================== */ #include #include #include #include #include #include #include "timer-util.h" #include "gpu.cuh" #include "ldgm-session-gpu.h" __device__ unsigned int count = 0; __device__ unsigned int count_M = 0; char *xor_using_sse2 (char *source, char *dest, int packet_size) { //First, do as many 128-bit XORs as possible int iter_bytes_16 = 0; int iter_bytes_4 = 0; int iter_bytes_1 = 0; iter_bytes_16 = (packet_size / 16) * 16; if ( iter_bytes_16 > 0) { // printf ( "iter_bytes: %d\n", iter_bytes ); __m128i *wrd_ptr = (__m128i *) source; __m128i *wrd_end = (__m128i *) (source + iter_bytes_16); __m128i *dst_ptr = (__m128i *) dest; // printf ( "wrd_ptr address: %p\n", wrd_ptr ); do { __m128i xmm1 = _mm_loadu_si128(wrd_ptr); __m128i xmm2 = _mm_loadu_si128(dst_ptr); xmm1 = _mm_xor_si128(xmm1, xmm2); // XOR 4 32-bit words _mm_storeu_si128(dst_ptr, xmm1); ++wrd_ptr; ++dst_ptr; } while (wrd_ptr < wrd_end); } //Check, whether further XORing is necessary if ( iter_bytes_16 < packet_size ) { char *mark_source = source + iter_bytes_16; char *mark_dest = dest + iter_bytes_16; iter_bytes_4 = ((packet_size - iter_bytes_16) / 4) * 4; for ( int i = 0; i < (packet_size - iter_bytes_16) / 4; i++) { int *s = ((int *) mark_source) + i; int *d = ((int *) mark_dest) + i; *d ^= *s; } mark_source += iter_bytes_4; mark_dest += iter_bytes_4; iter_bytes_1 = packet_size - iter_bytes_16 - iter_bytes_4; for ( int i = 0; i < iter_bytes_1; i++) { *(mark_dest + i) ^= *(mark_source + i); } } return dest; } void gpu_encode_upgrade (char * source_data,int *OUTBUF, int * PCM,int param_k,int param_m,int w_f,int packet_size ,int buf_size) { // cudaError_t error; int blocksize = packet_size/sizeof(int); // printf("blocksize: %d\npacket size: %d\n",blocksize,packet_size ); if(blocksize>256){ if(blocksize>1024) blocksize=1024; // puts("big one"); frame_encode_int_big <<< param_m, blocksize, packet_size >>> (OUTBUF,PCM, param_k, param_m, w_f, packet_size); cuda_check_error("frame_encode_int_big"); frame_encode_staircase<<< 1, blocksize, packet_size >>> (OUTBUF, PCM, param_k, param_m, w_f, packet_size); cuda_check_error("frame_encode_staircase"); cudaMemcpy(source_data + param_k*packet_size,OUTBUF + (param_k*packet_size)/4, param_m*packet_size,cudaMemcpyDeviceToHost ); // // cudaMemcpy(source_data,OUTBUF, buf_size,cudaMemcpyDeviceToHost ); cuda_check_error("memcpy out_buf"); // gettimeofday(&t0, 0); // for ( int m = 1; m < param_m; ++m) // { // char *prev_parity = (char *) source_data + (param_k + m - 1) * packet_size; // char *parity_packet = (char *) source_data + (param_k + m) * packet_size; // xor_using_sse2(prev_parity, parity_packet, packet_size); // } // gettimeofday(&t1, 0); // long elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; // printf("time staircase: %f\n",elapsed/1000.0 ); } else{ // puts("chudy soused"); frame_encode_int <<< param_m, blocksize, packet_size >>> (OUTBUF,PCM, param_k, param_m, w_f, packet_size); cuda_check_error("frame_encode_int"); cudaMemcpy(source_data + param_k*packet_size,OUTBUF + (param_k*packet_size)/4, param_m*packet_size,cudaMemcpyDeviceToHost ); // cudaMemcpy(source_data,OUTBUF, buf_size,cudaMemcpyDeviceToHost ); cuda_check_error("memcpyu out_buf"); for ( int m = 1; m < param_m; ++m) { char *prev_parity = (char *) source_data + (param_k + m - 1) * packet_size; char *parity_packet = (char *) source_data + (param_k + m) * packet_size; xor_using_sse2(prev_parity, parity_packet, packet_size); } } // cudaEvent_t start, stop; // float time; // cudaEventCreate(&start); // cudaEventCreate(&stop); // cudaEventRecord(start, 0); // cudaStream_t pStream; // cudaStreamCreate(&pStream); // frame_encode_staircase<<< 1, blocksize, packet_size >>> (OUTBUF, PCM, param_k, param_m, w_f, packet_size); // cuda_check_error("frame_encode_staircase"); // cudaStreamSynchronize(); // cudaDeviceSynchronize(); return; } void gpu_decode_upgrade(char *data, int * PCM,int* SYNC_VEC,int* ERROR_VEC, int not_done, int *frame_size,int * error_vec,int * sync_vec,int M,int K,int w_f,int buf_size,int packet_size) { cudaError_t error; int *received_d; // int M = params->m; // int K = params->k; // int w_f = params->max_row_weight + 2; // int buf_size = params->buf_size; int* received = (int*) data; // printf("K: %d, M: %d, max_row_weight: %d, buf_size: %d,\n packet_size: %d\n",K,M,w_f,buf_size,packet_size ); // printf("NOT DONE: %d\n",not_done ); error = cudaHostRegister(received, buf_size, cudaHostRegisterMapped); if (error != cudaSuccess) printf("1 %s\n", cudaGetErrorString(error)); error = cudaHostGetDevicePointer((void **) & (received_d), (void *)received, 0); if (error != cudaSuccess) printf("2 %s\n", cudaGetErrorString(error)); // error = cudaMalloc(&received_d, buf_size); // if (error != cudaSuccess) printf("1 %s\n", cudaGetErrorString(error)); // error = cudaMemcpy(received_d, received, buf_size, cudaMemcpyHostToDevice); // if (error != cudaSuccess) printf("2 %s\n", cudaGetErrorString(error)); // error = cudaMalloc(&pcm_d, w_f * M * sizeof(int)); // if (error != cudaSuccess) printf("3 %s\n", cudaGetErrorString(error)); // error = cudaMemcpy(pcm_d, PCM, w_f * M * sizeof(int), cudaMemcpyHostToDevice); // if (error != cudaSuccess) printf("4 %s\n", cudaGetErrorString(error)); // error = cudaMalloc(&error_vec_d, (K + M) * sizeof(int)); // if (error != cudaSuccess)printf("5 %s\n", cudaGetErrorString(error)); // error = cudaMemcpy(error_vec_d, error_vec, (K + M) * sizeof(int), cudaMemcpyHostToDevice); // if (error != cudaSuccess) printf("6 %s\n", cudaGetErrorString(error)); // error = cudaMalloc(&sync_vec_d, (K + M) * sizeof(int)); // if (error != cudaSuccess) printf("7 %s\n", cudaGetErrorString(error)); // error = cudaMemcpy(sync_vec_d, sync_vec, (K + M) * sizeof(int), cudaMemcpyHostToDevice); // if (error != cudaSuccess) printf("8 %s\n", cudaGetErrorString(error)); int ps = packet_size/sizeof(int); int blocksize = packet_size/sizeof(int) +1; // printf("blocksize: %d\npacket size: %d\n",blocksize,packet_size ); if(blocksize>512) blocksize=512; int not_done_source=0; for (int i = 0; i < K; i++) { if (error_vec[i] == 1) not_done_source++; } // printf("not_done %d\n",not_done ); // printf("not_done_source %d\n",not_done_source); unsigned int count_host = 0; unsigned int count_host_M = 0; error = cudaMemcpyToSymbol ( count, (void *)(&count_host), sizeof(unsigned int), 0, cudaMemcpyHostToDevice); error = cudaMemcpyToSymbol ( count_M, (void *)(&count_host_M), sizeof(unsigned int), 0, cudaMemcpyHostToDevice); int i = 0; for (i = 1; i < 30; ++i) { //__global__ void frame_decode_int(int * received, int * pcm, int * error_vec,int * sync_vec,int packet_size,int max_row_weight,int K); frame_decode_int <<< M, blocksize , packet_size >>> (received_d, PCM, ERROR_VEC, SYNC_VEC, ps, w_f-2, K); error = cudaGetLastError(); if (error != cudaSuccess) printf("3 %s\n", cudaGetErrorString(error)); // cudaDeviceSynchronize(); //error = cudaMemcpyFromSymbol((void *)(&count_host), count, sizeof(int), 0, cudaMemcpyDeviceToHost); count_host=0; error = cudaMemcpyFromSymbol((void*)(&count_host), count, sizeof(unsigned int), 0, cudaMemcpyDeviceToHost); if (error != cudaSuccess) printf("10 %s\n", cudaGetErrorString(error)); // printf("count host %d\n",count_host ); if (count_host == not_done_source) { break; } error = cudaMemcpyFromSymbol((void*)(&count_host_M), count_M, sizeof(unsigned int), 0, cudaMemcpyDeviceToHost); // printf("count host_M %d\n",count_host_M ); if (count_host_M == M) { break; } count_host_M = 0; error = cudaMemcpyToSymbol ( count_M, (void *)(&count_host_M), sizeof(unsigned int), 0, cudaMemcpyHostToDevice); } // printf("iterace: %d\n",i); // cudaDeviceSynchronize(); //cudaThreadSynchronize(); error = cudaMemcpy(error_vec, ERROR_VEC, (K + M) * sizeof(int), cudaMemcpyDeviceToHost); int a = 0; int fs = 0; for (int i = 0; i < K; i++) { if (error_vec[i] == 1) a++; } // printf("UNDECODED: %d NOT DONE: %d DEKODOVANO: %d\n",a,not_done,not_done-a); if (a != 0) { *frame_size = 0; } else { memcpy(&fs, received, 4); // printf("received size %d\n",fs ); *frame_size = fs; } // printf("undecoded: %d, frame_size: %d, undecoded subtract: %d\n",a,fs,not_done-a ); cudaHostUnregister(received); // cudaFree(received_d); // cudaFree(pcm_d); // cudaFree(error_vec_d); // cudaFree(sync_vec_d); // cudaFree(params_d); // puts("END"); return; } __global__ void frame_encode_int_big(int *data, int *pcm,int param_k,int param_m,int w_f,int packet_size) { int ps = packet_size/sizeof(int); int bx = blockIdx.x; int x = threadIdx.x; int offset; // printf("K: %d M: %d max_row_weight: %d packet_size: %d\n",param_k,param_m,max_row_weight,ps); extern __shared__ int parity_packet[]; // int *parity_packet = data + (param_k + bx) * ps; // if(x==0)printf("bx %d has parity packet at: %d,%d\n",bx,param_k*ps + bx*ps,param_k+bx ); offset = x; while (offset < ps) { parity_packet[offset]=0; offset += blockDim.x; } // __syncthreads(); for ( int i = 0; i < w_f; i++) { int idx = pcm[bx * w_f + i]; //printf ( "adept: %d\n", idx ); // if(x==0) printf ("block %d xor packet: %d\n",bx,idx); if (idx > -1 && idx < param_k) { //xoring parity_packet ^ idx offset = x; while (offset < ps) { parity_packet[offset]^=data[idx*ps + offset]; offset += blockDim.x; } } } // __syncthreads(); offset = x; while (offset < ps) { data[(param_k + bx) * ps + offset]= parity_packet[offset]; offset += blockDim.x; } } __global__ void frame_encode_int(int *data, int *pcm,int param_k,int param_m,int w_f,int packet_size) { int ps = packet_size/sizeof(int); int bx = blockIdx.x; int offset = threadIdx.x; // printf("K: %d M: %d max_row_weight: %d packet_size: %d\n",param_k,param_m,max_row_weight,ps); extern __shared__ int parity_packet[]; // int *parity_packet = data + (param_k + bx) * ps; // if(x==0)printf("bx %d has parity packet at: %d,%d\n",bx,param_k*ps + bx*ps,param_k+bx ); // while (offset < ps) // { // parity_packet[offset]=0; // offset += blockDim.x; // } parity_packet[offset]=0; // __syncthreads(); for ( int i = 0; i < w_f; i++) { int idx = pcm[bx * w_f + i]; //printf ( "adept: %d\n", idx ); // if(x==0) printf ("block %d xor packet: %d\n",bx,idx); if (idx > -1 && idx < param_k) { //xoring parity_packet ^ idx // offset = x; // while (offset < ps) // { // parity_packet[offset]^=data[idx*ps + offset]; // offset += blockDim.x; // } parity_packet[offset]^=data[idx*ps + offset]; } } // __syncthreads(); // offset = x; // while (offset < ps) // { // data[(param_k + bx) * ps + offset]= parity_packet[offset]; // offset += blockDim.x; // } data[(param_k + bx) * ps + offset]= parity_packet[offset]; // __syncthreads(); } __global__ void frame_decode_int(int *received, int *pcm, int *error_vec, int *sync_vec, int packet_size, int max_row_weight, int K) { //TITAN __shared__ int undecoded; __shared__ int undecoded_index; __shared__ int ret; extern __shared__ int shared_parity_packet[]; int w_f = max_row_weight + 2; int ps = packet_size; int bx = blockIdx.x; int x = threadIdx.x; int offset = 0; if (x == 0) { ret = 0; undecoded = 0; undecoded_index = -1; for (int j = 0; j < w_f; j++) { int p = pcm[bx * w_f + j]; //printf("%d %d %d\n",p, error_vec[p],x); if (p != -1 && error_vec[p] == 1) { undecoded++; undecoded_index = p; } } if (undecoded == 1) { ret = atomicCAS(sync_vec + undecoded_index, 1, 0); } } __syncthreads(); if (ret == 1) { // if(x==0) printf("decoding %7d, bx %7d\n",undecoded_index,bx ); offset = x; while (offset < ps) { shared_parity_packet[offset]=0x0; offset += blockDim.x; } /*int zbyva = ps - offset; if (x < zbyva) { shared_parity_packet[x + offset] = 0; }*/ __syncthreads(); // if(x==0) printf("decoding [%d]\n",undecoded_index); for (int j = 0; j < w_f; j++) { int index = pcm[bx * w_f + j]; if (index != undecoded_index && index != -1) { offset = x; while ( offset < ps) { shared_parity_packet[offset] ^= received[index*ps + offset]; offset += blockDim.x; }/* int zbyva = ps - offset; if (x < zbyva) { shared_parity_packet[x + offset] ^= received[(index * ps) + x + offset]; }*/ } } __syncthreads(); offset = x; while ( offset < ps) { // *((int *)(received + (undecoded_index * ps) + 4*x + a)) = *((int *)(shared_parity_packet + a + 4 * x)); received[(undecoded_index * ps) + offset] = shared_parity_packet[offset]; offset += blockDim.x; } /* zbyva = ps - offset; if (x < zbyva) { received[(undecoded_index * ps) + x + offset] = shared_parity_packet[x + offset]; }*/ } if (x == 0 && ret == 1) { //error_vec[undecoded_index]=0; atomicCAS(error_vec + undecoded_index, 1, 0); // printf("node %d %d done\n",undecoded_index); } if (x == 0 && ret==1 && undecoded_index