mirror of
https://github.com/outbackdingo/UltraGrid.git
synced 2026-03-20 10:40:09 +00:00
296 lines
8.9 KiB
Plaintext
296 lines
8.9 KiB
Plaintext
/*
|
|
* =====================================================================================
|
|
*
|
|
* Filename: hello.cu
|
|
*
|
|
* Description: CUDA test
|
|
*
|
|
* Version: 1.0
|
|
* Created: 02/06/2012 03:54:42 PM
|
|
* Revision: none
|
|
* Compiler: gcc
|
|
*
|
|
* Author: Milan Kabat (), kabat@ics.muni.cz
|
|
* Company: FI MUNI
|
|
*
|
|
* =====================================================================================
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <cuda.h>
|
|
|
|
#include "timer-util.h"
|
|
#include "gpu.cuh"
|
|
|
|
#define INTS_PER_THREAD 4
|
|
|
|
void make_compact_pcm ( char* pc_matrix, int* pcm, struct coding_params params );
|
|
|
|
void gpu_encode ( char* source_data, char* parity, int* pc_matrix, struct coding_params );
|
|
|
|
void gpu_decode ( char* received, int* error_vec, char* pc_matrix, struct coding_params );
|
|
|
|
__global__ void encode ( int* data, int* parity, int* pcm, struct coding_params params )
|
|
{
|
|
|
|
//partial result
|
|
extern __shared__ int shared_mem[];
|
|
// int idx = blockIdx.x*blockDim.x + threadIdx.x;
|
|
int num_packets = params.max_row_weight;
|
|
int p_size = params.packet_size/4;
|
|
|
|
//in each iteration, we copy corresponding packet to shared memory and XOR its content
|
|
//into partial result
|
|
for ( int i = 0; i < num_packets; i++ )
|
|
{
|
|
int index = pcm [ blockIdx.x*(num_packets+2) + i ];
|
|
|
|
if ( index > -1 && index < params.k )
|
|
{
|
|
for ( int j = 0; j < INTS_PER_THREAD; j++ )
|
|
{
|
|
shared_mem [ threadIdx.x*INTS_PER_THREAD + j ] ^=
|
|
data [ index*p_size + threadIdx.x*INTS_PER_THREAD + j ];
|
|
}
|
|
}
|
|
}
|
|
|
|
for ( int j = 0; j < INTS_PER_THREAD; j++ )
|
|
{
|
|
parity [ blockIdx.x*p_size + threadIdx.x*INTS_PER_THREAD + j ] =
|
|
shared_mem [ threadIdx.x*INTS_PER_THREAD + j];
|
|
shared_mem [ threadIdx.x*INTS_PER_THREAD + j] = 0;
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
}
|
|
|
|
__global__ void decode ( int* received, int* error_vec, int* pcm, struct coding_params params )
|
|
{
|
|
/* extern __shared__ int shared_mem[];
|
|
*
|
|
* int num_neighbours = params.max_row_weight + 2;
|
|
*
|
|
* int p_size = params.packet_size/sizeof(int);
|
|
* int num_threads = p_size/4;
|
|
*
|
|
* //load the neighbouring packets into shared memory
|
|
* int idx = blockIdx.x*blockDim.x + threadIdx.x;
|
|
* for ( int i = 0; i < num_beighbours; i++ )
|
|
* {
|
|
* if ( threadIdx.x == 0 )
|
|
* shared_mem[i] = pcm [ blockIdx.x*num_beighbours + i ];
|
|
*
|
|
* __syncthreads();
|
|
*
|
|
* if ( shared_mem[i] != -1 )
|
|
* {
|
|
* // shared_mem [ num_neighbours + threadIdx.x*4 + i ] =
|
|
*
|
|
*
|
|
*
|
|
*
|
|
* }
|
|
* }
|
|
*
|
|
* __syncthreads();
|
|
* for ( int i = 0; i < (params.packet_size/sizeof(int))/num_threads; i++ )
|
|
* received [ (params.k + blockIdx.x)*p_size + threadIdx.x*4 + i] =
|
|
* pkts [ idx + i ];
|
|
*
|
|
* __syncthreads();
|
|
*/
|
|
|
|
// for ( int i = 0; i < params.packet_size/sizeof(int); i++ )
|
|
// received [ params.k*p_size + blockIdx.x*p_size + idx + i ] = pkts [ idx + i ];
|
|
|
|
|
|
|
|
}
|
|
|
|
void gpu_encode ( char* source_data, char* parity, int* pcm, struct coding_params params )
|
|
{
|
|
int* src_data_d;
|
|
int* parity_data_d;
|
|
int* pcm_d;
|
|
short show_info = 0;
|
|
|
|
cudaError_t cuda_error;
|
|
|
|
struct cudaDeviceProp dev_prop;
|
|
|
|
cudaGetDeviceProperties (&dev_prop, 0);
|
|
|
|
if (show_info)
|
|
{
|
|
if (!dev_prop.canMapHostMemory)
|
|
printf("Cannot map host memory.\n");
|
|
printf ( "name: %s\n", dev_prop.name );
|
|
printf ( "totalGlobalMem: %d MB\n", (unsigned int)dev_prop.totalGlobalMem/(1024*1024) );
|
|
printf ( "sharedMemPerBlock: %d kB\n", (unsigned int)dev_prop.sharedMemPerBlock/1024 );
|
|
printf ( "maxThreadsPerBlock: %d\n", dev_prop.maxThreadsPerBlock );
|
|
printf ( "maxThreadsDim: %d\n", dev_prop.maxThreadsDim[0] );
|
|
printf ( "maxThreadsDim: %d\n", dev_prop.maxThreadsDim[1] );
|
|
printf ( "maxThreadsDim: %d\n", dev_prop.maxThreadsDim[2] );
|
|
printf ( "maxGridSize: %d\n", dev_prop.maxGridSize[0] );
|
|
}
|
|
|
|
// pcm = (int*) malloc (params.m*(params.max_row_weight+2)*sizeof(int*));
|
|
|
|
// make_compact_pcm ( pc_matrix, pcm, params );
|
|
|
|
cuda_error = cudaMalloc ( (void**) &src_data_d, params.k*params.packet_size);
|
|
if ( cuda_error != cudaSuccess )
|
|
printf ( "cudaMalloc returned %d\n", cuda_error );
|
|
|
|
cuda_error = cudaMalloc ( (void**) &parity_data_d, params.m*params.packet_size);
|
|
if ( cuda_error != cudaSuccess )
|
|
printf ( "cudaMalloc returned %d\n", cuda_error );
|
|
|
|
cuda_error = cudaMemset ( parity_data_d, 0, params.m*params.packet_size);
|
|
if ( cuda_error != cudaSuccess )
|
|
printf ( "cudaMemset returned %d\n", cuda_error );
|
|
|
|
cuda_error = cudaMemcpy ( src_data_d, source_data, params.k*params.packet_size,
|
|
cudaMemcpyHostToDevice );
|
|
if ( cuda_error != cudaSuccess )
|
|
printf ( "cudaMemcpy returned %d\n", cuda_error );
|
|
|
|
cuda_error = cudaMalloc ( (void**) &pcm_d, params.m*(params.max_row_weight+2)*sizeof(int));
|
|
if ( cuda_error != cudaSuccess )
|
|
printf ( "cudaMalloc return %d\n", cuda_error );
|
|
|
|
cuda_error = cudaMemcpy ( pcm_d, pcm, sizeof(int)*params.m*(params.max_row_weight+2),
|
|
cudaMemcpyHostToDevice );
|
|
if ( cuda_error != cudaSuccess )
|
|
printf ( "cudaMempcy return %d\n", cuda_error );
|
|
cuda_error = cudaDeviceSynchronize();
|
|
|
|
if ( cuda_error != cudaSuccess )
|
|
printf ( "cudaSyn returned %d\n", cuda_error );
|
|
|
|
|
|
int block_size = (params.packet_size / sizeof(int))/INTS_PER_THREAD;
|
|
int block_count = params.m;
|
|
|
|
int num_bytes_shared = params.packet_size;
|
|
|
|
|
|
// for ( int i = 0; i < 1000; i++)
|
|
encode <<< block_count, block_size, num_bytes_shared >>> (src_data_d, parity_data_d,
|
|
pcm_d, params );
|
|
|
|
cuda_error = cudaGetLastError();
|
|
if ( cuda_error != cudaSuccess )
|
|
printf("kernel execution returned %d\n", cuda_error);
|
|
|
|
cudaThreadSynchronize();
|
|
|
|
|
|
cudaMemcpy ( parity, parity_data_d, params.m*params.packet_size, cudaMemcpyDeviceToHost );
|
|
cuda_error = cudaGetLastError();
|
|
if ( cuda_error != cudaSuccess )
|
|
printf("cudaMemcpy from device returned %d\n", cuda_error);
|
|
|
|
cudaFree(src_data_d);
|
|
cudaFree(parity_data_d);
|
|
|
|
}
|
|
|
|
void gpu_decode ( char* received, int* error_vec, char* pc_matrix, struct coding_params params )
|
|
{
|
|
/* int* received_d;
|
|
* int* pcm_d;
|
|
* int* error_vec_d;
|
|
* cudaError_t cuda_error;
|
|
*
|
|
* int k = params.k;
|
|
* int m = params.m;
|
|
* int packet_size = params.packet_size;
|
|
*
|
|
* int **pcm = make_compact_pcm ( pc_matrix, params );
|
|
*
|
|
* //alocate space and copy data to device
|
|
* cuda_error = cudaMalloc ( (void**) &received_d, (k+m)*packet_size);
|
|
* if ( cuda_error != cudaSuccess )
|
|
* printf ( "cudaMalloc return %d\n", cuda_error );
|
|
*
|
|
* cuda_error = cudaMemcpy ( received_d, received, (k+m)*packet_size, cudaMemcpyHostToDevice );
|
|
* if ( cuda_error != cudaSuccess )
|
|
* printf ( "cudaMempcy return %d\n", cuda_error );
|
|
*
|
|
* cuda_error = cudaMalloc ( (void**) &pcm_d, m*params.max_row_weight*sizeof(int));
|
|
* if ( cuda_error != cudaSuccess )
|
|
* printf ( "cudaMalloc return %d\n", cuda_error );
|
|
*
|
|
* cuda_error = cudaMemcpy ( pcm_d, pcm, sizeof(int)*m*params.max_row_weight,
|
|
* cudaMemcpyHostToDevice );
|
|
* if ( cuda_error != cudaSuccess )
|
|
* printf ( "cudaMempcy return %d\n", cuda_error );
|
|
*
|
|
* cuda_error = cudaMalloc ( (void**) &error_vec_d, params.num_lost*sizeof(int));
|
|
* if ( cuda_error != cudaSuccess )
|
|
* printf ( "cudaMalloc return %d\n", cuda_error );
|
|
*
|
|
* cuda_error = cudaMemcpy ( pcm_d, pcm, params.num_lost*sizeof(int),
|
|
* cudaMemcpyHostToDevice );
|
|
* if ( cuda_error != cudaSuccess )
|
|
* printf ( "cudaMempcy pcm return %d\n", cuda_error );
|
|
*
|
|
* int block_size = (packet_size/sizeof(int)) / 4;
|
|
* int block_count = m;
|
|
* int shared_mem_size = (packet_size + sizeof(int))*(params.max_row_weight+2);
|
|
*
|
|
* decode <<< block_count, block_size, shared_mem_size >>> (received_d, error_vec, pcm_d, params );
|
|
*
|
|
* cuda_error = cudaMemcpy ( received, received_d, (k+m)*packet_size, cudaMemcpyDeviceToHost );
|
|
* if ( cuda_error != cudaSuccess )
|
|
* printf ( "cudaMempcy from device return %d\n", cuda_error );
|
|
*
|
|
* cudaFree ( received_d );
|
|
*/
|
|
}
|
|
|
|
/* void make_compact_pcm ( char* pc_matrix, int* pcm, struct coding_params params)
|
|
* {
|
|
* //we need to create a compact representation of sparse pc_matrix
|
|
*
|
|
* int counter = 0;
|
|
* int columns = params.max_row_weight + 2;
|
|
*
|
|
* for ( int i = 0; i < params.m; i++) {
|
|
* for ( int j = 0; j < params.k; j++)
|
|
* if ( pc_matrix[i*params.k + j] )
|
|
* {
|
|
* pcm[i*columns + counter] = j;
|
|
* counter++;
|
|
* }
|
|
* //add indices from staircase matrix
|
|
* pcm[i*columns + counter] = params.k + i;
|
|
* counter++;
|
|
*
|
|
* if ( i > 0 )
|
|
* {
|
|
* pcm[i*columns + counter] = params.k + i - 1;
|
|
* counter++;
|
|
* }
|
|
*
|
|
* if ( counter < columns )
|
|
* for ( int j = counter; j < columns; j++)
|
|
* pcm[i*columns + j] = -1;
|
|
* counter = 0;
|
|
* }
|
|
*
|
|
*
|
|
* for ( int i = 0; i < params.m; i++)
|
|
* {
|
|
* for ( int j = 0; j < columns; j++ )
|
|
* printf ( "%d, ", pcm[i*columns + j] );
|
|
* printf ( "\n" );
|
|
* }
|
|
*
|
|
*
|
|
* }
|
|
*/
|