diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu index 43088a38f..c9f3c8205 100644 --- a/src/cuda_wrapper/kernels.cu +++ b/src/cuda_wrapper/kernels.cu @@ -176,20 +176,20 @@ kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x, unsigned size_y) #ifdef DEBUG #include -#define MEASURE_KERNEL_DURATION_START \ +#define MEASURE_KERNEL_DURATION_START(stream) \ cudaEvent_t t0, t1; \ cudaEventCreate(&t0); \ cudaEventCreate(&t1); \ cudaEventRecord(t0, stream); -#define MEASURE_KERNEL_DURATION_STOP \ +#define MEASURE_KERNEL_DURATION_STOP(stream) \ cudaEventRecord(t1, stream); \ cudaEventSynchronize(t1); \ float elapsedTime = NAN; \ cudaEventElapsedTime(&elapsedTime, t0, t1); \ printf("elapsed time: %f\n", elapsedTime); #else -#define MEASURE_KERNEL_DURATION_START -#define MEASURE_KERNEL_DURATION_STOP +#define MEASURE_KERNEL_DURATION_START(stream) +#define MEASURE_KERNEL_DURATION_STOP(stream) #endif /** @@ -215,14 +215,128 @@ int postprocess_rg48_to_r12l( dim3 threads_per_block(256); dim3 blocks((((size_x + 7) / 8) + 255) / 256, size_y); - MEASURE_KERNEL_DURATION_START + MEASURE_KERNEL_DURATION_START(stream) kernel_rg48_to_r12l<<>>( (uint8_t *) input_samples, (uint8_t *) output_buffer, size_x, size_y); - MEASURE_KERNEL_DURATION_STOP + MEASURE_KERNEL_DURATION_STOP(stream) return 0; } + +/// adapted variant of @ref vc_copylineR12LtoRG48 +__global__ void +kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x, unsigned size_y) +{ + unsigned position_x = threadIdx.x + blockIdx.x * blockDim.x; + unsigned position_y = threadIdx.y + blockIdx.y * blockDim.y; + if (position_x > (size_x + 7) / 8) { + return; + } + // drop last block if not aligned (prevent OOB read from input) + if (position_y == size_y - 1 && position_x > size_x / 8) { + return; + } + uint8_t *dst = out + 2 * (position_y * 3 * size_x + position_x * 3 * 8); + uint8_t *src = + in + (position_y * ((size_x + 7) / 8) + position_x) * 36; + + // 0 + // R + *dst++ = src[0] << 4; + *dst++ = (src[1] << 4) | (src[0] >> 4); + // G + *dst++ = src[1] & 0xF0; + *dst++ = src[2]; + // B + *dst++ = src[3] << 4; + *dst++ = (src[4 + 0] << 4) | (src[3] >> 4); + + // 1 + *dst++ = src[4 + 0] & 0xF0; + *dst++ = src[4 + 1]; + + *dst++ = src[4 + 2] << 4; + *dst++ = (src[4 + 3] << 4) | (src[4 + 2] >> 4); + + *dst++ = src[4 + 3] & 0xF0; + *dst++ = src[8 + 0]; + + // 2 + *dst++ = src[8 + 1] << 4; + *dst++ = (src[8 + 2] << 4) | (src[8 + 1] >> 4); + + *dst++ = src[8 + 2] & 0xF0; + *dst++ = src[8 + 3]; + + *dst++ = src[12 + 0] << 4; + *dst++ = (src[12 + 1] << 4) | (src[12 + 0] >> 4); + + // 3 + *dst++ = src[12 + 1] & 0xF0; + *dst++ = src[12 + 2]; + + *dst++ = src[12 + 3] << 4; + *dst++ = (src[16 + 0] << 4) | (src[12 + 3] >> 4); + + *dst++ = src[16 + 0] & 0xF0; + *dst++ = src[16 + 1]; + + // 4 + *dst++ = src[16 + 2] << 4; + *dst++ = (src[16 + 3] << 4) | (src[16 + 2] >> 4); + + *dst++ = src[16 + 3] & 0xF0; + *dst++ = src[20 + 0]; + + *dst++ = src[20 + 1] << 4; + *dst++ = (src[20 + 2] << 4) | (src[20 + 1] >> 4); + + // 5 + *dst++ = src[20 + 2] & 0xF0; + *dst++ = src[20 + 3]; + + *dst++ = src[24 + 0] << 4; + *dst++ = (src[24 + 1] << 4) | (src[24 + 0] >> 4); + + *dst++ = src[24 + 1] & 0xF0; + *dst++ = src[24 + 2]; + + // 6 + *dst++ = src[24 + 3] << 4; + *dst++ = (src[28 + 0] << 4) | (src[24 + 3] >> 4); + + *dst++ = src[28 + 0] & 0xF0; + *dst++ = src[28 + 1]; + + *dst++ = src[28 + 2] << 4; + *dst++ = (src[28 + 3] << 4) | (src[28 + 2] >> 4); + + // 7 + *dst++ = src[28 + 3] & 0xF0; + *dst++ = src[32 + 0]; + + *dst++ = src[32 + 1] << 4; + *dst++ = (src[32 + 2] << 4) | (src[32 + 1] >> 4); + + *dst++ = src[32 + 2] & 0xF0; + *dst++ = src[32 + 3]; +} + +void +preprocess_r12l_to_rg48(int width, int height, void *src, void *dst) +{ + (void) width, (void) height, (void) src, (void) dst; + dim3 threads_per_block(256); + dim3 blocks((((width + 7) / 8) + 255) / 256, height); + + MEASURE_KERNEL_DURATION_START(0) + kernel_r12l_to_rg48<<>>( + (uint8_t *) src, (uint8_t *) dst, width, + height); + MEASURE_KERNEL_DURATION_STOP(0) +} + diff --git a/src/cuda_wrapper/kernels.hpp b/src/cuda_wrapper/kernels.hpp index 159d65c98..b23aaab6b 100644 --- a/src/cuda_wrapper/kernels.hpp +++ b/src/cuda_wrapper/kernels.hpp @@ -59,4 +59,6 @@ int postprocess_rg48_to_r12l( void * stream ); +void preprocess_r12l_to_rg48(int width, int height, void *src, void *dst); + #endif // defined CUDA_WRAPPER_KERNELS_HPP_1A3F7B57_EE91_4363_8D50_9CDDC60CB74F diff --git a/src/video_compress/cmpto_j2k.cpp b/src/video_compress/cmpto_j2k.cpp index aba5bb2d2..28cfcc48d 100644 --- a/src/video_compress/cmpto_j2k.cpp +++ b/src/video_compress/cmpto_j2k.cpp @@ -55,13 +55,13 @@ #include #include #include -#include #include #include #ifdef HAVE_CUDA #include "cuda_wrapper.h" +#include "cuda_wrapper/kernels.hpp" #endif #include "debug.h" #include "host.h" @@ -129,6 +129,8 @@ struct cmpto_j2k_enc_cuda_buffer_data_allocator }; #endif +typedef void (*cuda_convert_func_t)(int width, int height, void *src, void *dst); + struct state_video_compress_j2k { struct module module_data{}; struct cmpto_j2k_enc_ctx *context{}; @@ -144,6 +146,9 @@ struct state_video_compress_j2k { video_desc saved_desc{}; codec_t precompress_codec = VC_NONE; video_desc compressed_desc{}; + + cuda_convert_func_t cuda_convert_func = nullptr; + uint8_t *cuda_conv_tmp_buf = nullptr; }; static void j2k_compressed_frame_dispose(struct video_frame *frame); @@ -161,18 +166,25 @@ static void parallel_conv(video_frame *dst, video_frame *src){ decoder, 0); } +#ifdef HAVE_CUDA +const cuda_convert_func_t r12l_to_rg48_cuda = preprocess_r12l_to_rg48; +#else +const cuda_convert_func_t r12l_to_rg48_cuda = nullptr; +#endif + static struct { codec_t ug_codec; enum cmpto_sample_format_type cmpto_sf; codec_t convert_codec; - void (*convertFunc)(video_frame *dst, video_frame *src); + /// must be not-NULL if convert_codec != VC_NONE and HAVE_CUDA + cuda_convert_func_t cuda_convert_func; } codecs[] = { {UYVY, CMPTO_422_U8_P1020, VIDEO_CODEC_NONE, nullptr}, {v210, CMPTO_422_U10_V210, VIDEO_CODEC_NONE, nullptr}, {RGB, CMPTO_444_U8_P012, VIDEO_CODEC_NONE, nullptr}, {RGBA, CMPTO_444_U8_P012Z, VIDEO_CODEC_NONE, nullptr}, {R10k, CMPTO_444_U10U10U10_MSB32BE_P210, VIDEO_CODEC_NONE, nullptr}, - {R12L, CMPTO_444_U12_MSB16LE_P012, RG48, nullptr}, + {R12L, CMPTO_444_U12_MSB16LE_P012, RG48, r12l_to_rg48_cuda}, }; static bool configure_with(struct state_video_compress_j2k *s, struct video_desc desc){ @@ -183,11 +195,22 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc if(codec.ug_codec == desc.color_spec){ sample_format = codec.cmpto_sf; s->precompress_codec = codec.convert_codec; + s->cuda_convert_func = codec.cuda_convert_func; found = true; break; } } +#ifdef HAVE_CUDA + cuda_wrapper_set_device((int) cuda_devices[0]); + if (s->cuda_convert_func != nullptr) { + cuda_wrapper_free(s->cuda_conv_tmp_buf); + cuda_wrapper_malloc( + (void **) &s->cuda_conv_tmp_buf, + vc_get_datalen(desc.width, desc.height, desc.color_spec)); + } +#endif + if(!found){ log_msg(LOG_LEVEL_ERROR, "[J2K] Failed to find suitable pixel format\n"); return false; @@ -215,17 +238,15 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc s->pool_in_device_memory = false; #ifdef HAVE_CUDA - if (s->precompress_codec == VC_NONE && cuda_devices_count == 1) { + if (cuda_devices_count == 1) { s->pool_in_device_memory = true; s->pool = video_frame_pool( s->max_in_frames, cmpto_j2k_enc_cuda_buffer_data_allocator< cuda_wrapper_malloc, cuda_wrapper_free>()); } else { - if (cuda_devices_count > 1) { - MSG(WARNING, "More than 1 CUDA device will use CPU " - "buffers. Please report...\n"); - } + MSG(WARNING, "More than 1 CUDA device will use CPU buffers. " + "Please report...\n"); s->pool = video_frame_pool( s->max_in_frames, cmpto_j2k_enc_cuda_buffer_data_allocator< @@ -244,20 +265,43 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc return true; } +/** + * @brief copies frame from RAM to GPU + * + * Does the pixel format conversion as well if specified. + */ +static void +do_gpu_copy(struct state_video_compress_j2k *s, + std::shared_ptr &ret, video_frame *in_frame) +{ +#ifdef HAVE_CUDA + cuda_wrapper_set_device((int) cuda_devices[0]); + if (s->cuda_convert_func == nullptr) { + assert(s->precompress_codec == VC_NONE); + cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data, + in_frame->tiles[0].data_len, + CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE); + return; + } + cuda_wrapper_memcpy(s->cuda_conv_tmp_buf, in_frame->tiles[0].data, + in_frame->tiles[0].data_len, + CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE); + s->cuda_convert_func((int) in_frame->tiles[0].width, + (int) in_frame->tiles[0].height, + s->cuda_conv_tmp_buf, ret->tiles[0].data); +#else + (void) s, (void) ret, (void) in_frame; + abort(); // must not reach here +#endif +} + static shared_ptr get_copy(struct state_video_compress_j2k *s, video_frame *frame){ std::shared_ptr ret = s->pool.get_frame(); - if (s->precompress_codec != VC_NONE) { + if (s->pool_in_device_memory) { + do_gpu_copy(s, ret, frame); + } else if (s->precompress_codec != VC_NONE) { parallel_conv(ret.get(), frame); - } else if (s->pool_in_device_memory) { -#ifdef HAVE_CUDA - cuda_wrapper_set_device((int) cuda_devices[0]); - cuda_wrapper_memcpy(ret->tiles[0].data, frame->tiles[0].data, - frame->tiles[0].data_len, - CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE); -#else - abort(); // must not reach here -#endif } else { memcpy(ret->tiles[0].data, frame->tiles[0].data, frame->tiles[0].data_len); @@ -589,6 +633,10 @@ static void j2k_compress_done(struct module *mod) cmpto_j2k_enc_cfg_destroy(s->enc_settings); cmpto_j2k_enc_ctx_destroy(s->context); +#ifdef HAVE_CUDA + cuda_wrapper_free(s->cuda_conv_tmp_buf); +#endif + delete s; }