diff --git a/.vscode/settings.json b/.vscode/settings.json index a42bc8f0d..7f162230c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,6 +3,7 @@ "array": "cpp", "string": "cpp", "string_view": "cpp", - "vector": "cpp" + "vector": "cpp", + "__config": "cpp" } } \ No newline at end of file diff --git a/src/video_compress/cmpto_j2k.cpp b/src/video_compress/cmpto_j2k.cpp index 298cd2219..1bd42d4b8 100644 --- a/src/video_compress/cmpto_j2k.cpp +++ b/src/video_compress/cmpto_j2k.cpp @@ -67,6 +67,7 @@ #ifdef HAVE_CUDA #include "cuda_wrapper.h" +#include "cuda_wrapper/kernels.hpp" #endif // HAVE_CUDA #include "debug.h" #include "host.h" @@ -150,11 +151,15 @@ struct cmpto_j2k_enc_cuda_buffer_data_allocator } }; using cuda_allocator = cmpto_j2k_enc_cuda_buffer_data_allocator; +const cuda_convert_func_t r12l_to_rg48_cuda = preprocess_r12l_to_rg48; #else using cuda_allocator = default_data_allocator; +const cuda_convert_func_t r12l_to_rg48_cuda = nullptr; #endif using cpu_allocator = default_data_allocator; +typedef void (*cuda_convert_func_t)(int width, int height, void *src, void *dst); + /** * @brief Platforms available for J2K Compression */ @@ -263,9 +268,11 @@ struct state_video_compress_j2k { unsigned int cpu_img_limit = DEFAULT_IMG_LIMIT; // CUDA Parameters - bool pool_in_device_memory = false; - unsigned long long cuda_mem_limit = DEFAULT_CUDA_MEM_LIMIT; - unsigned int cuda_tile_limit = DEFAULT_CUDA_TILE_LIMIT; + bool pool_in_device_memory = false; + cuda_convert_func_t cuda_convert_func = nullptr; + uint8_t *cuda_conv_tmp_buf = nullptr; + unsigned long long cuda_mem_limit = DEFAULT_CUDA_MEM_LIMIT; + unsigned int cuda_tile_limit = DEFAULT_CUDA_TILE_LIMIT; // j2k_compress_platform::NONE by default at initialization j2k_compress_platform platform = j2k_compress_platform::NONE; @@ -326,14 +333,15 @@ static struct { codec_t ug_codec; enum cmpto_sample_format_type cmpto_sf; codec_t convert_codec; - void (*convertFunc)(video_frame *dst, video_frame *src); + /// must be not-NULL if convert_codec != VC_NONE and HAVE_CUDA + cuda_convert_func_t cuda_convert_func; } codecs[] = { {UYVY, CMPTO_422_U8_P1020, VIDEO_CODEC_NONE, nullptr}, {v210, CMPTO_422_U10_V210, VIDEO_CODEC_NONE, nullptr}, {RGB, CMPTO_444_U8_P012, VIDEO_CODEC_NONE, nullptr}, {RGBA, CMPTO_444_U8_P012Z, VIDEO_CODEC_NONE, nullptr}, {R10k, CMPTO_444_U10U10U10_MSB32BE_P210, VIDEO_CODEC_NONE, nullptr}, - {R12L, CMPTO_444_U12_MSB16LE_P012, RG48, nullptr}, + {R12L, CMPTO_444_U12_MSB16LE_P012, RG48, r12l_to_rg48_cuda}, }; static bool configure_with(struct state_video_compress_j2k *s, struct video_desc desc){ @@ -344,11 +352,22 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc if(codec.ug_codec == desc.color_spec){ sample_format = codec.cmpto_sf; s->precompress_codec = codec.convert_codec; + s->cuda_convert_func = codec.cuda_convert_func; found = true; break; } } +#ifdef HAVE_CUDA + cuda_wrapper_set_device((int) cuda_devices[0]); + if (s->cuda_convert_func != nullptr) { + cuda_wrapper_free(s->cuda_conv_tmp_buf); + cuda_wrapper_malloc( + (void **) &s->cuda_conv_tmp_buf, + vc_get_datalen(desc.width, desc.height, desc.color_spec)); + } +#endif + if(!found){ MSG(ERROR, "Failed to find suitable pixel format\n"); return false; @@ -376,17 +395,15 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc s->pool_in_device_memory = false; #ifdef HAVE_CUDA - if (s->precompress_codec == VC_NONE && cuda_devices_count == 1) { + if (cuda_devices_count == 1) { s->pool_in_device_memory = true; s->pool = std::make_unique( s->max_in_frames, cmpto_j2k_enc_cuda_buffer_data_allocator< cuda_wrapper_malloc, cuda_wrapper_free>()); } else { - if (cuda_devices_count > 1) { - MSG(WARNING, "More than 1 CUDA device will use CPU " - "buffers. Please report...\n"); - } + MSG(WARNING, "More than 1 CUDA device will use CPU " + "buffers. Please report...\n"); s->pool = std::make_unique( s->max_in_frames, cmpto_j2k_enc_cuda_buffer_data_allocator< @@ -405,20 +422,44 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc return true; } +/** + * @brief copies frame from RAM to GPU + * + * Does the pixel format conversion as well if specified. + */ +static void +do_gpu_copy(struct state_video_compress_j2k *s, + std::shared_ptr &ret, video_frame *in_frame) +{ +#ifdef HAVE_CUDA + cuda_wrapper_set_device((int) cuda_devices[0]); + if (s->cuda_convert_func == nullptr) { + assert(s->precompress_codec == VC_NONE); + cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data, + in_frame->tiles[0].data_len, + CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE); + return; + } + cuda_wrapper_memcpy(s->cuda_conv_tmp_buf, in_frame->tiles[0].data, + in_frame->tiles[0].data_len, + CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE); + s->cuda_convert_func((int) in_frame->tiles[0].width, + (int) in_frame->tiles[0].height, + s->cuda_conv_tmp_buf, ret->tiles[0].data); +#else + (void) s, (void) ret, (void) in_frame; + abort(); // must not reach here +#endif +} + + static shared_ptr get_copy(struct state_video_compress_j2k *s, video_frame *frame){ std::shared_ptr ret = s->pool->get_frame(); - if (s->precompress_codec != VC_NONE) { + if (s->pool_in_device_memory) { + do_gpu_copy(s, ret, frame); + } else if (s->precompress_codec != VC_NONE) { parallel_conv(ret.get(), frame); - } else if (s->pool_in_device_memory) { -#ifdef HAVE_CUDA - cuda_wrapper_set_device((int) cuda_devices[0]); - cuda_wrapper_memcpy(ret->tiles[0].data, frame->tiles[0].data, - frame->tiles[0].data_len, - CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE); -#else - abort(); // must not reach here -#endif else { memcpy(ret->tiles[0].data, frame->tiles[0].data, frame->tiles[0].data_len); @@ -913,7 +954,7 @@ static void j2k_compress_push(struct module *state, std::shared_ptr return; } struct video_desc pool_desc = desc; - + if (s->precompress_codec != VC_NONE) { pool_desc.color_spec = s->precompress_codec; } @@ -987,6 +1028,10 @@ static void j2k_compress_done(struct module *mod) cmpto_j2k_enc_cfg_destroy(s->enc_settings); cmpto_j2k_enc_ctx_destroy(s->context); +#ifdef HAVE_CUDA + cuda_wrapper_free(s->cuda_conv_tmp_buf); +#endif + delete s; }