Implement 930abe5 from master

Implement [930abe5](930abe5325) from master
2026-04-07 02:05:11 +00:00 · 2024-09-06 17:45:10 -04:00
parent bf7e84dde7
commit 2bd4c094cf
2 changed files with 68 additions and 22 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,6 +3,7 @@
        "array": "cpp",
        "string": "cpp",
        "string_view": "cpp",
-        "vector": "cpp"
+        "vector": "cpp",
+        "__config": "cpp"
    }
 }
--- a/src/video_compress/cmpto_j2k.cpp
+++ b/src/video_compress/cmpto_j2k.cpp
@@ -67,6 +67,7 @@

 #ifdef HAVE_CUDA
 #include "cuda_wrapper.h"
+#include "cuda_wrapper/kernels.hpp"
 #endif // HAVE_CUDA
 #include "debug.h"
 #include "host.h"
@@ -150,11 +151,15 @@ struct cmpto_j2k_enc_cuda_buffer_data_allocator
        }
 };
 using cuda_allocator = cmpto_j2k_enc_cuda_buffer_data_allocator;
+const cuda_convert_func_t r12l_to_rg48_cuda = preprocess_r12l_to_rg48;
 #else
 using cuda_allocator = default_data_allocator;
+const cuda_convert_func_t r12l_to_rg48_cuda = nullptr;
 #endif
 using cpu_allocator  = default_data_allocator;

+typedef void (*cuda_convert_func_t)(int width, int height, void *src, void *dst);
+
 /** 
 * @brief Platforms available for J2K Compression
 */
@@ -263,9 +268,11 @@ struct state_video_compress_j2k {
        unsigned int  cpu_img_limit        = DEFAULT_IMG_LIMIT;

        // CUDA Parameters
-        bool         pool_in_device_memory = false;
-        unsigned long long cuda_mem_limit  = DEFAULT_CUDA_MEM_LIMIT;
-        unsigned int       cuda_tile_limit = DEFAULT_CUDA_TILE_LIMIT;
+        bool         pool_in_device_memory    = false;
+        cuda_convert_func_t cuda_convert_func = nullptr;
+        uint8_t            *cuda_conv_tmp_buf = nullptr;
+        unsigned long long cuda_mem_limit     = DEFAULT_CUDA_MEM_LIMIT;
+        unsigned int       cuda_tile_limit    = DEFAULT_CUDA_TILE_LIMIT;

        // j2k_compress_platform::NONE by default at initialization
        j2k_compress_platform platform     = j2k_compress_platform::NONE;
@@ -326,14 +333,15 @@ static struct {
        codec_t ug_codec;
        enum cmpto_sample_format_type cmpto_sf;
        codec_t convert_codec;
-        void (*convertFunc)(video_frame *dst, video_frame *src);
+        /// must be not-NULL if convert_codec != VC_NONE and HAVE_CUDA
+        cuda_convert_func_t cuda_convert_func;
 } codecs[] = {
        {UYVY, CMPTO_422_U8_P1020, VIDEO_CODEC_NONE, nullptr},
        {v210, CMPTO_422_U10_V210, VIDEO_CODEC_NONE, nullptr},
        {RGB, CMPTO_444_U8_P012, VIDEO_CODEC_NONE, nullptr},
        {RGBA, CMPTO_444_U8_P012Z, VIDEO_CODEC_NONE, nullptr},
        {R10k, CMPTO_444_U10U10U10_MSB32BE_P210, VIDEO_CODEC_NONE, nullptr},
-        {R12L, CMPTO_444_U12_MSB16LE_P012, RG48, nullptr},
+        {R12L, CMPTO_444_U12_MSB16LE_P012, RG48, r12l_to_rg48_cuda},
 };

 static bool configure_with(struct state_video_compress_j2k *s, struct video_desc desc){
@@ -344,11 +352,22 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
                if(codec.ug_codec == desc.color_spec){
                        sample_format = codec.cmpto_sf;
                        s->precompress_codec = codec.convert_codec;
+                        s->cuda_convert_func = codec.cuda_convert_func;
                        found = true;
                        break;
                }
        }

+#ifdef HAVE_CUDA
+        cuda_wrapper_set_device((int) cuda_devices[0]);
+        if (s->cuda_convert_func != nullptr) {
+                cuda_wrapper_free(s->cuda_conv_tmp_buf);
+                cuda_wrapper_malloc(
+                    (void **) &s->cuda_conv_tmp_buf,
+                    vc_get_datalen(desc.width, desc.height, desc.color_spec));
+        }
+#endif
+
        if(!found){
                MSG(ERROR, "Failed to find suitable pixel format\n");
                return false;
@@ -376,17 +395,15 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc

        s->pool_in_device_memory = false;
 #ifdef HAVE_CUDA
-        if (s->precompress_codec == VC_NONE && cuda_devices_count == 1) {
+        if (cuda_devices_count == 1) {
                s->pool_in_device_memory = true;
                s->pool = std::make_unique<video_frame_pool>(
                    s->max_in_frames,
                    cmpto_j2k_enc_cuda_buffer_data_allocator<
                        cuda_wrapper_malloc, cuda_wrapper_free>());
        } else {
-                if (cuda_devices_count > 1) {
-                        MSG(WARNING, "More than 1 CUDA device will use CPU "
-                                     "buffers. Please report...\n");
-                }
+                MSG(WARNING, "More than 1 CUDA device will use CPU "
+                                "buffers. Please report...\n");
                s->pool = std::make_unique<video_frame_pool>(
                    s->max_in_frames,
                    cmpto_j2k_enc_cuda_buffer_data_allocator<
@@ -405,20 +422,44 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
        return true;
 }

+/**
+ * @brief copies frame from RAM to GPU
+ *
+ * Does the pixel format conversion as well if specified.
+ */
+static void
+do_gpu_copy(struct state_video_compress_j2k *s,
+             std::shared_ptr<video_frame> &ret, video_frame *in_frame)
+{
+#ifdef HAVE_CUDA
+        cuda_wrapper_set_device((int) cuda_devices[0]);
+        if (s->cuda_convert_func == nullptr) {
+                assert(s->precompress_codec == VC_NONE);
+                cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data,
+                                    in_frame->tiles[0].data_len,
+                                    CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE);
+                return;
+        }
+        cuda_wrapper_memcpy(s->cuda_conv_tmp_buf, in_frame->tiles[0].data,
+                            in_frame->tiles[0].data_len,
+                            CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE);
+        s->cuda_convert_func((int) in_frame->tiles[0].width,
+                             (int) in_frame->tiles[0].height,
+                             s->cuda_conv_tmp_buf, ret->tiles[0].data);
+#else
+        (void) s, (void) ret, (void) in_frame;
+        abort(); // must not reach here
+#endif
+}
+
+
 static shared_ptr<video_frame> get_copy(struct state_video_compress_j2k *s, video_frame *frame){
        std::shared_ptr<video_frame> ret = s->pool->get_frame();

-      if (s->precompress_codec != VC_NONE) {
+        if (s->pool_in_device_memory) {
+                do_gpu_copy(s, ret, frame);
+        } else if (s->precompress_codec != VC_NONE) {
                parallel_conv(ret.get(), frame);
-        } else if (s->pool_in_device_memory) {
-#ifdef HAVE_CUDA
-                cuda_wrapper_set_device((int) cuda_devices[0]);
-                cuda_wrapper_memcpy(ret->tiles[0].data, frame->tiles[0].data,
-                                    frame->tiles[0].data_len,
-                                    CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE);
-#else
-                abort(); // must not reach here
-#endif
        else {
                memcpy(ret->tiles[0].data, frame->tiles[0].data,
                       frame->tiles[0].data_len);
@@ -913,7 +954,7 @@ static void j2k_compress_push(struct module *state, std::shared_ptr<video_frame>
                        return;
                }
                struct video_desc pool_desc = desc;
-                
+
                if (s->precompress_codec != VC_NONE) {
                        pool_desc.color_spec = s->precompress_codec;
                }
@@ -987,6 +1028,10 @@ static void j2k_compress_done(struct module *mod)
        cmpto_j2k_enc_cfg_destroy(s->enc_settings);
        cmpto_j2k_enc_ctx_destroy(s->context);

+#ifdef HAVE_CUDA
+        cuda_wrapper_free(s->cuda_conv_tmp_buf);
+#endif
+
        delete s;
 }