vcomp/cmpto_j2k: run GPU conv as prepreprocess

Set the pixfmt conversion CUDA kernel as cmpto_j2k_enc preprocessor, not run directly. This also eliminates to need to have the conversion kernel if conversion is needed - CPU conversion will be sufficient. Currently not effective, only R12L is converted for which there is the kernel. refer to GH-406
2026-03-20 22:40:18 +00:00 · 2024-09-12 10:12:41 +02:00
parent caacebca37
commit 92565ca2ae
3 changed files with 85 additions and 66 deletions
--- a/src/cuda_wrapper/kernels.cu
+++ b/src/cuda_wrapper/kernels.cu
@@ -1,6 +1,9 @@
 /**
 * @file   cuda_wrapper/kernels.cu
 * @author Martin Pulec     <pulec@cesnet.cz>
+ *
+ * This file hosts various CUDA kernels. Currently there are only kernels
+ * for cmpto_j2k compression and decompression.
 */
 /*
 * Copyright (c) 2024 CESNET
@@ -298,7 +301,7 @@ int postprocess_rg48_to_r12l(

        MEASURE_KERNEL_DURATION_STOP(stream)

-        return 0;
+        return cudaGetLastError();
 }

 //     ___   ___   ___    __      __     ___   _____  ____  ___
@@ -446,28 +449,42 @@ r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
        }
 }

-void
-preprocess_r12l_to_rg48(int width, int height, void *src, void *dst)
+int
+preprocess_r12l_to_rg48(void *preprocessor, void *img_custom_data,
+                        size_t img_custom_data_size, int size_x, int size_y,
+                        struct cmpto_j2k_enc_comp_format *comp_formats,
+                        int comp_count, void *input_samples,
+                        size_t input_samples_size, void *output_samples,
+                        size_t output_samples_size, void *vstream)
 {
-        (void) width, (void) height, (void) src, (void) dst;
-        dim3 threads_per_block(256);
-        dim3 blocks((((width + 7) / 8) + 255) / 256, height);
+        (void) preprocessor, (void) img_custom_data,
+            (void) img_custom_data_size, (void) comp_formats, (void) comp_count,
+            (void) input_samples_size, (void) output_samples_size;

-        MEASURE_KERNEL_DURATION_START(0)
-        if (width % 2 == 0) {
-                kernel_r12l_to_rg48<uint32_t><<<blocks, threads_per_block>>>(
-                    (uint8_t *) src, (uint8_t *) dst, width);
+        cudaStream_t stream = (cudaStream_t) vstream;
+        dim3 threads_per_block(256);
+        dim3 blocks((((size_x+ 7) / 8) + 255) / 256, size_y);
+
+        MEASURE_KERNEL_DURATION_START(stream)
+        if (size_x % 2 == 0) {
+                kernel_r12l_to_rg48<uint32_t>
+                    <<<blocks, threads_per_block, 0, stream>>>(
+                        (uint8_t *) input_samples, (uint8_t *) output_samples,
+                        size_x);
        } else {
                thread_local bool warn_print;
                if (!warn_print) {
                        fprintf(stderr,
                                "%s: Odd width %d px will use slower kernel!\n",
-                                __func__, width);
+                                __func__, size_x);
                        warn_print = true;
                }
-                kernel_r12l_to_rg48<uint16_t><<<blocks, threads_per_block>>>(
-                    (uint8_t *) src, (uint8_t *) dst, width);
+                kernel_r12l_to_rg48<uint16_t>
+                    <<<blocks, threads_per_block, 0, stream>>>(
+                        (uint8_t *) input_samples, (uint8_t *) output_samples,
+                        size_x);
        }
-        MEASURE_KERNEL_DURATION_STOP(0)
-}
+        MEASURE_KERNEL_DURATION_STOP(stream)

+        return cudaGetLastError();
+}
--- a/src/cuda_wrapper/kernels.hpp
+++ b/src/cuda_wrapper/kernels.hpp
@@ -40,8 +40,9 @@

 #include <cstddef>

+// for cmpto_j2k compress and decompress
 struct cmpto_j2k_dec_comp_format;
-
+struct cmpto_j2k_enc_comp_format;
 int postprocess_rg48_to_r12l(
    void * postprocessor,
    void * img_custom_data,
@@ -58,7 +59,19 @@ int postprocess_rg48_to_r12l(
    size_t output_buffer_size,
    void * stream
 );
-
-void preprocess_r12l_to_rg48(int width, int height, void *src, void *dst);
+int preprocess_r12l_to_rg48(
+    void * preprocessor,
+    void * img_custom_data,
+    size_t img_custom_data_size,
+    int size_x,
+    int size_y,
+    struct cmpto_j2k_enc_comp_format * comp_formats,
+    int comp_count,
+    void * input_samples,
+    size_t input_samples_size,
+    void * output_samples,
+    size_t output_samples_size,
+    void *stream
+);

 #endif // defined CUDA_WRAPPER_KERNELS_HPP_1A3F7B57_EE91_4363_8D50_9CDDC60CB74F
--- a/src/video_compress/cmpto_j2k.cpp
+++ b/src/video_compress/cmpto_j2k.cpp
@@ -72,8 +72,9 @@
 #include "utils/misc.h"
 #include "utils/parallel_conv.h"
 #include "utils/video_frame_pool.h"
-#include "video.h"
+#include "video_codec.h"             // for vc_get_linesize, codec_is_a_rgb
 #include "video_compress.h"
+#include "video_frame.h"             // for vf_alloc_desc, vf_free, vf_resto...

 #define MOD_NAME "[Cmpto J2K enc.] "

@@ -129,8 +130,6 @@ struct cmpto_j2k_enc_cuda_buffer_data_allocator
 };
 #endif

-typedef void (*cuda_convert_func_t)(int width, int height, void *src, void *dst);
-
 struct state_video_compress_j2k {
        struct module module_data{};
        struct cmpto_j2k_enc_ctx *context{};
@@ -157,9 +156,6 @@ struct state_video_compress_j2k {
        condition_variable configure_cv;
        bool               configured  = false;
        bool               should_exit = false;
-
-        cuda_convert_func_t cuda_convert_func = nullptr;
-        uint8_t            *cuda_conv_tmp_buf = nullptr;
 };

 // prototypes
@@ -184,18 +180,18 @@ static void parallel_conv(video_frame *dst, video_frame *src){
        }
 }

+const cmpto_j2k_enc_preprocessor_run_callback_cuda r12l_to_rg48_cuda =
 #ifdef HAVE_CUDA
-const cuda_convert_func_t r12l_to_rg48_cuda = preprocess_r12l_to_rg48;
+    preprocess_r12l_to_rg48;
 #else
-const cuda_convert_func_t r12l_to_rg48_cuda = nullptr;
+    nullptr;
 #endif

 static struct {
        codec_t ug_codec;
        enum cmpto_sample_format_type cmpto_sf;
        codec_t convert_codec;
-        /// must be not-NULL if convert_codec != VC_NONE and HAVE_CUDA
-        cuda_convert_func_t cuda_convert_func;
+        cmpto_j2k_enc_preprocessor_run_callback_cuda cuda_convert_func;
 } codecs[] = {
        {UYVY, CMPTO_422_U8_P1020, VIDEO_CODEC_NONE, nullptr},
        {v210, CMPTO_422_U10_V210, VIDEO_CODEC_NONE, nullptr},
@@ -212,25 +208,14 @@ ADD_TO_PARAM(
    "* " CPU_CONV_PARAM "\n"
    "  Enforce CPU conversion instead of CUDA (applicable to R12L now)\n");
 static void
-set_pool(struct state_video_compress_j2k *s, struct video_desc desc)
+set_pool(struct state_video_compress_j2k *s, bool have_gpu_preprocess)
 {
 #ifdef HAVE_CUDA
-        const bool force_cpu_conv =
-            get_commandline_param(CPU_CONV_PARAM) != nullptr;
        s->pool_in_device_memory = false;
        if (cuda_devices_count > 1) {
                MSG(WARNING, "More than 1 CUDA device will use CPU buffers and "
                             "conversion...\n");
-        } else if (!force_cpu_conv || s->cuda_convert_func == nullptr) {
-                cuda_wrapper_set_device((int) cuda_devices[0]);
-
-                if (s->cuda_convert_func != nullptr) {
-                        cuda_wrapper_malloc(
-                            (void **) &s->cuda_conv_tmp_buf,
-                            vc_get_datalen(desc.width, desc.height, desc.color_spec) +
-                                MAX_PADDING);
-                }
-
+        } else if (s->precompress_codec == VC_NONE || have_gpu_preprocess) {
                s->pool_in_device_memory = true;
                s->pool                  = video_frame_pool(
                    s->max_in_frames,
@@ -238,25 +223,27 @@ set_pool(struct state_video_compress_j2k *s, struct video_desc desc)
                                         cuda_wrapper_malloc, cuda_wrapper_free>());
                return;
        }
-        s->cuda_convert_func = nullptr; // either was 0 or force_cpu_conv
        s->pool = video_frame_pool(
            s->max_in_frames,
            cmpto_j2k_enc_cuda_buffer_data_allocator<cuda_wrapper_malloc_host,
                                                     cuda_wrapper_free_host>());
 #else
+        assert(!have_gpu_preprocess); // if CUDA not found, we shouldn't have
        s->pool = video_frame_pool(s->max_in_frames, default_data_allocator());
 #endif
 }

 static bool configure_with(struct state_video_compress_j2k *s, struct video_desc desc){
        enum cmpto_sample_format_type sample_format;
+        cmpto_j2k_enc_preprocessor_run_callback_cuda cuda_convert_func =
+            nullptr;
        bool found = false;

        for(const auto &codec : codecs){
                if(codec.ug_codec == desc.color_spec){
                        sample_format = codec.cmpto_sf;
                        s->precompress_codec = codec.convert_codec;
-                        s->cuda_convert_func = codec.cuda_convert_func;
+                        cuda_convert_func = codec.cuda_convert_func;
                        found = true;
                        break;
                }
@@ -275,6 +262,10 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
                s->configured = false;
        }

+        if (get_commandline_param(CPU_CONV_PARAM) != nullptr) {
+                cuda_convert_func = nullptr;
+        }
+
        struct cmpto_j2k_enc_ctx_cfg *ctx_cfg = nullptr;
        CHECK_OK(cmpto_j2k_enc_ctx_cfg_create(&ctx_cfg),
                 "Context configuration create", return false);
@@ -284,6 +275,11 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
                        ctx_cfg, cuda_devices[i], s->mem_limit, s->tile_limit),
                    "Setting CUDA device", return false);
        }
+        if (cuda_convert_func != nullptr) {
+                CHECK_OK(cmpto_j2k_enc_ctx_cfg_set_preprocessor_cuda(
+                             ctx_cfg, nullptr, nullptr, cuda_convert_func),
+                         "Setting CUDA preprocess", return false);
+        }

        CHECK_OK(cmpto_j2k_enc_ctx_create(ctx_cfg, &s->context),
                 "Context create", return false);
@@ -321,7 +317,7 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
                        "Setting MCT",
                        NOOP);

-        set_pool(s, desc);
+        set_pool(s, cuda_convert_func != nullptr);

        s->compressed_desc = desc;
        s->compressed_desc.color_spec = codec_is_a_rgb(desc.color_spec) ? J2KR : J2K;
@@ -341,26 +337,15 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
 * Does the pixel format conversion as well if specified.
 */
 static void
-do_gpu_copy(struct state_video_compress_j2k *s,
-             std::shared_ptr<video_frame> &ret, video_frame *in_frame)
+do_gpu_copy(std::shared_ptr<video_frame> &ret, video_frame *in_frame)
 {
 #ifdef HAVE_CUDA
        cuda_wrapper_set_device((int) cuda_devices[0]);
-        if (s->cuda_convert_func == nullptr) {
-                assert(s->precompress_codec == VC_NONE);
-                cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data,
-                                    in_frame->tiles[0].data_len,
-                                    CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE);
-                return;
-        }
-        cuda_wrapper_memcpy(s->cuda_conv_tmp_buf, in_frame->tiles[0].data,
+        cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data,
                            in_frame->tiles[0].data_len,
                            CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE);
-        s->cuda_convert_func((int) in_frame->tiles[0].width,
-                             (int) in_frame->tiles[0].height,
-                             s->cuda_conv_tmp_buf, ret->tiles[0].data);
 #else
-        (void) s, (void) ret, (void) in_frame;
+        (void) ret, (void) in_frame;
        abort(); // must not reach here
 #endif
 }
@@ -369,7 +354,7 @@ static shared_ptr<video_frame> get_copy(struct state_video_compress_j2k *s, vide
        std::shared_ptr<video_frame> ret = s->pool.get_frame();

        if (s->pool_in_device_memory) {
-                do_gpu_copy(s, ret, frame);
+                do_gpu_copy(ret, frame);
        } else if (s->precompress_codec != VC_NONE) {
                parallel_conv(ret.get(), frame);
        } else {
@@ -615,7 +600,8 @@ static void j2k_compress_push(struct module *state, std::shared_ptr<video_frame>
                        return;
                }
                struct video_desc pool_desc = desc;
-                if (s->precompress_codec != VC_NONE) {
+                if (s->precompress_codec != VC_NONE &&
+                    !s->pool_in_device_memory) {
                        pool_desc.color_spec = s->precompress_codec;
                }
                s->pool.reconfigure(
@@ -646,9 +632,17 @@ static void j2k_compress_push(struct module *state, std::shared_ptr<video_frame>
        vf_store_metadata(tx.get(), udata->metadata);

        if (s->pool_in_device_memory) {
+                // cmpto_j2k_enc requires the size after postprocess, which
+                // doesn't equeal the IN frame data_len for R12L
+                const codec_t device_codec = s->precompress_codec == VC_NONE
+                                           ? udata->frame->color_spec
+                                           : s->precompress_codec;
+                const size_t  data_len =
+                    vc_get_datalen(udata->frame->tiles[0].width,
+                                   udata->frame->tiles[0].height, device_codec);
                CHECK_OK(cmpto_j2k_enc_img_set_samples_cuda(
                             img, cuda_devices[0], udata->frame->tiles[0].data,
-                             udata->frame->tiles[0].data_len, release_cstream_cuda),
+                             data_len, release_cstream_cuda),
                         "Setting image samples", HANDLE_ERROR_COMPRESS_PUSH);
        } else {
                CHECK_OK(cmpto_j2k_enc_img_set_samples(
@@ -694,11 +688,6 @@ cleanup_common(struct state_video_compress_j2k *s)
                cmpto_j2k_enc_ctx_destroy(s->context);
        }
        s->context = nullptr;
-
-#ifdef HAVE_CUDA
-        cuda_wrapper_free(s->cuda_conv_tmp_buf);
-        s->cuda_conv_tmp_buf = nullptr;
-#endif
 }

 static compress_module_info get_cmpto_j2k_module_info(){