diff --git a/src/video_compress/cmpto_j2k.cpp b/src/video_compress/cmpto_j2k.cpp index 3adf6a120..535e8ea91 100644 --- a/src/video_compress/cmpto_j2k.cpp +++ b/src/video_compress/cmpto_j2k.cpp @@ -3,7 +3,7 @@ * @author Martin Pulec */ /* - * Copyright (c) 2013-2023 CESNET, z. s. p. o. + * Copyright (c) 2013-2024 CESNET * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -42,6 +42,10 @@ * the GPU is powerful enough due to the fact that CUDA registers the new * buffers which is very slow and because of that the frames cumulate before * the GPU encoder. + * + * @todo + * - check multiple CUDA devices - now the data are always copied to + * the first CUDA device */ #ifdef HAVE_CONFIG_H @@ -87,6 +91,7 @@ /// default max size of state_video_compress_j2k::pool and also value /// for state_video_compress_j2k::max_in_frames #define DEFAULT_POOL_SIZE 4 + /// number of frames that encoder encodes at moment #define DEFAULT_TILE_LIMIT 1 #define DEFAULT_MEM_LIMIT 1000000000LLU @@ -98,28 +103,27 @@ using std::shared_ptr; using std::unique_lock; #ifdef HAVE_CUDA -struct cmpto_j2k_enc_cuda_host_buffer_data_allocator +template +struct cmpto_j2k_enc_cuda_buffer_data_allocator : public video_frame_pool_allocator { void *allocate(size_t size) override { void *ptr = nullptr; if (CUDA_WRAPPER_SUCCESS != - cuda_wrapper_malloc_host(&ptr, size)) { + alloc(&ptr, size)) { MSG(ERROR, "Cannot allocate host buffer: %s\n", cuda_wrapper_last_error_string()); return nullptr; } return ptr; } - void deallocate(void *ptr) override { cuda_wrapper_free_host(ptr); } + void deallocate(void *ptr) override { free(ptr); } [[nodiscard]] video_frame_pool_allocator *clone() const override { - return new cmpto_j2k_enc_cuda_host_buffer_data_allocator(*this); + return new cmpto_j2k_enc_cuda_buffer_data_allocator(*this); } }; -using allocator = cmpto_j2k_enc_cuda_host_buffer_data_allocator; -#else -using allocator = default_data_allocator; #endif struct state_video_compress_j2k { @@ -128,6 +132,7 @@ struct state_video_compress_j2k { struct cmpto_j2k_enc_cfg *enc_settings{}; long long int rate = 0; ///< bitrate in bits per second int mct = -1; // force use of mct - -1 means default + bool pool_in_device_memory = false; ///< frames in pool are on GPU video_frame_pool pool; ///< pool for frames allocated by us but not yet consumed by encoder unsigned int max_in_frames = DEFAULT_POOL_SIZE; ///< max number of frames between push and pop unsigned int in_frames{}; ///< number of currently encoding frames @@ -213,6 +218,24 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc "Setting MCT", NOOP); + s->pool_in_device_memory = false; +#ifdef HAVE_CUDA + if (s->convertFunc == nullptr) { + s->pool_in_device_memory = true; + s->pool = video_frame_pool( + s->max_in_frames, + cmpto_j2k_enc_cuda_buffer_data_allocator< + cuda_wrapper_malloc, cuda_wrapper_free>()); + } else { + s->pool = video_frame_pool( + s->max_in_frames, + cmpto_j2k_enc_cuda_buffer_data_allocator< + cuda_wrapper_malloc_host, cuda_wrapper_free_host>()); + } +#else + s->pool = video_frame_pool(s->max_in_frames, default_data_allocator()); +#endif + s->compressed_desc = desc; s->compressed_desc.color_spec = codec_is_a_rgb(desc.color_spec) ? J2KR : J2K; s->compressed_desc.tile_count = 1; @@ -227,8 +250,17 @@ static shared_ptr get_copy(struct state_video_compress_j2k *s, vide if (s->convertFunc) { s->convertFunc(ret.get(), frame); + } else if (s->pool_in_device_memory) { +#ifdef HAVE_CUDA + cuda_wrapper_memcpy(ret->tiles[0].data, frame->tiles[0].data, + frame->tiles[0].data_len, + CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE); +#else + abort(); // must not reach here +#endif } else { - memcpy(ret->tiles[0].data, frame->tiles[0].data, frame->tiles[0].data_len); + memcpy(ret->tiles[0].data, frame->tiles[0].data, + frame->tiles[0].data_len); } return ret; @@ -394,8 +426,6 @@ static struct module * j2k_compress_init(struct module *parent, const char *c_cf } } - s->pool = video_frame_pool(s->max_in_frames, allocator()); - if (quality < 0.0 || quality > 1.0) { LOG(LOG_LEVEL_ERROR) << "[J2K] Quality should be in interval [0-1]!\n"; goto error; @@ -408,6 +438,10 @@ static struct module * j2k_compress_init(struct module *parent, const char *c_cf CHECK_OK(cmpto_j2k_enc_ctx_cfg_add_cuda_device(ctx_cfg, cuda_devices[i], mem_limit, tile_limit), "Setting CUDA device", goto error); } + if (cuda_devices_count > 1) { + MSG(WARNING, "More than one CUDA device is not tested and may " + "not work. Please report...\n"); + } CHECK_OK(cmpto_j2k_enc_ctx_create(ctx_cfg, &s->context), "Context create", goto error); @@ -456,6 +490,14 @@ static void release_cstream(void * custom_data, size_t custom_data_size, const v udata->frame.~shared_ptr(); } +static void +release_cstream_cuda(void *img_custom_data, size_t img_custom_data_size, + int /* device_id */, const void *samples, size_t samples_size) +{ + release_cstream(img_custom_data, img_custom_data_size, samples, + samples_size); +} + #define HANDLE_ERROR_COMPRESS_PUSH \ if (udata != nullptr) { \ udata->frame.~shared_ptr(); \ @@ -508,10 +550,17 @@ static void j2k_compress_push(struct module *state, std::shared_ptr new (&udata->frame) shared_ptr(get_copy(s, tx.get())); vf_store_metadata(tx.get(), udata->metadata); - CHECK_OK(cmpto_j2k_enc_img_set_samples(img, udata->frame->tiles[0].data, - udata->frame->tiles[0].data_len, - release_cstream), - "Setting image samples", HANDLE_ERROR_COMPRESS_PUSH); + if (s->pool_in_device_memory) { + CHECK_OK(cmpto_j2k_enc_img_set_samples_cuda( + img, cuda_devices[0], udata->frame->tiles[0].data, + udata->frame->tiles[0].data_len, release_cstream_cuda), + "Setting image samples", HANDLE_ERROR_COMPRESS_PUSH); + } else { + CHECK_OK(cmpto_j2k_enc_img_set_samples( + img, udata->frame->tiles[0].data, + udata->frame->tiles[0].data_len, release_cstream), + "Setting image samples", HANDLE_ERROR_COMPRESS_PUSH); + } unique_lock lk(s->lock); s->frame_popped.wait(lk, [s]{return s->in_frames < s->max_in_frames;});