From 7155dd92df9336147d72abaf575ac9611dfd28f1 Mon Sep 17 00:00:00 2001
From: Martin Pulec
Date: Thu, 5 Mar 2026 14:35:47 +0100
Subject: [PATCH] to_planar: add decode_to_planar_parallel

for parallel processing decode_buffer_func_t funcs
---
 src/to_planar.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/to_planar.h | 21 ++++++++++++++-
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/src/to_planar.c b/src/to_planar.c
index 948d9888c..e021d03ed 100644
--- a/src/to_planar.c
+++ b/src/to_planar.c
@@ -48,8 +48,11 @@
 #include // for _mm_lddqu_si128
 #endif
 
+#include "compat/c23.h"
 #include "types.h" // for depth, v210, R12L, RGBA, UYVY, Y216
 #include "utils/macros.h" // for OPTIMIZED_FOR, ALWAYS_INLINE
+#include "utils/misc.h" // for get_cpu_core_count
+#include "utils/worker.h" // for task_run_parallel
 #include "video_codec.h" // for vc_get_linesize
 
 /**
@@ -459,3 +462,70 @@ r12l_to_rgbp12le(struct to_planar_data d)
 {
         r12l_to_gbrpXXle(d, DEPTH12, 0, 1, 2);
 }
+
+/// argument passed to @ref convert_task
+struct convert_task_data {
+        decode_buffer_func_t *convert; ///< conversion to be run
+        struct to_planar_data d;       ///< stripe that the task converts
+};
+
+/// worker callback running the conversion on the stripe described by @p arg
+static void *
+convert_task(void *arg)
+{
+        struct convert_task_data *d = arg;
+        d->convert(d->d);
+        return nullptr;
+}
+
+/**
+ * Cuts the frame into horizontal stripes and converts them in parallel.
+ *
+ * All stripes but the last one have the same even number of rows, the last
+ * one takes the remainder. The input and every output plane are advanced by
+ * the same number of rows, so vertically subsampled output is not handled.
+ *
+ * The task count is clamped to <1, height / 2>. This covers a negative
+ * num_threads, a failed core count detection (zero-sized VLA, division by
+ * zero) and frames having fewer row pairs than requested tasks (where all
+ * rows would otherwise end up in the last task).
+ */
+void
+decode_to_planar_parallel(decode_buffer_func_t *dec, struct to_planar_data d,
+                          int src_linesize, int num_threads)
+{
+        int task_count = num_threads == TO_PLANAR_THREADS_AUTO
+                             ? get_cpu_core_count()
+                             : num_threads;
+        const int max_tasks = (int) (d.height / 2);
+        if (task_count > max_tasks) {
+                task_count = max_tasks;
+        }
+        if (task_count < 1) {
+                task_count = 1;
+        }
+
+        // needs to be even
+        const unsigned stripe_rows =
+            (d.height / (unsigned) task_count) & ~1U;
+
+        struct convert_task_data data[task_count];
+        for (int i = 0; i < task_count; ++i) {
+                const size_t first_row = (size_t) i * stripe_rows;
+                data[i].convert = dec;
+                data[i].d = d;
+                data[i].d.in_data = d.in_data + (first_row * src_linesize);
+
+                for (unsigned plane = 0; plane < countof(d.out_data); ++plane) {
+                        data[i].d.out_data[plane] =
+                            d.out_data[plane] +
+                            (first_row * d.out_linesize[plane]);
+                }
+                data[i].d.height = stripe_rows;
+        }
+        // the last stripe also takes the rows left over by the rounding above
+        data[task_count - 1].d.height =
+            d.height - (stripe_rows * (task_count - 1));
+        task_run_parallel(convert_task, task_count, data, sizeof data[0],
+                          NULL);
+}
diff --git a/src/to_planar.h b/src/to_planar.h
index 5346f191c..c2c90167f 100644
--- a/src/to_planar.h
+++ b/src/to_planar.h
@@ -45,7 +45,10 @@
 extern "C" {
 #endif
 
-#define TO_PLANAR_MAX_COMP 4
+enum {
+        TO_PLANAR_THREADS_AUTO = 0, ///< use get_cpu_core_count() threads
+        TO_PLANAR_MAX_COMP = 4,
+};
 
 struct to_planar_data {
         int width;
@@ -69,6 +72,22 @@ decode_buffer_func_t r12l_to_gbrp12le;
 decode_buffer_func_t r12l_to_gbrp16le;
 decode_buffer_func_t r12l_to_rgbp12le;
 
+/**
+ * run the @ref decode_buffer_func_t from packed format in parallel
+ * @param dec           fn to run
+ * @param d             description of the whole frame to be converted
+ * @param src_linesize  source linesize (vc_get_linesize(width, in_pixfmt))
+ * @param num_threads   number of threads or TO_PLANAR_THREADS_AUTO to use
+ *                      the number of logical cores
+ * @note no support for horizontal subsampling for now
+ * @note the frame is cut by rows with the same offset in every output plane,
+ *       so vertically subsampled output is not handled either
+ * @sa decode_buffer_func_t
+ */
+void decode_to_planar_parallel(decode_buffer_func_t *dec,
+                               struct to_planar_data d, int src_linesize,
+                               int num_threads);
+
 #ifdef __cplusplus
 }
 #endif