From fa764db2d6960ed8589058c0926eaa661ce018b8 Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Fri, 14 Feb 2025 12:36:58 +0100
Subject: [PATCH] share some more convs between sdl3, to_lavc, testc

1. to SDL3 nv12 (not supported before)

2. to_lavc - use rgba_to_bgra - the vc_copylineRGBA-based version did not keep
the alpha channel but always set it to 100 % (if ever used - this may,
unfortunately, cause regressions if fully opaque alpha is assumed)

3. uyvy_to_i420 use by testcard and to_lavc

uyvy_to_i420: do not write out of bounds

If width % 2 == 1, one extra luma sample written past the end of the first
line of a pair would overwrite the first luma sample of the second line.
---
 src/libavcodec/to_lavc_vid_conv.c   | 156 +++------------------------
 src/pixfmt_conv.c                   | 160 +++++++++++++++++++++++++++-
 src/pixfmt_conv.h                   |  18 ++--
 src/video_capture/testcard_common.c |  51 +++------
 src/video_display/sdl3.c            | 130 +++++++++++-----------
 5 files changed, 258 insertions(+), 257 deletions(-)

diff --git a/src/libavcodec/to_lavc_vid_conv.c b/src/libavcodec/to_lavc_vid_conv.c
index 9dbd78025..8b4773329 100644
--- a/src/libavcodec/to_lavc_vid_conv.c
+++ b/src/libavcodec/to_lavc_vid_conv.c
@@ -79,51 +79,8 @@
 
 static void uyvy_to_yuv420p(AVFrame * __restrict out_frame, const unsigned char * __restrict in_data, int width, int height)
 {
-        int y;
-        for (y = 0; y < height - 1; y += 2) {
-                /*  every even row */
-                const unsigned char *src = in_data + y * (((width + 1) & ~1) * 2);
-                /*  every odd row */
-                const unsigned char *src2 = in_data + (y + 1) * (((width + 1) & ~1) * 2);
-                unsigned char *dst_y = out_frame->data[0] + out_frame->linesize[0] * y;
-                unsigned char *dst_y2 = out_frame->data[0] + out_frame->linesize[0] * (y + 1);
-                unsigned char *dst_cb = out_frame->data[1] + out_frame->linesize[1] * (y / 2);
-                unsigned char *dst_cr = out_frame->data[2] + out_frame->linesize[2] * (y / 2);
-
-                int x;
-                OPTIMIZED_FOR (x = 0; x < width - 1; x += 2) {
-                        *dst_cb++ = (*src++ + *src2++) / 2;
-                        *dst_y++ = *src++;
-                        *dst_y2++ = *src2++;
-                        *dst_cr++ = (*src++ + *src2++) / 2;
-                        *dst_y++ = *src++;
-                        *dst_y2++ = *src2++;
-                }
-                if (x < width) {
-                        *dst_cb++ = (*src++ + *src2++) / 2;
-                        *dst_y++ = *src++;
-                        *dst_y2++ = *src2++;
-                        *dst_cr++ = (*src++ + *src2++) / 2;
-                }
-        }
-        if (y < height) {
-                const unsigned char *src = in_data + y * (((width + 1) & ~1) * 2);
-                unsigned char *dst_y = out_frame->data[0] + out_frame->linesize[0] * y;
-                unsigned char *dst_cb = out_frame->data[1] + out_frame->linesize[1] * (y / 2);
-                unsigned char *dst_cr = out_frame->data[2] + out_frame->linesize[2] * (y / 2);
-                int x;
-                OPTIMIZED_FOR (x = 0; x < width - 1; x += 2) {
-                        *dst_cb++ = *src++;
-                        *dst_y++ = *src++;
-                        *dst_cr++ = *src++;
-                        *dst_y++ = *src++;
-                }
-                if (x < width) {
-                        *dst_cb++ = *src++;
-                        *dst_y++ = *src++;
-                        *dst_cr++ = *src++;
-                }
-        }
+        uyvy_to_i420(out_frame->data, out_frame->linesize, in_data, width,
+                     height);
 }
 
 static void uyvy_to_yuv422p(AVFrame * __restrict out_frame, const unsigned char * __restrict src, int width, int height)
@@ -181,86 +138,10 @@ static void uyvy_to_yuv444p(AVFrame * __restrict out_frame, const unsigned char
         }
 }
 
-static void uyvy_to_nv12(AVFrame * __restrict out_frame, const unsigned char * __restrict in_data, int width, int height)
+static void to_lavc_uyvy_to_nv12(AVFrame * __restrict out_frame, const unsigned char * __restrict in_data, int width, int height)
 {
-        for(int y = 0; y < height; y += 2) {
-                /*  every even row */
-                const unsigned char *src = in_data + y * (width * 2);
-                /*  every odd row */
-                const unsigned char *src2 = in_data + (y + 1) * (width * 2);
-                unsigned char *dst_y = out_frame->data[0] + out_frame->linesize[0] * y;
-                unsigned char *dst_y2 = out_frame->data[0] + out_frame->linesize[0] * (y + 1);
-                unsigned char *dst_cbcr = out_frame->data[1] + out_frame->linesize[1] * y / 2;
-
-                int x = 0;
-#ifdef __SSE3__
-                __m128i yuv;
-                __m128i yuv2;
-                __m128i y1;
-                __m128i y2;
-                __m128i y3;
-                __m128i y4;
-                __m128i uv;
-                __m128i uv2;
-                __m128i uv3;
-                __m128i uv4;
-                __m128i ymask = _mm_set1_epi32(0xFF00FF00);
-                __m128i dsty;
-                __m128i dsty2;
-                __m128i dstuv;
-
-                for (; x < (width - 15); x += 16){
-                        yuv = _mm_lddqu_si128((__m128i const*)(const void *) src);
-                        yuv2 = _mm_lddqu_si128((__m128i const*)(const void *) src2);
-                        src += 16;
-                        src2 += 16;
-
-                        y1 = _mm_and_si128(ymask, yuv);
-                        y1 = _mm_bsrli_si128(y1, 1);
-                        y2 = _mm_and_si128(ymask, yuv2);
-                        y2 = _mm_bsrli_si128(y2, 1);
-
-                        uv = _mm_andnot_si128(ymask, yuv);
-                        uv2 = _mm_andnot_si128(ymask, yuv2);
-
-                        uv = _mm_avg_epu8(uv, uv2);
-
-                        yuv = _mm_lddqu_si128((__m128i const*)(const void *) src);
-                        yuv2 = _mm_lddqu_si128((__m128i const*)(const void *) src2);
-                        src += 16;
-                        src2 += 16;
-
-                        y3 = _mm_and_si128(ymask, yuv);
-                        y3 = _mm_bsrli_si128(y3, 1);
-                        y4 = _mm_and_si128(ymask, yuv2);
-                        y4 = _mm_bsrli_si128(y4, 1);
-
-                        uv3 = _mm_andnot_si128(ymask, yuv);
-                        uv4 = _mm_andnot_si128(ymask, yuv2);
-
-                        uv3 = _mm_avg_epu8(uv3, uv4);
-
-                        dsty = _mm_packus_epi16(y1, y3);
-                        dsty2 = _mm_packus_epi16(y2, y4);
-                        dstuv = _mm_packus_epi16(uv, uv3);
-                        _mm_storeu_si128((__m128i *)(void *) dst_y, dsty);
-                        _mm_storeu_si128((__m128i *)(void *) dst_y2, dsty2);
-                        _mm_storeu_si128((__m128i *)(void *) dst_cbcr, dstuv);
-                        dst_y += 16;
-                        dst_y2 += 16;
-                        dst_cbcr += 16;
-                }
-#endif
-
-                OPTIMIZED_FOR (; x < width - 1; x += 2) {
-                        *dst_cbcr++ = (*src++ + *src2++) / 2;
-                        *dst_y++ = *src++;
-                        *dst_y2++ = *src2++;
-                        *dst_cbcr++ = (*src++ + *src2++) / 2;
-                        *dst_y++ = *src++;
-                        *dst_y2++ = *src2++;
-                }
-        }
+        uyvy_to_nv12(out_frame->data, out_frame->linesize, in_data, width,
+                     height);
 }
 
 static void v210_to_yuv420p10le(AVFrame * __restrict out_frame, const unsigned char * __restrict in_data, int width, int height)
@@ -547,9 +428,8 @@ to_lavc_v210_to_p010le(AVFrame *__restrict out_frame,
                        const unsigned char *__restrict in_data, int width,
                        int height)
 {
-        char *out_data[2] = { (char *) out_frame->data[0], (char *) out_frame->data[1]};
-        v210_to_p010le(out_data, out_frame->linesize, (const char *) in_data,
-                       width, height);
+        v210_to_p010le(out_frame->data, out_frame->linesize, in_data, width,
+                       height);
 }
 
 static void
@@ -557,9 +437,8 @@ to_lavc_y216_to_p010le(AVFrame *__restrict out_frame,
                        const unsigned char *__restrict in_data, int width,
                        int height)
 {
-        char *out_data[2] = { (char *) out_frame->data[0], (char *) out_frame->data[1]};
-        y216_to_p010le(out_data, out_frame->linesize, (const char *) in_data,
-                       width, height);
+        y216_to_p010le(out_frame->data, out_frame->linesize, in_data, width,
+                       height);
 }
 
 #if P210_PRESENT
@@ -1092,14 +971,13 @@ static void rgba_to_gbrp(AVFrame * __restrict out_frame, const unsigned char * _
         rgb_rgba_to_gbrp(out_frame, in_data, width, height, 4);
 }
 
-static void rgba_to_bgra(AVFrame * __restrict out_frame, const unsigned char * __restrict in_data, int width, int height)
+static void
+to_lavc_rgba_to_bgra(AVFrame *__restrict out_frame,
+                     const unsigned char *__restrict in_data, int width,
+                     int height)
 {
-        int linesize = vc_get_linesize(width, RGBA);
-        for (ptrdiff_t y = 0; y < height; ++y) {
-                const unsigned char *src = in_data + y * linesize;
-                unsigned char *dst = out_frame->data[0] + out_frame->linesize[0] * y;
-                vc_copylineRGBA(dst, src, linesize, 16, 8, 0);
-        }
+        rgba_to_bgra(out_frame->data, out_frame->linesize, in_data, width,
+                     height);
 }
 
 #if defined __GNUC__
@@ -1311,7 +1189,7 @@ static const struct uv_to_av_conversion *get_uv_to_av_conversions() {
 #endif
                 { UYVY, AV_PIX_FMT_YUV420P,     uyvy_to_yuv420p },
                 { UYVY, AV_PIX_FMT_YUVJ420P,    uyvy_to_yuv420p },
-                { UYVY, AV_PIX_FMT_NV12,        uyvy_to_nv12 },
+                { UYVY, AV_PIX_FMT_NV12,        to_lavc_uyvy_to_nv12 },
                 { UYVY, AV_PIX_FMT_YUV444P,     uyvy_to_yuv444p },
                 { UYVY, AV_PIX_FMT_YUVJ444P,    uyvy_to_yuv444p },
                 { Y216, AV_PIX_FMT_YUV422P10LE, y216_to_yuv422p10le },
@@ -1321,7 +1199,7 @@ static const struct uv_to_av_conversion *get_uv_to_av_conversions() {
                 { RGB, AV_PIX_FMT_GBRP,         rgb_to_gbrp },
                 { RGB, AV_PIX_FMT_YUV444P,      rgb_to_yuv444p },
                 { RGBA, AV_PIX_FMT_GBRP,        rgba_to_gbrp },
-                { RGBA, AV_PIX_FMT_BGRA,        rgba_to_bgra },
+                { RGBA, AV_PIX_FMT_BGRA,        to_lavc_rgba_to_bgra },
                 { R10k, AV_PIX_FMT_BGR0,        r10k_to_bgr0 },
                 { R10k, AV_PIX_FMT_GBRP10LE,    r10k_to_gbrp10le },
                 { R10k, AV_PIX_FMT_GBRP16LE,    r10k_to_gbrp16le },
diff --git a/src/pixfmt_conv.c b/src/pixfmt_conv.c
index ff2e98d22..8c296b8a5 100644
--- a/src/pixfmt_conv.c
+++ b/src/pixfmt_conv.c
@@ -64,6 +64,9 @@
 #include "utils/macros.h" // to_fourcc, OPTIMEZED_FOR, CLAMP
 #include "video_codec.h"
 
+#ifdef __SSE3__
+#include "pmmintrin.h"
+#endif
 #ifdef __SSSE3__
 #include "tmmintrin.h"
 #endif
@@ -3074,9 +3077,9 @@ decoder_t get_best_decoder_from(codec_t in, const codec_t *out_candidates, codec
  * neither input nor output need to be padded
  */
 void
-v210_to_p010le(char *__restrict *__restrict out_data,
+v210_to_p010le(unsigned char *__restrict *__restrict out_data,
                const int *__restrict out_linesize,
-               const char *__restrict in_data, int width, int height)
+               const unsigned char *__restrict in_data, int width, int height)
 {
         assert((uintptr_t) in_data % 4 == 0);
         assert(out_linesize[0] % 2 == 0);
@@ -3168,9 +3171,9 @@ v210_to_p010le(char *__restrict *__restrict out_data,
 }
 
 void
-y216_to_p010le(char *__restrict *__restrict out_data,
+y216_to_p010le(unsigned char *__restrict *__restrict out_data,
                const int *__restrict out_linesize,
-               const char *__restrict in_data, int width, int height)
+               const unsigned char *__restrict in_data, int width, int height)
 {
         const size_t src_linesize = vc_get_linesize(width, Y216);
         for (int i = 0; i < height / 2; ++i) {
@@ -3195,4 +3198,153 @@ y216_to_p010le(char *__restrict *__restrict out_data,
         }
 }
 
+/**
+ * converts packed UYVY to NV12 (luma plane + interleaved CbCr plane)
+ *
+ * Chroma of each line pair is averaged (odd height: last line is paired with
+ * itself). Source stride is 2 * width, a trailing odd column is not converted.
+ */
+void
+uyvy_to_nv12(unsigned char *__restrict *__restrict out_data,
+             const int *__restrict out_linesize,
+             const unsigned char *__restrict in_data, int width, int height)
+{
+        const size_t src_linesize = (size_t) width * 2;
+        for (size_t y = 0; y < (size_t) height; y += 2) {
+                // the last line of odd-height frame has no pair - reuse it
+                const int has_pair = y + 1 < (size_t) height;
+                /*  every even row */
+                const unsigned char *src = in_data + (y * src_linesize);
+                /*  every odd row */
+                const unsigned char *src2 = has_pair ? src + src_linesize : src;
+                unsigned char *dst_y  = out_data[0] + (out_linesize[0] * y);
+                unsigned char *dst_y2 = has_pair ? dst_y + out_linesize[0] : dst_y;
+                unsigned char *dst_cbcr =
+                    out_data[1] + (out_linesize[1] * (y / 2));
+
+                int x = 0;
+#ifdef __SSE3__
+                __m128i yuv, yuv2;
+                __m128i y1, y2, y3, y4;
+                __m128i uv, uv2, uv3, uv4;
+                __m128i dsty, dsty2, dstuv;
+                const __m128i ymask = _mm_set1_epi32(0xFF00FF00);
+
+                for (; x < (width - 15); x += 16){
+                        yuv = _mm_lddqu_si128((__m128i const*)(const void *) src);
+                        yuv2 = _mm_lddqu_si128((__m128i const*)(const void *) src2);
+                        src += 16;
+                        src2 += 16;
+
+                        y1 = _mm_and_si128(ymask, yuv);
+                        y1 = _mm_bsrli_si128(y1, 1);
+                        y2 = _mm_and_si128(ymask, yuv2);
+                        y2 = _mm_bsrli_si128(y2, 1);
+
+                        uv = _mm_andnot_si128(ymask, yuv);
+                        uv2 = _mm_andnot_si128(ymask, yuv2);
+
+                        uv = _mm_avg_epu8(uv, uv2);
+
+                        yuv = _mm_lddqu_si128((__m128i const*)(const void *) src);
+                        yuv2 = _mm_lddqu_si128((__m128i const*)(const void *) src2);
+                        src += 16;
+                        src2 += 16;
+
+                        y3 = _mm_and_si128(ymask, yuv);
+                        y3 = _mm_bsrli_si128(y3, 1);
+                        y4 = _mm_and_si128(ymask, yuv2);
+                        y4 = _mm_bsrli_si128(y4, 1);
+
+                        uv3 = _mm_andnot_si128(ymask, yuv);
+                        uv4 = _mm_andnot_si128(ymask, yuv2);
+
+                        uv3 = _mm_avg_epu8(uv3, uv4);
+
+                        dsty = _mm_packus_epi16(y1, y3);
+                        dsty2 = _mm_packus_epi16(y2, y4);
+                        dstuv = _mm_packus_epi16(uv, uv3);
+                        _mm_storeu_si128((__m128i *)(void *) dst_y, dsty);
+                        _mm_storeu_si128((__m128i *)(void *) dst_y2, dsty2);
+                        _mm_storeu_si128((__m128i *)(void *) dst_cbcr, dstuv);
+                        dst_y += 16;
+                        dst_y2 += 16;
+                        dst_cbcr += 16;
+                }
+#endif
+
+                OPTIMIZED_FOR (; x < width - 1; x += 2) {
+                        *dst_cbcr++ = (*src++ + *src2++) / 2;
+                        *dst_y++ = *src++;
+                        *dst_y2++ = *src2++;
+                        *dst_cbcr++ = (*src++ + *src2++) / 2;
+                        *dst_y++ = *src++;
+                        *dst_y2++ = *src2++;
+                }
+        }
+}
+
+void
+rgba_to_bgra(unsigned char *__restrict *__restrict out_data,
+             const int *__restrict out_linesize,
+             const unsigned char *__restrict in_data, int width, int height)
+{
+        // swaps R and B of packed 8-bit RGBA; G and alpha are copied as is
+        const size_t src_linesize = vc_get_linesize(width, RGBA);
+        for (size_t y = 0; y < (size_t) height; ++y) {
+                const uint8_t *in  = in_data + (y * src_linesize);
+                uint8_t       *out = out_data[0] + (y * out_linesize[0]);
+                for (int x = 0; x < width; ++x, in += 4, out += 4) {
+                        out[0] = in[2]; // B
+                        out[1] = in[1]; // G
+                        out[2] = in[0]; // R
+                        out[3] = in[3]; // A
+                }
+        }
+}
+
+/**
+ * converts UYVY to planar YUV 4:2:0; chroma of each line pair is averaged,
+ * with odd height the last line is paired with itself
+ * @sa uyvy_to_i422
+ */
+void
+uyvy_to_i420(unsigned char *__restrict *__restrict out_data,
+             const int *__restrict out_linesize, const unsigned char *__restrict in_data,
+             int width, int height)
+{
+        size_t                     src_linesize = vc_get_linesize(width, UYVY);
+        for (size_t i = 0; i < (size_t) (height + 1) / 2; ++i) {
+                const unsigned char *in1 = in_data + (2 * i * src_linesize);
+                const unsigned char *in2 = in1 + src_linesize;
+                unsigned char       *y1 =
+                    out_data[0] + ((2ULL * i) * out_linesize[0]);
+                unsigned char *y2 = y1 + out_linesize[0];
+                unsigned char *u  = out_data[1] + (i * out_linesize[1]);
+                unsigned char *v  = out_data[2] + (i * out_linesize[2]);
+
+                // odd height: line 2*i+1 doesn't exist for the last pair
+                if ((2 * i) + 1 == (size_t) height) {
+                        y2  = y1;
+                        in2 = in1;
+                }
+
+                int j = 0;
+                for (; j < width / 2; ++j) {
+                        *u++  = (*in1++ + *in2++ + 1) / 2;
+                        *y1++ = *in1++;
+                        *y2++ = *in2++;
+                        *v++  = (*in1++ + *in2++ + 1) / 2;
+                        *y1++ = *in1++;
+                        *y2++ = *in2++;
+                }
+                if (width % 2 == 1) { // do not overwrite EOL
+                        *u++  = (*in1++ + *in2++ + 1) / 2;
+                        *y1++ = *in1++;
+                        *y2++ = *in2++;
+                        *v++  = (*in1++ + *in2++ + 1) / 2;
+                }
+        }
+}
+
 /* vim: set expandtab sw=8: */
diff --git a/src/pixfmt_conv.h b/src/pixfmt_conv.h
index 2ab1d7a2b..50d24a113 100644
--- a/src/pixfmt_conv.h
+++ b/src/pixfmt_conv.h
@@ -106,12 +106,18 @@ decoder_func_t vc_copylineUYVYtoGrayscale;
 /// dummy conversion - ptr to it returned if no conversion needed
 decoder_func_t vc_memcpy;
 
-void v210_to_p010le(char *__restrict *__restrict out_data,
-                    const int *__restrict out_linesize,
-                    const char *__restrict in_data, int width, int height);
-void y216_to_p010le(char *__restrict *__restrict out_data,
-                    const int *__restrict out_linesize,
-                    const char *__restrict in_data, int width, int height);
+
+typedef void
+decode_buffer_func_t(unsigned char *__restrict *__restrict out_data,
+                     const int *__restrict out_linesize,
+                     const unsigned char *__restrict in_data, int width,
+                     int height);
+decode_buffer_func_t v210_to_p010le;
+decode_buffer_func_t y216_to_p010le;
+decode_buffer_func_t uyvy_to_nv12;
+decode_buffer_func_t rgba_to_bgra;
+// other packed->planar convs are historically in video_codec.[ch]
+decode_buffer_func_t uyvy_to_i420;
 
 #ifdef __cplusplus
 }
diff --git a/src/video_capture/testcard_common.c b/src/video_capture/testcard_common.c
index aa97766dd..a5812501a 100644
--- a/src/video_capture/testcard_common.c
+++ b/src/video_capture/testcard_common.c
@@ -71,46 +71,21 @@ void testcard_fillRect(struct testcard_pixmap *s, struct testcard_rect *r, uint3
 
 /**
  * @param[in] in buffer in UYVY
- * @retval       buffer in I420 (must be deallocated by the caller)
- * @note
- * Caller must deallocate returned buffer
+ * @param[out] out buffer in I420
  */
-static void toI420(unsigned char *out, const unsigned char *input, int width, int height)
+static void
+toI420(unsigned char *out, const unsigned char *input, int width, int height)
 {
-        const unsigned char *in = (const unsigned char *) input;
-        int w_ch = (width + 1) / 2;
-        int h_ch = (height + 1) / 2;
-        unsigned char *y = out;
-        unsigned char *u0 = out + width * height;
-        unsigned char *v0 = out + width * height + w_ch * h_ch;
-        unsigned char *u1 = u0, *v1 = v0;
-
-        for (int i = 0; i < height; i += 1) {
-                for (int j = 0; j < ((width + 1) & ~1); j += 2) {
-                        // U
-                        if (i % 2 == 0) {
-                                *u0++ = *in++;
-                        } else { // average with every 2nd row
-                                *u1 = (*u1 + *in++) / 2;
-                                u1++;
-                        }
-                        // Y
-                        *y++ = *in++;
-                        // V
-                        if (i % 2 == 0) {
-                                *v0++ = *in++;
-                        } else { // average with every 2nd row
-                                *v1 = (*v1 + *in++) / 2;
-                                v1++;
-                        }
-                        // Y
-                        if (j + 1 == width) {
-                                in++;
-                        } else {
-                                *y++ = *in++;
-                        }
-                }
-        }
+        const size_t y_h = height;
+        const size_t chr_h = (y_h + 1) / 2;
+        int          out_linesize[3] = { width,
+                                         (width + 1) / 2,
+                                         (width + 1) / 2 };
+        unsigned char *out_data[3] = { out,
+                                       out + (y_h * out_linesize[0]),
+                                       out + (y_h * out_linesize[0]) +
+                                           (chr_h * out_linesize[1]) };
+        uyvy_to_i420(out_data, out_linesize, input, width, height);
 }
 
 void testcard_convert_buffer(codec_t in_c, codec_t out_c, unsigned char *out, unsigned const char *in, int width, int height)
diff --git a/src/video_display/sdl3.c b/src/video_display/sdl3.c
index 8ce6d7bcc..1cccd2801 100644
--- a/src/video_display/sdl3.c
+++ b/src/video_display/sdl3.c
@@ -95,22 +95,24 @@ struct video_frame_sdl3_data {
 };
 
 static void convert_UYVY_IYUV(const struct video_frame *uv_frame,
-                              char *tex_data, size_t y_pitch);
+                              unsigned char *tex_data, size_t y_pitch);
+static void convert_UYVY_NV12(const struct video_frame *uv_frame,
+                              unsigned char *tex_data, size_t y_pitch);
 static void convert_R10k_ARGB2101010(const struct video_frame *uv_frame,
-                                     char *tex_data, size_t y_pitch);
+                                     unsigned char *tex_data, size_t y_pitch);
 static void convert_R10k_ABGR2101010(const struct video_frame *uv_frame,
-                                     char *tex_data, size_t y_pitch);
+                                     unsigned char *tex_data, size_t y_pitch);
 static void convert_RGBA_BGRA(const struct video_frame *uv_frame,
-                                     char *tex_data, size_t y_pitch);
-static void convert_Y216_P010(const struct video_frame *uv_frame, char *tex_data,
-                          size_t y_pitch);
-static void convert_v210_P010(const struct video_frame *uv_frame, char *tex_data,
-                          size_t y_pitch);
+                              unsigned char *tex_data, size_t y_pitch);
+static void convert_Y216_P010(const struct video_frame *uv_frame,
+                              unsigned char *tex_data, size_t y_pitch);
+static void convert_v210_P010(const struct video_frame *uv_frame,
+                              unsigned char *tex_data, size_t y_pitch);
 struct fmt_data {
         codec_t              ug_codec;
         enum SDL_PixelFormat sdl_tex_fmt;
-        void (*convert)(const struct video_frame *uv_frame, char *tex_data,
-                        size_t tex_pitch);
+        void (*convert)(const struct video_frame *uv_frame,
+                        unsigned char *tex_data, size_t tex_pitch);
 };
 // order matters relative to fixed ug codec - first usable SDL fmt is used
 static const struct fmt_data pf_mapping_template[] = {
@@ -121,6 +123,7 @@ static const struct fmt_data pf_mapping_template[] = {
         { RGBA, SDL_PIXELFORMAT_BGRX32,      convert_RGBA_BGRA        }, // gles2,ogl,gpu,sw,vk,d3d12
         { UYVY, SDL_PIXELFORMAT_UYVY,        NULL                     }, // mac ogl
         { UYVY, SDL_PIXELFORMAT_IYUV,        convert_UYVY_IYUV        }, // fallback
+        { UYVY, SDL_PIXELFORMAT_NV12,        convert_UYVY_NV12        }, // ditto
         { YUYV, SDL_PIXELFORMAT_YUY2,        NULL                     },
         { RGB,  SDL_PIXELFORMAT_RGB24,       NULL                     },
         { BGR,  SDL_PIXELFORMAT_BGR24,       NULL                     },
@@ -230,7 +233,7 @@ display_frame(struct state_sdl3 *s, struct video_frame *frame)
 
         int pitch = 0;
         if (s->cs_data->convert != NULL) {
-                char *tex_data = NULL;
+                unsigned char *tex_data = NULL;
                 SDL_CHECK(SDL_LockTexture(frame_data->texture, NULL,
                                           (void **) &tex_data, &pitch));
                 s->cs_data->convert(frame, tex_data, pitch);
@@ -1093,8 +1096,8 @@ display_sdl3_getf(void *state)
 }
 
 static void
-convert_R10k_ARGB2101010(const struct video_frame *uv_frame, char *tex_data,
-                  size_t pitch)
+convert_R10k_ARGB2101010(const struct video_frame *uv_frame,
+                         unsigned char *tex_data, size_t pitch)
 {
         assert(pitch == (size_t) uv_frame->tiles[0].width * 4);
         assert((uintptr_t) uv_frame->tiles[0].data % 4 == 0);
@@ -1121,8 +1124,8 @@ convert_R10k_ARGB2101010(const struct video_frame *uv_frame, char *tex_data,
 }
 
 static void
-convert_R10k_ABGR2101010(const struct video_frame *uv_frame, char *tex_data,
-                  size_t pitch)
+convert_R10k_ABGR2101010(const struct video_frame *uv_frame,
+                         unsigned char *tex_data, size_t pitch)
 {
         const size_t src_linesize = vc_get_linesize(uv_frame->tiles[0].width, R10k);
         for (unsigned i = 0; i < uv_frame->tiles[0].height; ++i) {
@@ -1142,58 +1145,45 @@ convert_R10k_ABGR2101010(const struct video_frame *uv_frame, char *tex_data,
 }
 
 static void
-convert_RGBA_BGRA(const struct video_frame *uv_frame, char *tex_data,
+convert_RGBA_BGRA(const struct video_frame *uv_frame, unsigned char *tex_data,
                   size_t pitch)
 {
-        const size_t src_linesize = vc_get_linesize(uv_frame->tiles[0].width, RGBA);
-        for (unsigned i = 0; i < uv_frame->tiles[0].height; ++i) {
-                const uint8_t *in =
-                    (uint8_t *) uv_frame->tiles[0].data + (i * src_linesize);
-                uint8_t *out = (uint8_t *) tex_data + (i * pitch);
-                for (unsigned i = 0; i < uv_frame->tiles[0].width ; ++i) {
-                        *out++ = in[2]; // B
-                        *out++ = in[1]; // G
-                        *out++ = in[0]; // R
-                        *out++ = in[3]; // A
-                        in += 4;
-                }
-        }
+        unsigned char *out_data[2]   = { tex_data, 0 };
+        int out_linesize[2] = { (int) pitch, 0 };
+        rgba_to_bgra(
+            out_data, out_linesize, (unsigned char *) uv_frame->tiles[0].data,
+            (int) uv_frame->tiles[0].width, (int) uv_frame->tiles[0].height);
 }
 
 static void
-convert_UYVY_IYUV(const struct video_frame *uv_frame, char *tex_data,
+convert_UYVY_IYUV(const struct video_frame *uv_frame, unsigned char *tex_data,
                   size_t y_pitch)
 {
-        size_t cr_pitch = (y_pitch + 1) / 2;
-        char  *ubase    = tex_data + (y_pitch * uv_frame->tiles[0].height);
-        char  *vbase =
-            ubase + (cr_pitch * ((uv_frame->tiles[0].height + 1) / 2));
-        const char *in = uv_frame->tiles[0].data;
-        for (unsigned i = 0; i < (uv_frame->tiles[0].height + 1) / 2; ++i) {
-                char *y1 = tex_data + ((2ULL * i) * y_pitch);
-                char *y2 = y1 + y_pitch;
-                char *u  = ubase + (i * cr_pitch);
-                char *v  = vbase + (i * cr_pitch);
-                for (unsigned j = 0; j < (uv_frame->tiles[0].width + 1) / 2;
-                     ++j) {
-                        *u++  = *in++;
-                        *y1++ = *in++;
-                        *v++  = *in++;
-                        *y1++ = *in++;
-                }
-                // last line when height % 2 == 1
-                if (i * 2 + 1 == uv_frame->tiles[0].height) {
-                        break;
-                }
-                // take just lumas from second
-                for (unsigned j = 0; j < (uv_frame->tiles[0].width + 1) / 2;
-                     ++j) {
-                        in++; // drop U
-                        *y2++ = *in++;
-                        in++; // drop V
-                        *y2++ = *in++;
-                }
-        }
+        const size_t y_h = uv_frame->tiles[0].height;
+        const size_t chr_h = (y_h + 1) / 2;
+        int          out_linesize[3] = { (int) y_pitch,
+                                         (int) (y_pitch + 1) / 2,
+                                         (int) (y_pitch + 1) / 2 };
+        unsigned char *out_data[3] = { tex_data,
+                                       tex_data + (y_h * out_linesize[0]),
+                                       tex_data + (y_h * out_linesize[0]) +
+                                           (chr_h * out_linesize[1]) };
+        uyvy_to_i420(
+            out_data, out_linesize, (unsigned char *) uv_frame->tiles[0].data,
+            (int) uv_frame->tiles[0].width, (int) uv_frame->tiles[0].height);
+}
+
+static void
+convert_UYVY_NV12(const struct video_frame *uv_frame, unsigned char *tex_data,
+                  size_t y_pitch)
+{
+        unsigned char *out_data[2] = {
+                tex_data, tex_data + (y_pitch * uv_frame->tiles[0].height)
+        };
+        int out_linesize[2] = { (int) y_pitch, (int) ((y_pitch + 1) / 2) * 2 };
+        uyvy_to_nv12(
+            out_data, out_linesize, (unsigned char *) uv_frame->tiles[0].data,
+            (int) uv_frame->tiles[0].width, (int) uv_frame->tiles[0].height);
 }
 
 /**
@@ -1201,30 +1191,30 @@ convert_UYVY_IYUV(const struct video_frame *uv_frame, char *tex_data,
  * currently seem to work only on Metal
  */
 static void
-convert_Y216_P010(const struct video_frame *uv_frame, char *tex_data,
+convert_Y216_P010(const struct video_frame *uv_frame, unsigned char *tex_data,
                   size_t y_pitch)
 {
-        char *out_data[2] = {
+        unsigned char *out_data[2] = {
                 tex_data, tex_data + (y_pitch * uv_frame->tiles[0].height)
         };
         int out_linesize[2] = { (int) y_pitch, (int) ((y_pitch + 1) / 2) * 2 };
-        y216_to_p010le(out_data, out_linesize, uv_frame->tiles[0].data,
-                       (int) uv_frame->tiles[0].width,
-                       (int) uv_frame->tiles[0].height);
+        y216_to_p010le(
+            out_data, out_linesize, (unsigned char *) uv_frame->tiles[0].data,
+            (int) uv_frame->tiles[0].width, (int) uv_frame->tiles[0].height);
 }
 
 /// @copydoc convert_Y216_P010
 static void
-convert_v210_P010(const struct video_frame *uv_frame, char *tex_data,
+convert_v210_P010(const struct video_frame *uv_frame, unsigned char *tex_data,
                   size_t y_pitch)
 {
-        char *out_data[2] = {
+        unsigned char *out_data[2] = {
                 tex_data, tex_data + (y_pitch * uv_frame->tiles[0].height)
         };
         int out_linesize[2] = { (int) y_pitch, (int) ((y_pitch + 1) / 2) * 2 };
-        v210_to_p010le(out_data, out_linesize, uv_frame->tiles[0].data,
-                       (int) uv_frame->tiles[0].width,
-                       (int) uv_frame->tiles[0].height);
+        v210_to_p010le(
+            out_data, out_linesize, (unsigned char *) uv_frame->tiles[0].data,
+            (int) uv_frame->tiles[0].width, (int) uv_frame->tiles[0].height);
 }
 
 static bool