diff --git a/.github/scripts/Linux/arm/ffmpeg-arm-patches/ffmpeg-4.3.2-rpi_11.patch b/.github/scripts/Linux/arm/ffmpeg-arm-patches/ffmpeg-4.3.3-rpi_13.patch
similarity index 88%
rename from .github/scripts/Linux/arm/ffmpeg-arm-patches/ffmpeg-4.3.2-rpi_11.patch
rename to .github/scripts/Linux/arm/ffmpeg-arm-patches/ffmpeg-4.3.3-rpi_13.patch
index 353215061..579ed3d14 100644
--- a/.github/scripts/Linux/arm/ffmpeg-arm-patches/ffmpeg-4.3.2-rpi_11.patch
+++ b/.github/scripts/Linux/arm/ffmpeg-arm-patches/ffmpeg-4.3.3-rpi_13.patch
@@ -1,14 +1,22 @@
 --- a/configure
 +++ b/configure
-@@ -274,6 +274,7 @@ External library support:
+@@ -207,6 +207,7 @@ External library support:
+   --disable-bzlib          disable bzlib [autodetect]
+   --disable-coreimage      disable Apple CoreImage framework [autodetect]
+   --enable-chromaprint     enable audio fingerprinting with chromaprint [no]
++  --disable-epoxy          disable epoxy [autodetect]
+   --enable-frei0r          enable frei0r video filtering [no]
+   --enable-gcrypt          enable gcrypt, needed for rtmp(t)e support
+                            if openssl, librtmp or gmp is not used [no]
+@@ -274,6 +275,7 @@ External library support:
    --enable-libtls          enable LibreSSL (via libtls), needed for https support
                             if openssl, gnutls or mbedtls is not used [no]
    --enable-libtwolame      enable MP2 encoding via libtwolame [no]
-+  --enable-libudev         enable libudev [no]
++  --disable-libudev        disable libudev [autodetect]
    --enable-libv4l2         enable libv4l2/v4l-utils [no]
    --enable-libvidstab      enable video stabilization using vid.stab [no]
    --enable-libvmaf         enable vmaf filter via libvmaf [no]
-@@ -336,12 +337,17 @@ External library support:
+@@ -336,12 +338,17 @@ External library support:
    --enable-libmfx          enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no]
    --enable-libnpp          enable Nvidia Performance Primitives-based code [no]
    --enable-mmal            enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
@@ -26,23 +34,17 @@
    --disable-vaapi          disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
    --disable-vdpau          disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
    --disable-videotoolbox   disable VideoToolbox code [autodetect]
-@@ -1771,6 +1777,7 @@ EXTERNAL_LIBRARY_LIST="
-     libdav1d
-     libdc1394
-     libdrm
+@@ -1699,7 +1706,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
+     avfoundation
+     bzlib
+     coreimage
 +    epoxy
-     libflite
-     libfontconfig
-     libfreetype
-@@ -1807,6 +1814,7 @@ EXTERNAL_LIBRARY_LIST="
-     libtesseract
-     libtheora
-     libtwolame
+     iconv
 +    libudev
-     libv4l2
-     libvorbis
-     libvpx
-@@ -1861,7 +1869,10 @@ HWACCEL_LIBRARY_LIST="
+     libxcb
+     libxcb_shm
+     libxcb_shape
+@@ -1861,7 +1870,10 @@ HWACCEL_LIBRARY_LIST="
      mmal
      omx
      opencl
@@ -53,7 +55,7 @@
  "
  
  DOCUMENT_LIST="
-@@ -1877,12 +1888,16 @@ FEATURE_LIST="
+@@ -1877,12 +1889,16 @@ FEATURE_LIST="
      gray
      hardcoded_tables
      omx_rpi
@@ -70,7 +72,7 @@
  "
  
  # this list should be kept in linking order
-@@ -1923,6 +1938,7 @@ SUBSYSTEM_LIST="
+@@ -1923,6 +1939,7 @@ SUBSYSTEM_LIST="
      pixelutils
      network
      rdft
@@ -78,7 +80,7 @@
  "
  
  # COMPONENT_LIST needs to come last to ensure correct dependency checking
-@@ -2405,9 +2421,11 @@ CONFIG_EXTRA="
+@@ -2405,9 +2422,11 @@ CONFIG_EXTRA="
      rangecoder
      riffdec
      riffenc
@@ -90,7 +92,7 @@
      scene_sad
      sinewin
      snappy
-@@ -2737,6 +2755,8 @@ hap_decoder_select="snappy texturedsp"
+@@ -2737,6 +2756,8 @@ hap_decoder_select="snappy texturedsp"
  hap_encoder_deps="libsnappy"
  hap_encoder_select="texturedspenc"
  hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp"
@@ -99,7 +101,7 @@
  huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
  huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
  hymt_decoder_select="huffyuv_decoder"
-@@ -2903,6 +2923,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder
+@@ -2903,6 +2924,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder
  dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
  ffnvcodec_deps_any="libdl LoadLibrary"
  nvdec_deps="ffnvcodec"
@@ -107,7 +109,7 @@
  vaapi_x11_deps="xlib"
  videotoolbox_hwaccel_deps="videotoolbox pthreads"
  videotoolbox_hwaccel_extralibs="-framework QuartzCore"
-@@ -2934,6 +2955,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP
+@@ -2934,6 +2956,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP
  hevc_dxva2_hwaccel_select="hevc_decoder"
  hevc_nvdec_hwaccel_deps="nvdec"
  hevc_nvdec_hwaccel_select="hevc_decoder"
@@ -120,16 +122,15 @@
  hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
  hevc_vaapi_hwaccel_select="hevc_decoder"
  hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
-@@ -3401,8 +3428,14 @@ sndio_indev_deps="sndio"
+@@ -3401,8 +3429,13 @@ sndio_indev_deps="sndio"
  sndio_outdev_deps="sndio"
  v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
  v4l2_indev_suggest="libv4l2"
 +v4l2_outdev_deps="libdrm"
  v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
  v4l2_outdev_suggest="libv4l2"
-+vout_drm_outdev_deps="libdrm vout_drm"
-+vout_egl_outdev_deps="xlib"
-+vout_egl_outdev_select="epoxy"
++vout_drm_outdev_deps="libdrm"
++vout_egl_outdev_deps="xlib epoxy"
 +vout_rpi_outdev_deps="rpi"
 +vout_rpi_outdev_select="sand"
  vfwcap_indev_deps="vfw32 vfwcap_defines"
@@ -143,23 +144,20 @@
  unsharp_opencl_filter_deps="opencl"
  uspp_filter_deps="gpl avcodec"
  vaguedenoiser_filter_deps="gpl"
-@@ -6299,6 +6333,7 @@ enabled libdav1d          && require_pkg
- enabled libdavs2          && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open
- enabled libdc1394         && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new
- enabled libdrm            && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion
-+enabled epoxy             && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
- enabled libfdk_aac        && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen ||
-                                { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac &&
-                                  warn "using libfdk without pkg-config"; } }
-@@ -6376,6 +6411,7 @@ enabled libtls            && require_pkg
- enabled libtwolame        && require libtwolame twolame.h twolame_init -ltwolame &&
-                              { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame ||
-                                die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; }
-+enabled libudev           && require_pkg_config libudev libudev libudev.h udev_new
- enabled libv4l2           && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl
- enabled libvidstab        && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit
- enabled libvmaf           && require_pkg_config libvmaf "libvmaf >= 1.3.9" libvmaf.h compute_vmaf
-@@ -6430,11 +6466,12 @@ enabled mbedtls           && { check_pkg
+@@ -6102,6 +6136,12 @@ check_func_headers glob.h glob
+ enabled xlib &&
+     check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext
+ 
++enabled libudev &&
++    check_pkg_config libudev libudev libudev.h udev_new
++
++enabled epoxy &&
++    check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
++
+ check_headers direct.h
+ check_headers dirent.h
+ check_headers dxgidebug.h
+@@ -6430,11 +6470,12 @@ enabled mbedtls           && { check_pkg
                                 check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto ||
                                 die "ERROR: mbedTLS not found"; }
  enabled mediacodec        && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
@@ -174,18 +172,24 @@
                                 die "ERROR: mmal not found" &&
                                 check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
  enabled openal            && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
-@@ -6475,6 +6512,10 @@ enabled rkmpp             && { require_p
+@@ -6475,8 +6516,16 @@ enabled rkmpp             && { require_p
                                 { enabled libdrm ||
                                   die "ERROR: rkmpp requires --enable-libdrm"; }
                               }
 +enabled v4l2_request      && { enabled libdrm ||
 +                               die "ERROR: v4l2-request requires --enable-libdrm"; } &&
 +                             { enabled libudev ||
-+                               die "ERROR: v4l2-request requires --enable-libudev"; }
++                               die "ERROR: v4l2-request requires libudev"; }
  enabled vapoursynth       && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
  
++enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
++
++enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
++                    { enabled xlib  || die "ERROR: vout_egl requires xlib"; }
  
-@@ -6556,6 +6597,8 @@ if enabled v4l2_m2m; then
+ if enabled gcrypt; then
+     GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
+@@ -6556,6 +6605,8 @@ if enabled v4l2_m2m; then
      check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
  fi
  
@@ -411,7 +415,7 @@
 +OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL)        += rpivid_hevc.o
 +OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL)       += rpivid_hevc.o
 +OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL)   += v4l2_request_hevc.o v4l2_req_decode_q.o\
-+                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o
++                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
  OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o h265_profile_level.o
  OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o
  OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL)        += nvdec_mjpeg.o
@@ -447,6 +451,1866 @@
 +$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
 +$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h
 +endif
+--- a/libavcodec/aarch64/Makefile
++++ b/libavcodec/aarch64/Makefile
+@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED)
+ NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
+                                            aarch64/hpeldsp_neon.o
+ NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
+-NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
++NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
++                                           aarch64/simple_idct_neon.o
+ NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
+ NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
+ NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
++NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
+ NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
+ 
+ # decoders/encoders
+--- a/libavcodec/aarch64/idctdsp_init_aarch64.c
++++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
+@@ -27,19 +27,29 @@
+ #include "libavcodec/idctdsp.h"
+ #include "idct.h"
+ 
++void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++
+ av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+                                      unsigned high_bit_depth)
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
+-    if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
+-        if (avctx->idct_algo == FF_IDCT_AUTO ||
+-            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+-            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+-            c->idct_put  = ff_simple_idct_put_neon;
+-            c->idct_add  = ff_simple_idct_add_neon;
+-            c->idct      = ff_simple_idct_neon;
+-            c->perm_type = FF_IDCT_PERM_PARTTRANS;
++    if (have_neon(cpu_flags)) { /* every override below requires NEON */
++        if (!avctx->lowres && !high_bit_depth) { /* simple IDCT: only when not lowres and not high bit depth */
++            if (avctx->idct_algo == FF_IDCT_AUTO ||
++                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
++                avctx->idct_algo == FF_IDCT_SIMPLENEON) { /* only for these three idct_algo values */
++                c->idct_put  = ff_simple_idct_put_neon;
++                c->idct_add  = ff_simple_idct_add_neon;
++                c->idct      = ff_simple_idct_neon;
++                c->perm_type = FF_IDCT_PERM_PARTTRANS;
++            }
+         }
++
++        c->add_pixels_clamped        = ff_add_pixels_clamped_neon; /* these three are set whenever NEON is present, */
++        c->put_pixels_clamped        = ff_put_pixels_clamped_neon; /* independent of lowres, bit depth and idct_algo */
++        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+     }
+ }
+--- /dev/null
++++ b/libavcodec/aarch64/idctdsp_neon.S
+@@ -0,0 +1,130 @@
++/*
++ * IDCT AArch64 NEON optimisations
++ *
++ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// Clamp 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++//   x0 -> array of 64x 16-bit coefficients
++//   x1 -> 8-bit results
++//   x2 = row stride for results, bytes
++function ff_put_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64  // rows 0-3 (8 coefficients each); x0 += 64
++        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]  // rows 4-7
++        sqxtun          v0.8b, v0.8h  // row 0: saturate signed 16-bit to unsigned 8-bit
++        sqxtun          v1.8b, v1.8h  // row 1
++        sqxtun          v2.8b, v2.8h  // row 2
++        sqxtun          v3.8b, v3.8h  // row 3
++        sqxtun          v4.8b, v4.8h  // row 4
++        st1             {v0.8b}, [x1], x2  // store row 0; x1 += stride
++        sqxtun          v0.8b, v5.8h  // row 5, reusing v0 now that row 0 is stored
++        st1             {v1.8b}, [x1], x2  // store row 1
++        sqxtun          v1.8b, v6.8h  // row 6, reusing v1
++        st1             {v2.8b}, [x1], x2  // store row 2
++        sqxtun          v2.8b, v7.8h  // row 7, reusing v2
++        st1             {v3.8b}, [x1], x2  // store row 3
++        st1             {v4.8b}, [x1], x2  // store row 4
++        st1             {v0.8b}, [x1], x2  // store row 5
++        st1             {v1.8b}, [x1], x2  // store row 6
++        st1             {v2.8b}, [x1]  // store row 7 (last row, no post-increment)
++        ret
++endfunc
++
++// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
++// On entry:
++//   x0 -> array of 64x 16-bit coefficients
++//   x1 -> 8-bit results
++//   x2 = row stride for results, bytes
++function ff_put_signed_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64  // rows 0-3 (8 coefficients each); x0 += 64
++        movi            v4.8b, #128  // bias added (modulo 256) to every narrowed byte
++        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]  // rows 4-7
++        sqxtn           v0.8b, v0.8h  // row 0: saturate signed 16-bit to signed 8-bit
++        sqxtn           v1.8b, v1.8h  // row 1
++        sqxtn           v2.8b, v2.8h  // row 2
++        sqxtn           v3.8b, v3.8h  // row 3
++        sqxtn           v5.8b, v16.8h  // row 4
++        add             v0.8b, v0.8b, v4.8b  // row 0 + 128
++        sqxtn           v6.8b, v17.8h  // row 5
++        add             v1.8b, v1.8b, v4.8b  // row 1 + 128
++        sqxtn           v7.8b, v18.8h  // row 6
++        add             v2.8b, v2.8b, v4.8b  // row 2 + 128
++        sqxtn           v16.8b, v19.8h  // row 7 (v16 is free: row 4 was already narrowed into v5)
++        add             v3.8b, v3.8b, v4.8b  // row 3 + 128
++        st1             {v0.8b}, [x1], x2  // store row 0; x1 += stride
++        add             v0.8b, v5.8b, v4.8b  // row 4 + 128, reusing v0
++        st1             {v1.8b}, [x1], x2  // store row 1
++        add             v1.8b, v6.8b, v4.8b  // row 5 + 128, reusing v1
++        st1             {v2.8b}, [x1], x2  // store row 2
++        add             v2.8b, v7.8b, v4.8b  // row 6 + 128, reusing v2
++        st1             {v3.8b}, [x1], x2  // store row 3
++        add             v3.8b, v16.8b, v4.8b  // row 7 + 128, reusing v3
++        st1             {v0.8b}, [x1], x2  // store row 4
++        st1             {v1.8b}, [x1], x2  // store row 5
++        st1             {v2.8b}, [x1], x2  // store row 6
++        st1             {v3.8b}, [x1]  // store row 7 (last row, no post-increment)
++        ret
++endfunc
++
++// Add 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++//   x0 -> array of 64x 16-bit coefficients
++//   x1 -> 8-bit input and results
++//   x2 = row stride for 8-bit input and results, bytes
++function ff_add_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64  // coefficient rows 0-3; x0 += 64
++        mov             x3, x1  // x3 = store pointer; x1 is advanced by the pixel loads
++        ld1             {v4.8b}, [x1], x2  // pixel row 0
++        ld1             {v5.8b}, [x1], x2  // pixel row 1
++        ld1             {v6.8b}, [x1], x2  // pixel row 2
++        ld1             {v7.8b}, [x1], x2  // pixel row 3
++        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]  // coefficient rows 4-7
++        uaddw           v0.8h, v0.8h, v4.8b  // row 0: coefficients + zero-extended pixels
++        uaddw           v1.8h, v1.8h, v5.8b  // row 1
++        uaddw           v2.8h, v2.8h, v6.8b  // row 2
++        ld1             {v4.8b}, [x1], x2  // pixel row 4 (v4 already consumed for row 0)
++        uaddw           v3.8h, v3.8h, v7.8b  // row 3
++        ld1             {v5.8b}, [x1], x2  // pixel row 5
++        sqxtun          v0.8b, v0.8h  // row 0: saturate to unsigned 8-bit
++        ld1             {v6.8b}, [x1], x2  // pixel row 6
++        sqxtun          v1.8b, v1.8h  // row 1
++        ld1             {v7.8b}, [x1]  // pixel row 7 (last load, no post-increment)
++        sqxtun          v2.8b, v2.8h  // row 2
++        sqxtun          v3.8b, v3.8h  // row 3
++        uaddw           v4.8h, v16.8h, v4.8b  // row 4 sum
++        st1             {v0.8b}, [x3], x2  // store row 0; x3 += stride
++        uaddw           v0.8h, v17.8h, v5.8b  // row 5 sum, reusing v0
++        st1             {v1.8b}, [x3], x2  // store row 1
++        uaddw           v1.8h, v18.8h, v6.8b  // row 6 sum, reusing v1
++        st1             {v2.8b}, [x3], x2  // store row 2
++        uaddw           v2.8h, v19.8h, v7.8b  // row 7 sum, reusing v2
++        sqxtun          v4.8b, v4.8h  // row 4
++        sqxtun          v0.8b, v0.8h  // row 5
++        st1             {v3.8b}, [x3], x2  // store row 3
++        sqxtun          v1.8b, v1.8h  // row 6
++        sqxtun          v2.8b, v2.8h  // row 7
++        st1             {v4.8b}, [x3], x2  // store row 4
++        st1             {v0.8b}, [x3], x2  // store row 5
++        st1             {v1.8b}, [x3], x2  // store row 6
++        st1             {v2.8b}, [x3]  // store row 7 (last row, no post-increment)
++        ret
++endfunc
+--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
++++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
+@@ -21,10 +21,28 @@
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/aarch64/cpu.h"
++#include "libavutil/intreadwrite.h"
+ #include "libavcodec/vc1dsp.h"
+ 
+ #include "config.h"
+ 
++void ff_vc1_inv_trans_8x8_neon(int16_t *block);
++void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++
+ void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+ void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+ 
++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
++
++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
++{
++    /* Copy src to dst, dropping each 0x03 escape byte; returns bytes written.
++     * Start-up, tail and escape removal are less time-sensitive, so they are
++     * handled in this C wrapper around the assembly inner loop. Note that we
++     * assume a little-endian machine that supports unaligned loads. */
++    int dsize = 0; /* bytes written to dst so far */
++    while (size >= 4) /* the escape test below reads a 4-byte window */
++    {
++        int found = 0;
++        while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* byte-wise until dst is 8-byte aligned */
++        {
++            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* bytes 00 00 03 followed by a byte <= 3 */
++            if (!found)
++            {
++                *dst++ = *src++;
++                --size;
++                ++dsize;
++            }
++        }
++        if (!found)
++        {
++            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* NOTE(review): treats the helper result as bytes left unprocessed - confirm against the asm */
++            dst += skip;
++            src += skip;
++            size -= skip;
++            dsize += skip;
++            while (!found && size >= 4) /* continue byte-wise to the next escape or to the tail */
++            {
++                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* same escape test as above */
++                if (!found)
++                {
++                    *dst++ = *src++;
++                    --size;
++                    ++dsize;
++                }
++            }
++        }
++        if (found)
++        {
++            *dst++ = *src++; /* keep the two leading zero bytes */
++            *dst++ = *src++;
++            ++src; /* drop the 0x03 escape byte */
++            size -= 3;
++            dsize += 2;
++        }
++    }
++    while (size > 0) /* fewer than 4 bytes left: copied verbatim, no escape test */
++    {
++        *dst++ = *src++;
++        --size;
++        ++dsize;
++    }
++    return dsize; /* number of bytes written to dst */
++}
++
+ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
+     if (have_neon(cpu_flags)) {
++        dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; /* in-place on block (no dest/stride argument) */
++        dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; /* the remaining transforms take dest, stride, block */
++        dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
++        dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
++        dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; /* presumably DC-only fast paths - TODO confirm against vc1dsp.h */
++        dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
++        dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
++        dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
++
++        dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon; /* loop filters: v/h variants for sizes 4, 8 and 16 */
++        dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
++        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
++        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
++        dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
++        dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
++
+         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
+         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+         dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+         dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
++
++        dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; /* static C wrapper in this file around ff_vc1_unescape_buffer_helper_neon */
+     }
+ }
+--- /dev/null
++++ b/libavcodec/aarch64/vc1dsp_neon.S
+@@ -0,0 +1,1546 @@
++/*
++ * VC1 AArch64 NEON optimisations
++ *
++ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// VC-1 8x8 inverse transform
++// On entry:
++//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
++// On exit:
++//   array at x0 updated to hold transformed block; also now held in row-major order
++function ff_vc1_inv_trans_8x8_neon, export=1
++        ld1             {v1.16b, v2.16b}, [x0], #32
++        ld1             {v3.16b, v4.16b}, [x0], #32
++        ld1             {v5.16b, v6.16b}, [x0], #32
++        shl             v1.8h, v1.8h, #2        //         8/2 * src[0]
++        sub             x1, x0, #3*32
++        ld1             {v16.16b, v17.16b}, [x0]
++        shl             v7.8h, v2.8h, #4        //          16 * src[8]
++        shl             v18.8h, v2.8h, #2       //           4 * src[8]
++        shl             v19.8h, v4.8h, #4       //                        16 * src[24]
++        ldr             d0, .Lcoeffs_it8
++        shl             v5.8h, v5.8h, #2        //                                      8/2 * src[32]
++        shl             v20.8h, v6.8h, #4       //                                       16 * src[40]
++        shl             v21.8h, v6.8h, #2       //                                        4 * src[40]
++        shl             v22.8h, v17.8h, #4      //                                                      16 * src[56]
++        ssra            v20.8h, v19.8h, #2      //                         4 * src[24] + 16 * src[40]
++        mul             v23.8h, v3.8h, v0.h[0]  //                       6/2 * src[16]
++        sub             v19.8h, v19.8h, v21.8h  //                        16 * src[24] -  4 * src[40]
++        ssra            v7.8h, v22.8h, #2       //          16 * src[8]                               +  4 * src[56]
++        sub             v18.8h, v22.8h, v18.8h  //        -  4 * src[8]                               + 16 * src[56]
++        shl             v3.8h, v3.8h, #3        //                      16/2 * src[16]
++        mls             v20.8h, v2.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
++        ssra            v1.8h, v1.8h, #1        //        12/2 * src[0]
++        ssra            v5.8h, v5.8h, #1        //                                     12/2 * src[32]
++        mla             v7.8h, v4.8h, v0.h[2]   //          16 * src[8] + 15 * src[24]                +  4 * src[56]
++        shl             v21.8h, v16.8h, #3      //                                                    16/2 * src[48]
++        mls             v19.8h, v2.8h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
++        sub             v2.8h, v23.8h, v21.8h   // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
++        mla             v18.8h, v4.8h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
++        add             v4.8h, v1.8h, v5.8h     // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
++        sub             v1.8h, v1.8h, v5.8h     // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
++        mla             v3.8h, v16.8h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
++        mla             v7.8h, v6.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
++        add             v5.8h, v1.8h, v2.8h     // t6/2 = t2/2 + t4/2
++        sub             v16.8h, v1.8h, v2.8h    // t7/2 = t2/2 - t4/2
++        mla             v20.8h, v17.8h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
++        add             v21.8h, v1.8h, v2.8h    // t6/2 = t2/2 + t4/2
++        add             v22.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
++        mls             v19.8h, v17.8h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
++        sub             v17.8h, v4.8h, v3.8h    // t8/2 = t1/2 - t3/2
++        add             v23.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
++        mls             v18.8h, v6.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
++        sub             v1.8h, v1.8h, v2.8h     // t7/2 = t2/2 - t4/2
++        sub             v2.8h, v4.8h, v3.8h     // t8/2 = t1/2 - t3/2
++        neg             v3.8h, v7.8h            // -t1
++        neg             v4.8h, v20.8h           // +t2
++        neg             v6.8h, v19.8h           // +t3
++        ssra            v22.8h, v7.8h, #1       // (t5 + t1) >> 1
++        ssra            v1.8h, v19.8h, #1       // (t7 - t3) >> 1
++        neg             v7.8h, v18.8h           // +t4
++        ssra            v5.8h, v4.8h, #1        // (t6 + t2) >> 1
++        ssra            v16.8h, v6.8h, #1       // (t7 + t3) >> 1
++        ssra            v2.8h, v18.8h, #1       // (t8 - t4) >> 1
++        ssra            v17.8h, v7.8h, #1       // (t8 + t4) >> 1
++        ssra            v21.8h, v20.8h, #1      // (t6 - t2) >> 1
++        ssra            v23.8h, v3.8h, #1       // (t5 - t1) >> 1
++        srshr           v3.8h, v22.8h, #2       // (t5 + t1 + 4) >> 3
++        srshr           v4.8h, v5.8h, #2        // (t6 + t2 + 4) >> 3
++        srshr           v5.8h, v16.8h, #2       // (t7 + t3 + 4) >> 3
++        srshr           v6.8h, v17.8h, #2       // (t8 + t4 + 4) >> 3
++        srshr           v2.8h, v2.8h, #2        // (t8 - t4 + 4) >> 3
++        srshr           v1.8h, v1.8h, #2        // (t7 - t3 + 4) >> 3
++        srshr           v7.8h, v21.8h, #2       // (t6 - t2 + 4) >> 3
++        srshr           v16.8h, v23.8h, #2      // (t5 - t1 + 4) >> 3
++        trn2            v17.8h, v3.8h, v4.8h
++        trn2            v18.8h, v5.8h, v6.8h
++        trn2            v19.8h, v2.8h, v1.8h
++        trn2            v20.8h, v7.8h, v16.8h
++        trn1            v21.4s, v17.4s, v18.4s
++        trn2            v17.4s, v17.4s, v18.4s
++        trn1            v18.4s, v19.4s, v20.4s
++        trn2            v19.4s, v19.4s, v20.4s
++        trn1            v3.8h, v3.8h, v4.8h
++        trn2            v4.2d, v21.2d, v18.2d
++        trn1            v20.2d, v17.2d, v19.2d
++        trn1            v5.8h, v5.8h, v6.8h
++        trn1            v1.8h, v2.8h, v1.8h
++        trn1            v2.8h, v7.8h, v16.8h
++        trn1            v6.2d, v21.2d, v18.2d
++        trn2            v7.2d, v17.2d, v19.2d
++        shl             v16.8h, v20.8h, #4      //                        16 * src[24]
++        shl             v17.8h, v4.8h, #4       //                                       16 * src[40]
++        trn1            v18.4s, v3.4s, v5.4s
++        trn1            v19.4s, v1.4s, v2.4s
++        shl             v21.8h, v7.8h, #4       //                                                      16 * src[56]
++        shl             v22.8h, v6.8h, #2       //           4 * src[8]
++        shl             v23.8h, v4.8h, #2       //                                        4 * src[40]
++        trn2            v3.4s, v3.4s, v5.4s
++        trn2            v1.4s, v1.4s, v2.4s
++        shl             v2.8h, v6.8h, #4        //          16 * src[8]
++        sub             v5.8h, v16.8h, v23.8h   //                        16 * src[24] -  4 * src[40]
++        ssra            v17.8h, v16.8h, #2      //                         4 * src[24] + 16 * src[40]
++        sub             v16.8h, v21.8h, v22.8h  //        -  4 * src[8]                               + 16 * src[56]
++        trn1            v22.2d, v18.2d, v19.2d
++        trn2            v18.2d, v18.2d, v19.2d
++        trn1            v19.2d, v3.2d, v1.2d
++        ssra            v2.8h, v21.8h, #2       //          16 * src[8]                               +  4 * src[56]
++        mls             v17.8h, v6.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
++        shl             v21.8h, v22.8h, #2      //         8/2 * src[0]
++        shl             v18.8h, v18.8h, #2      //                                      8/2 * src[32]
++        mls             v5.8h, v6.8h, v0.h[1]   //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
++        shl             v6.8h, v19.8h, #3       //                      16/2 * src[16]
++        trn2            v1.2d, v3.2d, v1.2d
++        mla             v16.8h, v20.8h, v0.h[1] //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
++        ssra            v21.8h, v21.8h, #1      //        12/2 * src[0]
++        ssra            v18.8h, v18.8h, #1      //                                     12/2 * src[32]
++        mul             v3.8h, v19.8h, v0.h[0]  //                       6/2 * src[16]
++        shl             v19.8h, v1.8h, #3       //                                                    16/2 * src[48]
++        mla             v2.8h, v20.8h, v0.h[2]  //          16 * src[8] + 15 * src[24]                +  4 * src[56]
++        add             v20.8h, v21.8h, v18.8h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
++        mla             v6.8h, v1.8h, v0.h[0]   // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
++        sub             v1.8h, v21.8h, v18.8h   // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
++        sub             v3.8h, v3.8h, v19.8h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
++        mla             v17.8h, v7.8h, v0.h[1]  // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
++        mls             v5.8h, v7.8h, v0.h[2]   // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
++        add             v7.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
++        add             v18.8h, v20.8h, v6.8h   // t5/2 = t1/2 + t3/2
++        mls             v16.8h, v4.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
++        sub             v19.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
++        neg             v21.8h, v17.8h          // +t2
++        mla             v2.8h, v4.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
++        sub             v0.8h, v20.8h, v6.8h    // t8/2 = t1/2 - t3/2
++        neg             v4.8h, v5.8h            // +t3
++        sub             v22.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
++        sub             v23.8h, v20.8h, v6.8h   // t8/2 = t1/2 - t3/2
++        neg             v24.8h, v16.8h          // +t4
++        add             v6.8h, v20.8h, v6.8h    // t5/2 = t1/2 + t3/2
++        add             v1.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
++        ssra            v7.8h, v21.8h, #1       // (t6 + t2) >> 1
++        neg             v3.8h, v2.8h            // -t1
++        ssra            v18.8h, v2.8h, #1       // (t5 + t1) >> 1
++        ssra            v19.8h, v4.8h, #1       // (t7 + t3) >> 1
++        ssra            v0.8h, v24.8h, #1       // (t8 + t4) >> 1
++        srsra           v23.8h, v16.8h, #1      // (t8 - t4 + 1) >> 1
++        srsra           v22.8h, v5.8h, #1       // (t7 - t3 + 1) >> 1
++        srsra           v1.8h, v17.8h, #1       // (t6 - t2 + 1) >> 1
++        srsra           v6.8h, v3.8h, #1        // (t5 - t1 + 1) >> 1
++        srshr           v2.8h, v18.8h, #6       // (t5 + t1 + 64) >> 7
++        srshr           v3.8h, v7.8h, #6        // (t6 + t2 + 64) >> 7
++        srshr           v4.8h, v19.8h, #6       // (t7 + t3 + 64) >> 7
++        srshr           v5.8h, v0.8h, #6        // (t8 + t4 + 64) >> 7
++        srshr           v16.8h, v23.8h, #6      // (t8 - t4 + 65) >> 7
++        srshr           v17.8h, v22.8h, #6      // (t7 - t3 + 65) >> 7
++        st1             {v2.16b, v3.16b}, [x1], #32
++        srshr           v0.8h, v1.8h, #6        // (t6 - t2 + 65) >> 7
++        srshr           v1.8h, v6.8h, #6        // (t5 - t1 + 65) >> 7
++        st1             {v4.16b, v5.16b}, [x1], #32
++        st1             {v16.16b, v17.16b}, [x1], #32
++        st1             {v0.16b, v1.16b}, [x1]
++        ret
++endfunc
++
++// VC-1 8x4 inverse transform (8-point pass along the rows, then 4-point pass down the columns)
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (4 contiguous rows of 8)
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x4_neon, export=1
++        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32 // rows 0-1: 00..03, 04..07, 10..13, 14..17 (digits = row, column)
++        mov             x3, x0                  // keep the original sample pointer for the stores at the end
++        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x2] // rows 2-3: 20..23, 24..27, 30..33, 34..37
++        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
++        ld1             {v5.8b}, [x0], x1       // sample row 0
++        trn2            v6.4h, v1.4h, v3.4h     // 01 11 03 13
++        trn2            v7.4h, v2.4h, v4.4h     // 05 15 07 17
++        trn1            v1.4h, v1.4h, v3.4h     // 00 10 02 12
++        trn1            v2.4h, v2.4h, v4.4h     // 04 14 06 16
++        trn2            v3.4h, v16.4h, v18.4h   // 21 31 23 33
++        trn2            v4.4h, v17.4h, v19.4h   // 25 35 27 37
++        trn1            v16.4h, v16.4h, v18.4h  // 20 30 22 32
++        trn1            v17.4h, v17.4h, v19.4h  // 24 34 26 36
++        ld1             {v18.8b}, [x0], x1      // sample row 1 (v18 is free again after the transposes above)
++        trn1            v19.2s, v6.2s, v3.2s    // 01 11 21 31
++        trn2            v3.2s, v6.2s, v3.2s     // 03 13 23 33
++        trn1            v6.2s, v7.2s, v4.2s     // 05 15 25 35
++        trn2            v4.2s, v7.2s, v4.2s     // 07 17 27 37
++        trn1            v7.2s, v1.2s, v16.2s    // 00 10 20 30
++        trn1            v20.2s, v2.2s, v17.2s   // 04 14 24 34
++        shl             v21.4h, v19.4h, #4      //          16 * src[1]
++        trn2            v1.2s, v1.2s, v16.2s    // 02 12 22 32
++        shl             v16.4h, v3.4h, #4       //                        16 * src[3]
++        trn2            v2.2s, v2.2s, v17.2s    // 06 16 26 36
++        shl             v17.4h, v6.4h, #4       //                                      16 * src[5]
++        ld1             {v22.8b}, [x0], x1      // sample row 2
++        shl             v23.4h, v4.4h, #4       //                                                    16 * src[7]
++        mul             v24.4h, v1.4h, v0.h[0]  //                       6/2 * src[2]
++        ld1             {v25.8b}, [x0]          // sample row 3
++        shl             v26.4h, v19.4h, #2      //           4 * src[1]
++        shl             v27.4h, v6.4h, #2       //                                       4 * src[5]
++        ssra            v21.4h, v23.4h, #2      //          16 * src[1]                             +  4 * src[7]
++        ssra            v17.4h, v16.4h, #2      //                         4 * src[3] + 16 * src[5]
++        sub             v23.4h, v23.4h, v26.4h  //        -  4 * src[1]                             + 16 * src[7]
++        sub             v16.4h, v16.4h, v27.4h  //                        16 * src[3] -  4 * src[5]
++        shl             v7.4h, v7.4h, #2        //         8/2 * src[0]
++        shl             v20.4h, v20.4h, #2      //                                     8/2 * src[4]
++        mla             v21.4h, v3.4h, v0.h[2]  //          16 * src[1] + 15 * src[3]               +  4 * src[7]
++        shl             v1.4h, v1.4h, #3        //                      16/2 * src[2]
++        mls             v17.4h, v19.4h, v0.h[2] //        - 15 * src[1] +  4 * src[3] + 16 * src[5]
++        ssra            v7.4h, v7.4h, #1        //        12/2 * src[0]
++        mls             v16.4h, v19.4h, v0.h[1] //        -  9 * src[1] + 16 * src[3] -  4 * src[5]
++        ssra            v20.4h, v20.4h, #1      //                                    12/2 * src[4]
++        mla             v23.4h, v3.4h, v0.h[1]  //        -  4 * src[1] +  9 * src[3]               + 16 * src[7]
++        shl             v3.4h, v2.4h, #3        //                                                  16/2 * src[6]
++        mla             v1.4h, v2.4h, v0.h[0]   // t3/2 =               16/2 * src[2]             +  6/2 * src[6]
++        mla             v21.4h, v6.4h, v0.h[1]  //  t1  =   16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7]
++        mla             v17.4h, v4.4h, v0.h[1]  // -t2  = - 15 * src[1] +  4 * src[3] + 16 * src[5] +  9 * src[7]
++        sub             v2.4h, v24.4h, v3.4h    // t4/2 =                6/2 * src[2]             - 16/2 * src[6]
++        mls             v16.4h, v4.4h, v0.h[2]  // -t3  = -  9 * src[1] + 16 * src[3] -  4 * src[5] - 15 * src[7]
++        add             v3.4h, v7.4h, v20.4h    // t1/2 = 12/2 * src[0]             + 12/2 * src[4]
++        mls             v23.4h, v6.4h, v0.h[2]  // -t4  = -  4 * src[1] +  9 * src[3] - 15 * src[5] + 16 * src[7]
++        sub             v4.4h, v7.4h, v20.4h    // t2/2 = 12/2 * src[0]             - 12/2 * src[4]
++        neg             v6.4h, v21.4h           // -t1
++        add             v7.4h, v3.4h, v1.4h     // t5/2 = t1/2 + t3/2
++        sub             v19.4h, v3.4h, v1.4h    // t8/2 = t1/2 - t3/2
++        add             v20.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
++        sub             v24.4h, v4.4h, v2.4h    // t7/2 = t2/2 - t4/2
++        add             v26.4h, v3.4h, v1.4h    // t5/2 = t1/2 + t3/2
++        add             v27.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
++        sub             v2.4h, v4.4h, v2.4h     // t7/2 = t2/2 - t4/2
++        sub             v1.4h, v3.4h, v1.4h     // t8/2 = t1/2 - t3/2
++        neg             v3.4h, v17.4h           // +t2
++        neg             v4.4h, v16.4h           // +t3
++        neg             v28.4h, v23.4h          // +t4
++        ssra            v7.4h, v21.4h, #1       // (t5 + t1) >> 1
++        ssra            v1.4h, v23.4h, #1       // (t8 - t4) >> 1
++        ssra            v20.4h, v3.4h, #1       // (t6 + t2) >> 1
++        ssra            v24.4h, v4.4h, #1       // (t7 + t3) >> 1
++        ssra            v19.4h, v28.4h, #1      // (t8 + t4) >> 1
++        ssra            v2.4h, v16.4h, #1       // (t7 - t3) >> 1
++        ssra            v27.4h, v17.4h, #1      // (t6 - t2) >> 1
++        ssra            v26.4h, v6.4h, #1       // (t5 - t1) >> 1
++        trn1            v1.2d, v7.2d, v1.2d     // output columns 0 | 4 (two 4-lane results packed per vector)
++        trn1            v2.2d, v20.2d, v2.2d    // output columns 1 | 5
++        trn1            v3.2d, v24.2d, v27.2d   // output columns 2 | 6
++        trn1            v4.2d, v19.2d, v26.2d   // output columns 3 | 7
++        srshr           v1.8h, v1.8h, #2        // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
++        srshr           v2.8h, v2.8h, #2        // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
++        srshr           v3.8h, v3.8h, #2        // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
++        srshr           v4.8h, v4.8h, #2        // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
++        trn2            v6.8h, v1.8h, v2.8h     // 10 11 30 31 14 15 34 35
++        trn1            v1.8h, v1.8h, v2.8h     // 00 01 20 21 04 05 24 25
++        trn2            v2.8h, v3.8h, v4.8h     // 12 13 32 33 16 17 36 37
++        trn1            v3.8h, v3.8h, v4.8h     // 02 03 22 23 06 07 26 27
++        trn2            v4.4s, v6.4s, v2.4s     // 30 31 32 33 34 35 36 37
++        trn1            v7.4s, v1.4s, v3.4s     // 00 01 02 03 04 05 06 07
++        trn2            v1.4s, v1.4s, v3.4s     // 20 21 22 23 24 25 26 27
++        mul             v3.8h, v4.8h, v0.h[5]   //                                                           22/2 * src[24]
++        trn1            v2.4s, v6.4s, v2.4s     // 10 11 12 13 14 15 16 17
++        mul             v4.8h, v4.8h, v0.h[4]   //                                                           10/2 * src[24]
++        mul             v6.8h, v7.8h, v0.h[6]   //            17 * src[0]
++        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[16]
++        mls             v3.8h, v2.8h, v0.h[4]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
++        mla             v4.8h, v2.8h, v0.h[5]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
++        add             v0.8h, v6.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[16]
++        sub             v1.8h, v6.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[16]
++        neg             v2.8h, v3.8h            // -t4/2
++        neg             v6.8h, v4.8h            // -t3/2
++        ssra            v4.8h, v0.8h, #1        // (t1 + t3) >> 1
++        ssra            v2.8h, v1.8h, #1        // (t2 - t4) >> 1
++        ssra            v3.8h, v1.8h, #1        // (t2 + t4) >> 1
++        ssra            v6.8h, v0.8h, #1        // (t1 - t3) >> 1
++        srshr           v0.8h, v4.8h, #6        // (t1 + t3 + 64) >> 7
++        srshr           v1.8h, v2.8h, #6        // (t2 - t4 + 64) >> 7
++        srshr           v2.8h, v3.8h, #6        // (t2 + t4 + 64) >> 7
++        srshr           v3.8h, v6.8h, #6        // (t1 - t3 + 64) >> 7
++        uaddw           v0.8h, v0.8h, v5.8b     // row 0: add the samples, widened to 16 bits
++        uaddw           v1.8h, v1.8h, v18.8b    // row 1
++        uaddw           v2.8h, v2.8h, v22.8b    // row 2
++        uaddw           v3.8h, v3.8h, v25.8b    // row 3
++        sqxtun          v0.8b, v0.8h            // row 0: saturate to unsigned 8 bits
++        sqxtun          v1.8b, v1.8h            // row 1
++        sqxtun          v2.8b, v2.8h            // row 2
++        sqxtun          v3.8b, v3.8h            // row 3
++        st1             {v0.8b}, [x3], x1       // store row 0
++        st1             {v1.8b}, [x3], x1       // store row 1
++        st1             {v2.8b}, [x3], x1       // store row 2
++        st1             {v3.8b}, [x3]           // store row 3
++        ret
++endfunc
++
++// VC-1 4x8 inverse transform (4-point pass along the rows, then 8-point pass down the columns)
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x8_neon, export=1
++        mov             x3, #16                 // byte stride between coefficient rows (8 x 16-bit)
++        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
++        mov             x4, x0                  // keep the original sample pointer for the stores at the end
++        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
++        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
++        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
++        ld1             {v4.d}[0], [x2], x3     // 30 31 32 33
++        ld1             {v1.d}[1], [x2], x3     // 40 41 42 43
++        ld1             {v2.d}[1], [x2], x3     // 50 51 52 53
++        ld1             {v3.d}[1], [x2], x3     // 60 61 62 63
++        ld1             {v4.d}[1], [x2]         // 70 71 72 73
++        ld1             {v5.s}[0], [x0], x1     // sample row 0
++        ld1             {v6.s}[0], [x0], x1     // sample row 1
++        ld1             {v7.s}[0], [x0], x1     // sample row 2
++        trn2            v16.8h, v1.8h, v2.8h    // 01 11 03 13 41 51 43 53
++        trn1            v1.8h, v1.8h, v2.8h     // 00 10 02 12 40 50 42 52
++        trn2            v2.8h, v3.8h, v4.8h     // 21 31 23 33 61 71 63 73
++        trn1            v3.8h, v3.8h, v4.8h     // 20 30 22 32 60 70 62 72
++        ld1             {v4.s}[0], [x0], x1     // sample row 3 (v4 is free again after the transposes above)
++        trn2            v17.4s, v16.4s, v2.4s   // 03 13 23 33 43 53 63 73
++        trn1            v18.4s, v1.4s, v3.4s    // 00 10 20 30 40 50 60 70
++        trn1            v2.4s, v16.4s, v2.4s    // 01 11 21 31 41 51 61 71
++        mul             v16.8h, v17.8h, v0.h[4] //                                                          10/2 * src[3]
++        ld1             {v5.s}[1], [x0], x1     // sample row 4
++        mul             v17.8h, v17.8h, v0.h[5] //                                                          22/2 * src[3]
++        ld1             {v6.s}[1], [x0], x1     // sample row 5
++        trn2            v1.4s, v1.4s, v3.4s     // 02 12 22 32 42 52 62 72
++        mul             v3.8h, v18.8h, v0.h[6]  //            17 * src[0]
++        ld1             {v7.s}[1], [x0], x1     // sample row 6
++        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[2]
++        ld1             {v4.s}[1], [x0]         // sample row 7
++        mla             v16.8h, v2.8h, v0.h[5]  //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
++        mls             v17.8h, v2.8h, v0.h[4]  //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
++        add             v2.8h, v3.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[2]
++        sub             v1.8h, v3.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[2]
++        neg             v3.8h, v16.8h           // -t3/2
++        ssra            v16.8h, v2.8h, #1       // (t1 + t3) >> 1
++        neg             v18.8h, v17.8h          // -t4/2
++        ssra            v17.8h, v1.8h, #1       // (t2 + t4) >> 1
++        ssra            v3.8h, v2.8h, #1        // (t1 - t3) >> 1
++        ssra            v18.8h, v1.8h, #1       // (t2 - t4) >> 1
++        srshr           v1.8h, v16.8h, #2       // (t1 + t3 + 4) >> 3
++        srshr           v2.8h, v17.8h, #2       // (t2 + t4 + 4) >> 3
++        srshr           v3.8h, v3.8h, #2        // (t1 - t3 + 4) >> 3
++        srshr           v16.8h, v18.8h, #2      // (t2 - t4 + 4) >> 3
++        trn2            v17.8h, v2.8h, v3.8h    // 12 13 32 33 52 53 72 73
++        trn2            v18.8h, v1.8h, v16.8h   // 10 11 30 31 50 51 70 71
++        trn1            v1.8h, v1.8h, v16.8h    // 00 01 20 21 40 41 60 61
++        trn1            v2.8h, v2.8h, v3.8h     // 02 03 22 23 42 43 62 63
++        trn1            v3.4s, v18.4s, v17.4s   // 10 11 12 13 50 51 52 53
++        trn2            v16.4s, v18.4s, v17.4s  // 30 31 32 33 70 71 72 73
++        trn1            v17.4s, v1.4s, v2.4s    // 00 01 02 03 40 41 42 43
++        mov             d18, v3.d[1]            // 50 51 52 53
++        shl             v19.4h, v3.4h, #4       //          16 * src[8]
++        mov             d20, v16.d[1]           // 70 71 72 73
++        shl             v21.4h, v16.4h, #4      //                        16 * src[24]
++        mov             d22, v17.d[1]           // 40 41 42 43
++        shl             v23.4h, v3.4h, #2       //           4 * src[8]
++        shl             v24.4h, v18.4h, #4      //                                       16 * src[40]
++        shl             v25.4h, v20.4h, #4      //                                                      16 * src[56]
++        shl             v26.4h, v18.4h, #2      //                                        4 * src[40]
++        trn2            v1.4s, v1.4s, v2.4s     // 20 21 22 23 60 61 62 63
++        ssra            v24.4h, v21.4h, #2      //                         4 * src[24] + 16 * src[40]
++        sub             v2.4h, v25.4h, v23.4h   //        -  4 * src[8]                               + 16 * src[56]
++        shl             v17.4h, v17.4h, #2      //         8/2 * src[0]
++        sub             v21.4h, v21.4h, v26.4h  //                        16 * src[24] -  4 * src[40]
++        shl             v22.4h, v22.4h, #2      //                                      8/2 * src[32]
++        mov             d23, v1.d[1]            // 60 61 62 63
++        ssra            v19.4h, v25.4h, #2      //          16 * src[8]                               +  4 * src[56]
++        mul             v25.4h, v1.4h, v0.h[0]  //                       6/2 * src[16]
++        shl             v1.4h, v1.4h, #3        //                      16/2 * src[16]
++        mls             v24.4h, v3.4h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
++        ssra            v17.4h, v17.4h, #1      //        12/2 * src[0]
++        mls             v21.4h, v3.4h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
++        ssra            v22.4h, v22.4h, #1      //                                     12/2 * src[32]
++        mla             v2.4h, v16.4h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
++        shl             v3.4h, v23.4h, #3       //                                                    16/2 * src[48]
++        mla             v19.4h, v16.4h, v0.h[2] //          16 * src[8] + 15 * src[24]                +  4 * src[56]
++        mla             v1.4h, v23.4h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
++        mla             v24.4h, v20.4h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
++        add             v16.4h, v17.4h, v22.4h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
++        sub             v3.4h, v25.4h, v3.4h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
++        sub             v17.4h, v17.4h, v22.4h  // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
++        mls             v21.4h, v20.4h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
++        mla             v19.4h, v18.4h, v0.h[1] //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
++        add             v20.4h, v16.4h, v1.4h   // t5/2 = t1/2 + t3/2
++        mls             v2.4h, v18.4h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
++        sub             v0.4h, v16.4h, v1.4h    // t8/2 = t1/2 - t3/2
++        add             v18.4h, v17.4h, v3.4h   // t6/2 = t2/2 + t4/2
++        sub             v22.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
++        neg             v23.4h, v24.4h          // +t2
++        sub             v25.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
++        add             v3.4h, v17.4h, v3.4h    // t6/2 = t2/2 + t4/2
++        neg             v17.4h, v21.4h          // +t3
++        sub             v26.4h, v16.4h, v1.4h   // t8/2 = t1/2 - t3/2
++        add             v1.4h, v16.4h, v1.4h    // t5/2 = t1/2 + t3/2
++        neg             v16.4h, v19.4h          // -t1
++        neg             v27.4h, v2.4h           // +t4
++        ssra            v20.4h, v19.4h, #1      // (t5 + t1) >> 1
++        srsra           v0.4h, v2.4h, #1        // (t8 - t4 + 1) >> 1
++        ssra            v18.4h, v23.4h, #1      // (t6 + t2) >> 1
++        srsra           v22.4h, v21.4h, #1      // (t7 - t3 + 1) >> 1
++        ssra            v25.4h, v17.4h, #1      // (t7 + t3) >> 1
++        srsra           v3.4h, v24.4h, #1       // (t6 - t2 + 1) >> 1
++        ssra            v26.4h, v27.4h, #1      // (t8 + t4) >> 1
++        srsra           v1.4h, v16.4h, #1       // (t5 - t1 + 1) >> 1
++        trn1            v0.2d, v20.2d, v0.2d    // output rows 0 | 4 (two 4-lane results packed per vector)
++        trn1            v2.2d, v18.2d, v22.2d   // output rows 1 | 5
++        trn1            v3.2d, v25.2d, v3.2d    // output rows 2 | 6
++        trn1            v1.2d, v26.2d, v1.2d    // output rows 3 | 7
++        srshr           v0.8h, v0.8h, #6        // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
++        srshr           v2.8h, v2.8h, #6        // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
++        srshr           v3.8h, v3.8h, #6        // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
++        srshr           v1.8h, v1.8h, #6        // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
++        uaddw           v0.8h, v0.8h, v5.8b     // rows 0 | 4: add the samples, widened to 16 bits
++        uaddw           v2.8h, v2.8h, v6.8b     // rows 1 | 5
++        uaddw           v3.8h, v3.8h, v7.8b     // rows 2 | 6
++        uaddw           v1.8h, v1.8h, v4.8b     // rows 3 | 7
++        sqxtun          v0.8b, v0.8h            // rows 0 | 4: saturate to unsigned 8 bits
++        sqxtun          v2.8b, v2.8h            // rows 1 | 5
++        sqxtun          v3.8b, v3.8h            // rows 2 | 6
++        sqxtun          v1.8b, v1.8h            // rows 3 | 7
++        st1             {v0.s}[0], [x4], x1     // store row 0
++        st1             {v2.s}[0], [x4], x1     // store row 1
++        st1             {v3.s}[0], [x4], x1     // store row 2
++        st1             {v1.s}[0], [x4], x1     // store row 3
++        st1             {v0.s}[1], [x4], x1     // store row 4
++        st1             {v2.s}[1], [x4], x1     // store row 5
++        st1             {v3.s}[1], [x4], x1     // store row 6
++        st1             {v1.s}[1], [x4]         // store row 7
++        ret
++endfunc
++
++// VC-1 4x4 inverse transform (4-point pass along the rows, then 4-point pass down the columns)
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x4_neon, export=1
++        mov             x3, #16                 // byte stride between coefficient rows (8 x 16-bit)
++        ldr             d0, .Lcoeffs_it4        // 4-point multipliers; used below as h[0] = 10/2, h[1] = 22/2, h[2] = 17
++        mov             x4, x0                  // keep the original sample pointer for the stores at the end
++        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
++        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
++        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
++        ld1             {v4.d}[0], [x2]         // 30 31 32 33
++        ld1             {v5.s}[0], [x0], x1     // sample row 0
++        ld1             {v5.s}[1], [x0], x1     // sample row 1
++        ld1             {v6.s}[0], [x0], x1     // sample row 2
++        trn2            v7.4h, v1.4h, v2.4h     // 01 11 03 13
++        trn1            v1.4h, v1.4h, v2.4h     // 00 10 02 12
++        ld1             {v6.s}[1], [x0]         // sample row 3
++        trn2            v2.4h, v3.4h, v4.4h     // 21 31 23 33
++        trn1            v3.4h, v3.4h, v4.4h     // 20 30 22 32
++        trn2            v4.2s, v7.2s, v2.2s     // 03 13 23 33
++        trn1            v16.2s, v1.2s, v3.2s    // 00 10 20 30
++        trn1            v2.2s, v7.2s, v2.2s     // 01 11 21 31
++        trn2            v1.2s, v1.2s, v3.2s     // 02 12 22 32
++        mul             v3.4h, v4.4h, v0.h[0]   //                                                          10/2 * src[3]
++        mul             v4.4h, v4.4h, v0.h[1]   //                                                          22/2 * src[3]
++        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
++        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[2]
++        mla             v3.4h, v2.4h, v0.h[1]   //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
++        mls             v4.4h, v2.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
++        add             v2.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[2]
++        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[2]
++        neg             v7.4h, v3.4h            // -t3/2
++        neg             v16.4h, v4.4h           // -t4/2
++        ssra            v3.4h, v2.4h, #1        // (t1 + t3) >> 1
++        ssra            v4.4h, v1.4h, #1        // (t2 + t4) >> 1
++        ssra            v16.4h, v1.4h, #1       // (t2 - t4) >> 1
++        ssra            v7.4h, v2.4h, #1        // (t1 - t3) >> 1
++        srshr           v1.4h, v3.4h, #2        // (t1 + t3 + 4) >> 3
++        srshr           v2.4h, v4.4h, #2        // (t2 + t4 + 4) >> 3
++        srshr           v3.4h, v16.4h, #2       // (t2 - t4 + 4) >> 3
++        srshr           v4.4h, v7.4h, #2        // (t1 - t3 + 4) >> 3
++        trn2            v7.4h, v1.4h, v3.4h     // 10 11 30 31
++        trn1            v1.4h, v1.4h, v3.4h     // 00 01 20 21
++        trn2            v3.4h, v2.4h, v4.4h     // 12 13 32 33
++        trn1            v2.4h, v2.4h, v4.4h     // 02 03 22 23
++        trn2            v4.2s, v7.2s, v3.2s     // 30 31 32 33
++        trn1            v16.2s, v1.2s, v2.2s    // 00 01 02 03
++        trn1            v3.2s, v7.2s, v3.2s     // 10 11 12 13
++        trn2            v1.2s, v1.2s, v2.2s     // 20 21 22 23
++        mul             v2.4h, v4.4h, v0.h[1]   //                                                           22/2 * src[24]
++        mul             v4.4h, v4.4h, v0.h[0]   //                                                           10/2 * src[24]
++        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
++        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[16]
++        mls             v2.4h, v3.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
++        mla             v4.4h, v3.4h, v0.h[1]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
++        add             v0.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[16]
++        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[16]
++        neg             v3.4h, v2.4h            // -t4/2
++        neg             v7.4h, v4.4h            // -t3/2
++        ssra            v4.4h, v0.4h, #1        // (t1 + t3) >> 1
++        ssra            v3.4h, v1.4h, #1        // (t2 - t4) >> 1
++        ssra            v2.4h, v1.4h, #1        // (t2 + t4) >> 1
++        ssra            v7.4h, v0.4h, #1        // (t1 - t3) >> 1
++        trn1            v0.2d, v4.2d, v3.2d     // output rows 0 | 1 (two 4-lane results packed per vector)
++        trn1            v1.2d, v2.2d, v7.2d     // output rows 2 | 3
++        srshr           v0.8h, v0.8h, #6        // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
++        srshr           v1.8h, v1.8h, #6        // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
++        uaddw           v0.8h, v0.8h, v5.8b     // rows 0 | 1: add the samples, widened to 16 bits
++        uaddw           v1.8h, v1.8h, v6.8b     // rows 2 | 3
++        sqxtun          v0.8b, v0.8h            // rows 0 | 1: saturate to unsigned 8 bits
++        sqxtun          v1.8b, v1.8h            // rows 2 | 3
++        st1             {v0.s}[0], [x4], x1     // store row 0
++        st1             {v0.s}[1], [x4], x1     // store row 1
++        st1             {v1.s}[0], [x4], x1     // store row 2
++        st1             {v1.s}[1], [x4]         // store row 3
++        ret
++endfunc
++
++// VC-1 8x8 inverse transform, DC case: one value derived from the DC coefficient is added to all 64 samples
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x8_dc_neon, export=1
++        ldrsh           w2, [x2]                // dc, sign-extended to 32 bits
++        mov             x3, x0                  // keep the original sample pointer for the stores at the end
++        ld1             {v0.8b}, [x0], x1       // sample row 0
++        ld1             {v1.8b}, [x0], x1       // sample row 1
++        ld1             {v2.8b}, [x0], x1       // sample row 2
++        add             w2, w2, w2, lsl #1      // 3 * dc
++        ld1             {v3.8b}, [x0], x1       // sample row 3
++        ld1             {v4.8b}, [x0], x1       // sample row 4
++        add             w2, w2, #1              // 3 * dc + 1
++        ld1             {v5.8b}, [x0], x1       // sample row 5
++        asr             w2, w2, #1              // dc' = (3 * dc + 1) >> 1
++        ld1             {v6.8b}, [x0], x1       // sample row 6
++        add             w2, w2, w2, lsl #1      // 3 * dc'
++        ld1             {v7.8b}, [x0]           // sample row 7
++        add             w0, w2, #16             // 3 * dc' + 16 (x0 is reused as scratch once the last row is loaded)
++        asr             w0, w0, #5              // dc'' = (3 * dc' + 16) >> 5
++        dup             v16.8h, w0              // replicate dc'' across all 8 lanes
++        uaddw           v0.8h, v16.8h, v0.8b    // row 0: dc'' + samples, widened to 16 bits
++        uaddw           v1.8h, v16.8h, v1.8b    // row 1
++        uaddw           v2.8h, v16.8h, v2.8b    // row 2
++        uaddw           v3.8h, v16.8h, v3.8b    // row 3
++        uaddw           v4.8h, v16.8h, v4.8b    // row 4
++        uaddw           v5.8h, v16.8h, v5.8b    // row 5
++        sqxtun          v0.8b, v0.8h            // row 0: saturate to unsigned 8 bits
++        uaddw           v6.8h, v16.8h, v6.8b    // row 6
++        sqxtun          v1.8b, v1.8h            // row 1
++        uaddw           v7.8h, v16.8h, v7.8b    // row 7
++        sqxtun          v2.8b, v2.8h            // row 2
++        sqxtun          v3.8b, v3.8h            // row 3
++        sqxtun          v4.8b, v4.8h            // row 4
++        st1             {v0.8b}, [x3], x1       // store row 0
++        sqxtun          v0.8b, v5.8h            // row 5 (v0 is reused now that row 0 has been stored)
++        st1             {v1.8b}, [x3], x1       // store row 1
++        sqxtun          v1.8b, v6.8h            // row 6
++        st1             {v2.8b}, [x3], x1       // store row 2
++        sqxtun          v2.8b, v7.8h            // row 7
++        st1             {v3.8b}, [x3], x1       // store row 3
++        st1             {v4.8b}, [x3], x1       // store row 4
++        st1             {v0.8b}, [x3], x1       // store row 5
++        st1             {v1.8b}, [x3], x1       // store row 6
++        st1             {v2.8b}, [x3]           // store row 7
++        ret
++endfunc
++
++// VC-1 8x4 inverse transform, DC case: one value derived from the DC coefficient is added to all 32 samples
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x4_dc_neon, export=1
++        ldrsh           w2, [x2]                // dc, sign-extended to 32 bits
++        mov             x3, x0                  // keep the original sample pointer for the stores at the end
++        ld1             {v0.8b}, [x0], x1       // sample row 0
++        ld1             {v1.8b}, [x0], x1       // sample row 1
++        ld1             {v2.8b}, [x0], x1       // sample row 2
++        add             w2, w2, w2, lsl #1      // 3 * dc
++        ld1             {v3.8b}, [x0]           // sample row 3
++        add             w0, w2, #1              // 3 * dc + 1 (x0 is reused as scratch once the last row is loaded)
++        asr             w0, w0, #1              // dc' = (3 * dc + 1) >> 1
++        add             w0, w0, w0, lsl #4      // 17 * dc'
++        add             w0, w0, #64             // 17 * dc' + 64
++        asr             w0, w0, #7              // dc'' = (17 * dc' + 64) >> 7
++        dup             v4.8h, w0               // replicate dc'' across all 8 lanes
++        uaddw           v0.8h, v4.8h, v0.8b     // row 0: dc'' + samples, widened to 16 bits
++        uaddw           v1.8h, v4.8h, v1.8b     // row 1
++        uaddw           v2.8h, v4.8h, v2.8b     // row 2
++        uaddw           v3.8h, v4.8h, v3.8b     // row 3
++        sqxtun          v0.8b, v0.8h            // row 0: saturate to unsigned 8 bits
++        sqxtun          v1.8b, v1.8h            // row 1
++        sqxtun          v2.8b, v2.8h            // row 2
++        sqxtun          v3.8b, v3.8h            // row 3
++        st1             {v0.8b}, [x3], x1       // store row 0
++        st1             {v1.8b}, [x3], x1       // store row 1
++        st1             {v2.8b}, [x3], x1       // store row 2
++        st1             {v3.8b}, [x3]           // store row 3
++        ret
++endfunc
++
++// VC-1 4x8 inverse transform, DC case
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x8_dc_neon, export=1
++        ldrsh           w2, [x2]                // dc, sign-extended from 16 bits
++        mov             x3, x0                  // x3 = write pointer (x0 is advanced by the loads)
++        ld1             {v0.s}[0], [x0], x1     // row 0 (4 pels in one 32-bit lane)
++        ld1             {v1.s}[0], [x0], x1     // row 1
++        ld1             {v2.s}[0], [x0], x1     // row 2
++        add             w2, w2, w2, lsl #4      // 17*dc
++        ld1             {v3.s}[0], [x0], x1     // row 3
++        add             w2, w2, #4              // 17*dc+4
++        asr             w2, w2, #3              // (17*dc+4)>>3
++        add             w2, w2, w2, lsl #1      // 3*((17*dc+4)>>3)
++        ld1             {v0.s}[1], [x0], x1     // row 4, packed next to row 0
++        add             w2, w2, #16             // 3*((17*dc+4)>>3)+16
++        asr             w2, w2, #5              // (3*((17*dc+4)>>3)+16)>>5
++        dup             v4.8h, w2               // broadcast scaled dc to 8 halfword lanes
++        ld1             {v1.s}[1], [x0], x1     // row 5, packed next to row 1
++        ld1             {v2.s}[1], [x0], x1     // row 6, packed next to row 2
++        ld1             {v3.s}[1], [x0]         // row 7, packed next to row 3
++        uaddw           v0.8h, v4.8h, v0.8b     // scaled dc + rows 0 and 4 (widened to 16 bits)
++        uaddw           v1.8h, v4.8h, v1.8b     // scaled dc + rows 1 and 5
++        uaddw           v2.8h, v4.8h, v2.8b     // scaled dc + rows 2 and 6
++        uaddw           v3.8h, v4.8h, v3.8b     // scaled dc + rows 3 and 7
++        sqxtun          v0.8b, v0.8h            // narrow rows 0 and 4 with saturation to unsigned 8 bits
++        sqxtun          v1.8b, v1.8h            // narrow rows 1 and 5
++        sqxtun          v2.8b, v2.8h            // narrow rows 2 and 6
++        sqxtun          v3.8b, v3.8h            // narrow rows 3 and 7
++        st1             {v0.s}[0], [x3], x1     // write back row 0
++        st1             {v1.s}[0], [x3], x1     // write back row 1
++        st1             {v2.s}[0], [x3], x1     // write back row 2
++        st1             {v3.s}[0], [x3], x1     // write back row 3
++        st1             {v0.s}[1], [x3], x1     // write back row 4
++        st1             {v1.s}[1], [x3], x1     // write back row 5
++        st1             {v2.s}[1], [x3], x1     // write back row 6
++        st1             {v3.s}[1], [x3]         // write back row 7
++        ret
++endfunc
++
++// VC-1 4x4 inverse transform, DC case
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x4_dc_neon, export=1
++        ldrsh           w2, [x2]                // dc, sign-extended from 16 bits
++        mov             x3, x0                  // x3 = write pointer (x0 is advanced by the loads)
++        ld1             {v0.s}[0], [x0], x1     // row 0 (4 pels in one 32-bit lane)
++        ld1             {v1.s}[0], [x0], x1     // row 1
++        ld1             {v0.s}[1], [x0], x1     // row 2, packed next to row 0
++        add             w2, w2, w2, lsl #4      // 17*dc
++        ld1             {v1.s}[1], [x0]         // row 3, packed next to row 1
++        add             w0, w2, #4              // 17*dc+4 (x0 is free for reuse now that all rows are loaded)
++        asr             w0, w0, #3              // (17*dc+4)>>3
++        add             w0, w0, w0, lsl #4      // 17*((17*dc+4)>>3)
++        add             w0, w0, #64             // 17*((17*dc+4)>>3)+64
++        asr             w0, w0, #7              // (17*((17*dc+4)>>3)+64)>>7
++        dup             v2.8h, w0               // broadcast scaled dc to 8 halfword lanes
++        uaddw           v0.8h, v2.8h, v0.8b     // scaled dc + rows 0 and 2 (widened to 16 bits)
++        uaddw           v1.8h, v2.8h, v1.8b     // scaled dc + rows 1 and 3
++        sqxtun          v0.8b, v0.8h            // narrow rows 0 and 2 with saturation to unsigned 8 bits
++        sqxtun          v1.8b, v1.8h            // narrow rows 1 and 3
++        st1             {v0.s}[0], [x3], x1     // write back row 0
++        st1             {v1.s}[0], [x3], x1     // write back row 1
++        st1             {v0.s}[1], [x3], x1     // write back row 2
++        st1             {v1.s}[1], [x3]         // write back row 3
++        ret
++endfunc
++
++.align  5
++.Lcoeffs_it8:
++.quad   0x000F00090003                          // packed 16-bit fields, least significant first: 3, 9, 15, 0
++.Lcoeffs_it4:
++.quad   0x0011000B0005                          // packed 16-bit fields, least significant first: 5, 11, 17, 0
++.Lcoeffs:
++.quad   0x00050002                              // packed 16-bit fields, least significant first: 2, 5, 0, 0 (loaded into d0 by the loop filters: h[0] = 2, h[1] = 5)
++
++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of lower block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter4_neon, export=1
++        sub             x3, x0, w1, sxtw #2     // x3 = x0 - 4*sxtw(w1) -> P1 row; NOTE(review): the loads below post-index by the full x1, confirm the stride arrives sign-extended to 64 bits
++        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
++        ld1             {v1.s}[0], [x0], x1     // P5
++        ld1             {v2.s}[0], [x3], x1     // P1
++        ld1             {v3.s}[0], [x3], x1     // P2
++        ld1             {v4.s}[0], [x0], x1     // P6
++        ld1             {v5.s}[0], [x3], x1     // P3
++        ld1             {v6.s}[0], [x0], x1     // P7
++        ld1             {v7.s}[0], [x3]         // P4
++        ld1             {v16.s}[0], [x0]        // P8
++        ushll           v17.8h, v1.8b, #1       // 2*P5
++        dup             v18.8h, w2              // pq
++        ushll           v2.8h, v2.8b, #1        // 2*P1
++        uxtl            v3.8h, v3.8b            // P2
++        uxtl            v4.8h, v4.8b            // P6
++        uxtl            v19.8h, v5.8b           // P3
++        mls             v2.4h, v3.4h, v0.h[1]   // 2*P1-5*P2
++        uxtl            v3.8h, v6.8b            // P7
++        mls             v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
++        ushll           v5.8h, v5.8b, #1        // 2*P3
++        uxtl            v6.8h, v7.8b            // P4
++        mla             v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
++        uxtl            v3.8h, v16.8b           // P8
++        mla             v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
++        uxtl            v1.8h, v1.8b            // P5
++        mls             v5.4h, v6.4h, v0.h[1]   // 2*P3-5*P4
++        mls             v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
++        sub             v3.4h, v6.4h, v1.4h     // P4-P5
++        mls             v2.4h, v6.4h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
++        mla             v5.4h, v1.4h, v0.h[1]   // 2*P3-5*P4+5*P5
++        mls             v5.4h, v4.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
++        abs             v4.4h, v3.4h            // |P4-P5|
++        srshr           v7.4h, v17.4h, #3       // (2*P5-5*P6+5*P7-2*P8+4)>>3
++        srshr           v2.4h, v2.4h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
++        sshr            v4.4h, v4.4h, #1        // clip
++        srshr           v5.4h, v5.4h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
++        abs             v7.4h, v7.4h            // a2
++        sshr            v3.4h, v3.4h, #8        // clip_sign
++        abs             v2.4h, v2.4h            // a1
++        cmeq            v16.4h, v4.4h, #0       // test clip == 0
++        abs             v17.4h, v5.4h           // a0
++        sshr            v5.4h, v5.4h, #8        // a0_sign
++        cmhs            v19.4h, v2.4h, v7.4h    // test a1 >= a2
++        cmhs            v18.4h, v17.4h, v18.4h  // test a0 >= pq
++        sub             v3.4h, v3.4h, v5.4h     // clip_sign - a0_sign
++        bsl             v19.8b, v7.8b, v2.8b    // a3
++        orr             v2.8b, v16.8b, v18.8b   // test clip == 0 || a0 >= pq
++        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
++        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
++        orr             v5.8b, v2.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
++        mov             w0, v5.s[1]             // move to gp reg
++        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        cmhs            v5.4h, v0.4h, v4.4h     // test d >= clip
++        tbnz            w0, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
++        bsl             v5.8b, v4.8b, v0.8b     // FFMIN(d, clip)
++        bic             v0.8b, v5.8b, v2.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        mls             v6.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        mla             v1.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        sqxtun          v0.8b, v6.8h            // narrow updated P4 with saturation to unsigned 8 bits
++        sqxtun          v1.8b, v1.8h            // narrow updated P5 with saturation to unsigned 8 bits
++        st1             {v0.s}[0], [x3], x1     // write P4 row (x3 was left pointing at it by the last load)
++        st1             {v1.s}[0], [x3]         // write P5 row
++1:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of right block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter4_neon, export=1
++        sub             x3, x0, #4              // where to start reading
++        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
++        ld1             {v1.8b}, [x3], x1       // row 0: P1[0], P2[0]...
++        sub             x0, x0, #1              // where to start writing
++        ld1             {v2.8b}, [x3], x1       // row 1: P1[1], P2[1]...
++        ld1             {v3.8b}, [x3], x1       // row 2: P1[2], P2[2]...
++        ld1             {v4.8b}, [x3]           // row 3: P1[3], P2[3]...
++        dup             v5.8h, w2               // pq
++        trn1            v6.8b, v1.8b, v2.8b     // P1[0], P1[1], P3[0]...
++        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
++        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
++        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
++        trn1            v4.4h, v6.4h, v2.4h     // P1, P5
++        trn1            v7.4h, v1.4h, v3.4h     // P2, P6
++        trn2            v2.4h, v6.4h, v2.4h     // P3, P7
++        trn2            v1.4h, v1.4h, v3.4h     // P4, P8
++        ushll           v3.8h, v4.8b, #1        // 2*P1, 2*P5
++        uxtl            v6.8h, v7.8b            // P2, P6
++        uxtl            v7.8h, v2.8b            // P3, P7
++        uxtl            v1.8h, v1.8b            // P4, P8
++        mls             v3.8h, v6.8h, v0.h[1]   // 2*P1-5*P2, 2*P5-5*P6
++        ushll           v2.8h, v2.8b, #1        // 2*P3, 2*P7
++        uxtl            v4.8h, v4.8b            // P1, P5
++        mla             v3.8h, v7.8h, v0.h[1]   // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
++        mov             d6, v6.d[1]             // P6
++        mls             v3.8h, v1.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
++        mov             d4, v4.d[1]             // P5
++        mls             v2.4h, v1.4h, v0.h[1]   // 2*P3-5*P4
++        mla             v2.4h, v4.4h, v0.h[1]   // 2*P3-5*P4+5*P5
++        sub             v7.4h, v1.4h, v4.4h     // P4-P5
++        mls             v2.4h, v6.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
++        srshr           v3.8h, v3.8h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3, (2*P5-5*P6+5*P7-2*P8+4)>>3
++        abs             v6.4h, v7.4h            // |P4-P5|
++        sshr            v7.4h, v7.4h, #8        // clip_sign
++        srshr           v2.4h, v2.4h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
++        abs             v3.8h, v3.8h            // a1, a2
++        sshr            v6.4h, v6.4h, #1        // clip
++        mov             d16, v3.d[1]            // a2
++        abs             v17.4h, v2.4h           // a0
++        cmeq            v18.4h, v6.4h, #0       // test clip == 0
++        sshr            v2.4h, v2.4h, #8        // a0_sign
++        cmhs            v19.4h, v3.4h, v16.4h   // test a1 >= a2
++        cmhs            v5.4h, v17.4h, v5.4h    // test a0 >= pq
++        sub             v2.4h, v7.4h, v2.4h     // clip_sign - a0_sign
++        bsl             v19.8b, v16.8b, v3.8b   // a3
++        orr             v3.8b, v18.8b, v5.8b    // test clip == 0 || a0 >= pq
++        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
++        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
++        orr             v5.8b, v3.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
++        mov             w2, v5.s[1]             // move to gp reg
++        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        cmhs            v5.4h, v0.4h, v6.4h     // test d >= clip
++        tbnz            w2, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
++        bsl             v5.8b, v6.8b, v0.8b     // FFMIN(d, clip)
++        bic             v0.8b, v5.8b, v3.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        mla             v4.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        mls             v1.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        sqxtun          v3.8b, v4.8h            // narrow updated P5 with saturation to unsigned 8 bits
++        sqxtun          v2.8b, v1.8h            // narrow updated P4 with saturation to unsigned 8 bits
++        st2             {v2.b, v3.b}[0], [x0], x1 // write P4[0], P5[0]
++        st2             {v2.b, v3.b}[1], [x0], x1 // write P4[1], P5[1]
++        st2             {v2.b, v3.b}[2], [x0], x1 // write P4[2], P5[2]
++        st2             {v2.b, v3.b}[3], [x0]   // write P4[3], P5[3]
++1:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of lower block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter8_neon, export=1
++        sub             x3, x0, w1, sxtw #2     // x3 = x0 - 4*sxtw(w1) -> P1 row; NOTE(review): the loads below post-index by the full x1, confirm the stride arrives sign-extended to 64 bits
++        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
++        ld1             {v1.8b}, [x0], x1       // P5
++        movi            v2.2d, #0x0000ffff00000000 // mask selecting halfword 2 of each group of 4 halfwords
++        ld1             {v3.8b}, [x3], x1       // P1
++        ld1             {v4.8b}, [x3], x1       // P2
++        ld1             {v5.8b}, [x0], x1       // P6
++        ld1             {v6.8b}, [x3], x1       // P3
++        ld1             {v7.8b}, [x0], x1       // P7
++        ushll           v16.8h, v1.8b, #1       // 2*P5
++        ushll           v3.8h, v3.8b, #1        // 2*P1
++        ld1             {v17.8b}, [x3]          // P4
++        uxtl            v4.8h, v4.8b            // P2
++        ld1             {v18.8b}, [x0]          // P8
++        uxtl            v5.8h, v5.8b            // P6
++        dup             v19.8h, w2              // pq
++        uxtl            v20.8h, v6.8b           // P3
++        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1-5*P2
++        uxtl            v4.8h, v7.8b            // P7
++        ushll           v6.8h, v6.8b, #1        // 2*P3
++        mls             v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
++        uxtl            v7.8h, v17.8b           // P4
++        uxtl            v17.8h, v18.8b          // P8
++        mla             v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
++        uxtl            v1.8h, v1.8b            // P5
++        mla             v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
++        sub             v4.8h, v7.8h, v1.8h     // P4-P5
++        mls             v6.8h, v7.8h, v0.h[1]   // 2*P3-5*P4
++        mls             v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
++        abs             v17.8h, v4.8h           // |P4-P5|
++        sshr            v4.8h, v4.8h, #8        // clip_sign
++        mls             v3.8h, v7.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
++        sshr            v17.8h, v17.8h, #1      // clip
++        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3-5*P4+5*P5
++        srshr           v16.8h, v16.8h, #3      // (2*P5-5*P6+5*P7-2*P8+4)>>3
++        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
++        cmeq            v5.8h, v17.8h, #0       // test clip == 0
++        srshr           v3.8h, v3.8h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
++        abs             v16.8h, v16.8h          // a2
++        abs             v3.8h, v3.8h            // a1
++        srshr           v6.8h, v6.8h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
++        cmhs            v18.8h, v3.8h, v16.8h   // test a1 >= a2
++        abs             v20.8h, v6.8h           // a0
++        sshr            v6.8h, v6.8h, #8        // a0_sign
++        bsl             v18.16b, v16.16b, v3.16b // a3
++        cmhs            v3.8h, v20.8h, v19.8h   // test a0 >= pq
++        sub             v4.8h, v4.8h, v6.8h     // clip_sign - a0_sign
++        uqsub           v6.8h, v20.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v16.8h, v18.8h, v20.8h  // test a3 >= a0
++        orr             v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
++        mul             v0.8h, v6.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
++        orr             v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
++        cmtst           v2.2d, v5.2d, v2.2d     // if pixel pair 2 (zero-based) of a group of 4 is not filtered, then none of the others in that group should be either
++        mov             w0, v5.s[1]             // move to gp reg
++        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        mov             w2, v5.s[3]             // likewise for the second group of 4
++        orr             v2.16b, v3.16b, v2.16b  // test clip == 0 || a0 >= pq || whole group of 4 not filtered
++        cmhs            v3.8h, v0.8h, v17.8h    // test d >= clip
++        and             w0, w0, w2              // bit 0 set only if neither group of 4 is filtered
++        bsl             v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
++        tbnz            w0, #0, 1f              // none of the 8 pixel pairs should be updated in this case
++        bic             v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
++        mls             v7.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        mla             v1.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        sqxtun          v0.8b, v7.8h            // narrow updated P4 with saturation to unsigned 8 bits
++        sqxtun          v1.8b, v1.8h            // narrow updated P5 with saturation to unsigned 8 bits
++        st1             {v0.8b}, [x3], x1       // write P4 row (x3 was left pointing at it by the last load)
++        st1             {v1.8b}, [x3]           // write P5 row
++1:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of right block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter8_neon, export=1
++        sub             x3, x0, #4              // where to start reading
++        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
++        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
++        sub             x0, x0, #1              // where to start writing
++        ld1             {v2.8b}, [x3], x1       // P1[1], P2[1]...
++        add             x4, x0, x1, lsl #2      // where to start writing rows 4..7
++        ld1             {v3.8b}, [x3], x1       // P1[2], P2[2]...
++        ld1             {v4.8b}, [x3], x1       // P1[3], P2[3]...
++        ld1             {v5.8b}, [x3], x1       // P1[4], P2[4]...
++        ld1             {v6.8b}, [x3], x1       // P1[5], P2[5]...
++        ld1             {v7.8b}, [x3], x1       // P1[6], P2[6]...
++        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
++        ld1             {v17.8b}, [x3]          // P1[7], P2[7]...
++        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
++        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
++        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
++        dup             v4.8h, w2               // pq
++        trn1            v18.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
++        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
++        trn1            v6.4h, v16.4h, v2.4h    // P1[0], P1[1], P1[2], P1[3], P5[0]...
++        trn1            v19.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
++        trn1            v20.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
++        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
++        trn2            v2.4h, v16.4h, v2.4h    // P3[0], P3[1], P3[2], P3[3], P7[0]...
++        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
++        trn1            v3.4h, v18.4h, v20.4h   // P1[4], P1[5], P1[6], P1[7], P5[4]...
++        trn1            v16.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
++        trn2            v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
++        trn2            v5.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
++        trn1            v7.2s, v6.2s, v3.2s     // P1
++        trn1            v18.2s, v19.2s, v16.2s  // P2
++        trn2            v3.2s, v6.2s, v3.2s     // P5
++        trn2            v6.2s, v19.2s, v16.2s   // P6
++        trn1            v16.2s, v2.2s, v17.2s   // P3
++        trn2            v2.2s, v2.2s, v17.2s    // P7
++        ushll           v7.8h, v7.8b, #1        // 2*P1
++        trn1            v17.2s, v1.2s, v5.2s    // P4
++        ushll           v19.8h, v3.8b, #1       // 2*P5
++        trn2            v1.2s, v1.2s, v5.2s     // P8
++        uxtl            v5.8h, v18.8b           // P2
++        uxtl            v6.8h, v6.8b            // P6
++        uxtl            v18.8h, v16.8b          // P3
++        mls             v7.8h, v5.8h, v0.h[1]   // 2*P1-5*P2
++        uxtl            v2.8h, v2.8b            // P7
++        ushll           v5.8h, v16.8b, #1       // 2*P3
++        mls             v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
++        uxtl            v16.8h, v17.8b          // P4
++        uxtl            v1.8h, v1.8b            // P8
++        mla             v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
++        uxtl            v2.8h, v3.8b            // P5
++        mla             v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
++        sub             v3.8h, v16.8h, v2.8h    // P4-P5
++        mls             v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
++        mls             v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
++        abs             v1.8h, v3.8h            // |P4-P5|
++        sshr            v3.8h, v3.8h, #8        // clip_sign
++        mls             v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
++        sshr            v1.8h, v1.8h, #1        // clip
++        mla             v5.8h, v2.8h, v0.h[1]   // 2*P3-5*P4+5*P5
++        srshr           v17.8h, v19.8h, #3      // (2*P5-5*P6+5*P7-2*P8+4)>>3
++        mls             v5.8h, v6.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
++        cmeq            v6.8h, v1.8h, #0        // test clip == 0
++        srshr           v7.8h, v7.8h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
++        abs             v17.8h, v17.8h          // a2
++        abs             v7.8h, v7.8h            // a1
++        srshr           v5.8h, v5.8h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
++        cmhs            v18.8h, v7.8h, v17.8h   // test a1 >= a2
++        abs             v19.8h, v5.8h           // a0
++        sshr            v5.8h, v5.8h, #8        // a0_sign
++        bsl             v18.16b, v17.16b, v7.16b // a3
++        cmhs            v4.8h, v19.8h, v4.8h    // test a0 >= pq
++        sub             v3.8h, v3.8h, v5.8h     // clip_sign - a0_sign
++        uqsub           v5.8h, v19.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v7.8h, v18.8h, v19.8h   // test a3 >= a0
++        orr             v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
++        mul             v0.8h, v5.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
++        orr             v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
++        mov             w2, v5.s[1]             // move to gp reg
++        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        mov             w3, v5.s[3]             // likewise for the second group of 4
++        cmhs            v5.8h, v0.8h, v1.8h     // test d >= clip
++        and             w5, w2, w3              // bit 0 set only if neither group of 4 is filtered
++        bsl             v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
++        tbnz            w5, #0, 2f              // none of the 8 pixel pairs should be updated in this case
++        bic             v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        mla             v2.8h, v0.8h, v3.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        mls             v16.8h, v0.8h, v3.8h    // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        sqxtun          v1.8b, v2.8h            // narrow updated P5 with saturation to unsigned 8 bits
++        sqxtun          v0.8b, v16.8h           // narrow updated P4 with saturation to unsigned 8 bits
++        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
++        st2             {v0.b, v1.b}[0], [x0], x1 // write P4[0], P5[0]
++        st2             {v0.b, v1.b}[1], [x0], x1 // write P4[1], P5[1]
++        st2             {v0.b, v1.b}[2], [x0], x1 // write P4[2], P5[2]
++        st2             {v0.b, v1.b}[3], [x0]   // write P4[3], P5[3]
++1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
++        st2             {v0.b, v1.b}[4], [x4], x1 // write P4[4], P5[4]
++        st2             {v0.b, v1.b}[5], [x4], x1 // write P4[5], P5[5]
++        st2             {v0.b, v1.b}[6], [x4], x1 // write P4[6], P5[6]
++        st2             {v0.b, v1.b}[7], [x4]   // write P4[7], P5[7]
++2:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of lower block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter16_neon, export=1
++        sub             x3, x0, w1, sxtw #2
++        ldr             d0, .Lcoeffs
++        ld1             {v1.16b}, [x0], x1      // P5
++        movi            v2.2d, #0x0000ffff00000000
++        ld1             {v3.16b}, [x3], x1      // P1
++        ld1             {v4.16b}, [x3], x1      // P2
++        ld1             {v5.16b}, [x0], x1      // P6
++        ld1             {v6.16b}, [x3], x1      // P3
++        ld1             {v7.16b}, [x0], x1      // P7
++        ushll           v16.8h, v1.8b, #1       // 2*P5[0..7]
++        ushll           v17.8h, v3.8b, #1       // 2*P1[0..7]
++        ld1             {v18.16b}, [x3]         // P4
++        uxtl            v19.8h, v4.8b           // P2[0..7]
++        ld1             {v20.16b}, [x0]         // P8
++        uxtl            v21.8h, v5.8b           // P6[0..7]
++        dup             v22.8h, w2              // pq
++        ushll2          v3.8h, v3.16b, #1       // 2*P1[8..15]
++        mls             v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
++        ushll2          v19.8h, v1.16b, #1      // 2*P5[8..15]
++        uxtl2           v4.8h, v4.16b           // P2[8..15]
++        mls             v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
++        uxtl2           v5.8h, v5.16b           // P6[8..15]
++        uxtl            v23.8h, v6.8b           // P3[0..7]
++        uxtl            v24.8h, v7.8b           // P7[0..7]
++        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1[8..15]-5*P2[8..15]
++        ushll           v4.8h, v6.8b, #1        // 2*P3[0..7]
++        uxtl            v25.8h, v18.8b          // P4[0..7]
++        mls             v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
++        uxtl2           v26.8h, v6.16b          // P3[8..15]
++        mla             v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++        uxtl2           v7.8h, v7.16b           // P7[8..15]
++        ushll2          v6.8h, v6.16b, #1       // 2*P3[8..15]
++        mla             v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++        uxtl2           v18.8h, v18.16b         // P4[8..15]
++        uxtl            v23.8h, v20.8b          // P8[0..7]
++        mls             v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
++        uxtl            v24.8h, v1.8b           // P5[0..7]
++        uxtl2           v20.8h, v20.16b         // P8[8..15]
++        mla             v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++        uxtl2           v1.8h, v1.16b           // P5[8..15]
++        sub             v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
++        mla             v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++        sub             v7.8h, v18.8h, v1.8h    // P4[8..15]-P5[8..15]
++        mls             v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
++        abs             v27.8h, v26.8h
++        sshr            v26.8h, v26.8h, #8      // clip_sign[0..7]
++        mls             v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++        abs             v28.8h, v7.8h
++        sshr            v27.8h, v27.8h, #1      // clip[0..7]
++        mls             v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++        sshr            v7.8h, v7.8h, #8        // clip_sign[8..15]
++        sshr            v23.8h, v28.8h, #1      // clip[8..15]
++        mla             v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++        cmeq            v28.8h, v27.8h, #0      // test clip[0..7] == 0
++        srshr           v17.8h, v17.8h, #3
++        mls             v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++        cmeq            v29.8h, v23.8h, #0      // test clip[8..15] == 0
++        srshr           v16.8h, v16.8h, #3
++        mls             v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++        abs             v17.8h, v17.8h          // a1[0..7]
++        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++        srshr           v3.8h, v3.8h, #3
++        mls             v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++        abs             v16.8h, v16.8h          // a2[0..7]
++        srshr           v19.8h, v19.8h, #3
++        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++        cmhs            v5.8h, v17.8h, v16.8h   // test a1[0..7] >= a2[0..7]
++        abs             v3.8h, v3.8h            // a1[8..15]
++        srshr           v4.8h, v4.8h, #3
++        abs             v19.8h, v19.8h          // a2[8..15]
++        bsl             v5.16b, v16.16b, v17.16b // a3[0..7]
++        srshr           v6.8h, v6.8h, #3
++        cmhs            v16.8h, v3.8h, v19.8h   // test a1[8..15] >= a2[8.15]
++        abs             v17.8h, v4.8h           // a0[0..7]
++        sshr            v4.8h, v4.8h, #8        // a0_sign[0..7]
++        bsl             v16.16b, v19.16b, v3.16b // a3[8..15]
++        uqsub           v3.8h, v17.8h, v5.8h    // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        abs             v19.8h, v6.8h           // a0[8..15]
++        cmhs            v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
++        cmhs            v5.8h, v5.8h, v17.8h    // test a3[0..7] >= a0[0..7]
++        sub             v4.8h, v26.8h, v4.8h    // clip_sign[0..7] - a0_sign[0..7]
++        sshr            v6.8h, v6.8h, #8        // a0_sign[8..15]
++        mul             v3.8h, v3.8h, v0.h[1]   // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++        uqsub           v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        orr             v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
++        cmhs            v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
++        cmhs            v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
++        mul             v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++        sub             v6.8h, v7.8h, v6.8h     // clip_sign[8..15] - a0_sign[8..15]
++        orr             v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++        ushr            v3.8h, v3.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++        orr             v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
++        cmtst           v17.2d, v5.2d, v2.2d    // if 2nd of each group of is not filtered, then none of the others in the group should be either
++        mov             w0, v5.s[1]             // move to gp reg
++        cmhs            v19.8h, v3.8h, v27.8h
++        ushr            v0.8h, v0.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++        mov             w2, v5.s[3]
++        orr             v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++        orr             v16.16b, v20.16b, v17.16b
++        bsl             v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
++        cmtst           v2.2d, v5.2d, v2.2d
++        cmhs            v3.8h, v0.8h, v23.8h
++        mov             w4, v5.s[1]
++        mov             w5, v5.s[3]
++        and             w0, w0, w2
++        bic             v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++        orr             v2.16b, v7.16b, v2.16b
++        bsl             v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
++        mls             v25.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
++        and             w2, w4, w5
++        bic             v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++        mla             v24.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
++        and             w0, w0, w2
++        mls             v18.8h, v0.8h, v6.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
++        sqxtun          v2.8b, v25.8h
++        tbnz            w0, #0, 1f              // none of the 16 pixel pairs should be updated in this case
++        mla             v1.8h, v0.8h, v6.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
++        sqxtun          v0.8b, v24.8h
++        sqxtun2         v2.16b, v18.8h
++        sqxtun2         v0.16b, v1.8h
++        st1             {v2.16b}, [x3], x1
++        st1             {v0.16b}, [x3]
++1:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of right block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter16_neon, export=1
++        sub             x3, x0, #4              // where to start reading
++        ldr             d0, .Lcoeffs            // multiplier constants: v0.h[0] is used as 2 and v0.h[1] as 5 below
++        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
++        sub             x0, x0, #1              // where to start writing
++        ld1             {v2.8b}, [x3], x1
++        add             x4, x0, x1, lsl #3      // write pointer for rows 8..11
++        ld1             {v3.8b}, [x3], x1
++        add             x5, x0, x1, lsl #2      // write pointer for rows 4..7
++        ld1             {v4.8b}, [x3], x1
++        add             x6, x4, x1, lsl #2      // write pointer for rows 12..15
++        ld1             {v5.8b}, [x3], x1
++        ld1             {v6.8b}, [x3], x1
++        ld1             {v7.8b}, [x3], x1
++        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
++        ld1             {v17.8b}, [x3], x1
++        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
++        ld1             {v2.8b}, [x3], x1
++        trn1            v18.8b, v3.8b, v4.8b    // P1[2], P1[3], P3[2]...
++        ld1             {v19.8b}, [x3], x1
++        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
++        ld1             {v4.8b}, [x3], x1
++        trn1            v20.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
++        ld1             {v21.8b}, [x3], x1
++        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
++        ld1             {v6.8b}, [x3], x1
++        trn1            v22.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
++        ld1             {v23.8b}, [x3], x1
++        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
++        ld1             {v17.8b}, [x3], x1
++        trn1            v24.8b, v2.8b, v19.8b   // P1[8], P1[9], P3[8]...
++        ld1             {v25.8b}, [x3]
++        trn2            v2.8b, v2.8b, v19.8b    // P2[8], P2[9], P4[8]...
++        trn1            v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
++        trn1            v26.8b, v4.8b, v21.8b   // P1[10], P1[11], P3[10]...
++        trn2            v4.8b, v4.8b, v21.8b    // P2[10], P2[11], P4[10]...
++        trn1            v21.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
++        trn1            v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
++        trn1            v28.8b, v6.8b, v23.8b   // P1[12], P1[13], P3[12]...
++        trn2            v6.8b, v6.8b, v23.8b    // P2[12], P2[13], P4[12]...
++        trn1            v23.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
++        trn1            v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
++        trn1            v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
++        trn2            v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
++        trn1            v25.4h, v2.4h, v4.4h    // P2[8], P2[9], P2[10], P2[11], P6[8]...
++        trn1            v31.2s, v19.2s, v27.2s  // P1[0..7]
++        trn2            v19.2s, v19.2s, v27.2s  // P5[0..7]
++        trn1            v27.2s, v21.2s, v23.2s  // P2[0..7]
++        trn2            v21.2s, v21.2s, v23.2s  // P6[0..7]
++        trn1            v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
++        trn2            v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
++        trn1            v18.4h, v6.4h, v17.4h   // P2[12], P2[13], P2[14], P2[15], P6[12]...
++        trn2            v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
++        trn2            v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
++        trn1            v24.2s, v29.2s, v23.2s  // P1[8..15]
++        trn2            v23.2s, v29.2s, v23.2s  // P5[8..15]
++        trn1            v26.2s, v25.2s, v18.2s  // P2[8..15]
++        trn2            v18.2s, v25.2s, v18.2s  // P6[8..15]
++        trn2            v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
++        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
++        trn2            v3.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
++        trn2            v2.4h, v2.4h, v4.4h     // P4[8], P4[9], P4[10], P4[11], P8[8]...
++        trn2            v4.4h, v6.4h, v17.4h    // P4[12], P4[13], P4[14], P4[15], P8[12]...
++        ushll           v5.8h, v31.8b, #1       // 2*P1[0..7]
++        ushll           v6.8h, v19.8b, #1       // 2*P5[0..7]
++        trn1            v7.2s, v16.2s, v20.2s   // P3[0..7]
++        uxtl            v17.8h, v27.8b          // P2[0..7]
++        trn2            v16.2s, v16.2s, v20.2s  // P7[0..7]
++        uxtl            v20.8h, v21.8b          // P6[0..7]
++        trn1            v21.2s, v22.2s, v25.2s  // P3[8..15]
++        ushll           v24.8h, v24.8b, #1      // 2*P1[8..15]
++        trn2            v22.2s, v22.2s, v25.2s  // P7[8..15]
++        ushll           v25.8h, v23.8b, #1      // 2*P5[8..15]
++        trn1            v27.2s, v1.2s, v3.2s    // P4[0..7]
++        uxtl            v26.8h, v26.8b          // P2[8..15]
++        mls             v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
++        uxtl            v17.8h, v18.8b          // P6[8..15]
++        mls             v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
++        trn1            v18.2s, v2.2s, v4.2s    // P4[8..15]
++        uxtl            v28.8h, v7.8b           // P3[0..7]
++        mls             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
++        uxtl            v16.8h, v16.8b          // P7[0..7]
++        uxtl            v26.8h, v21.8b          // P3[8..15]
++        mls             v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
++        uxtl            v22.8h, v22.8b          // P7[8..15]
++        ushll           v7.8h, v7.8b, #1        // 2*P3[0..7]
++        uxtl            v27.8h, v27.8b          // P4[0..7]
++        trn2            v1.2s, v1.2s, v3.2s     // P8[0..7]
++        ushll           v3.8h, v21.8b, #1       // 2*P3[8..15]
++        trn2            v2.2s, v2.2s, v4.2s     // P8[8..15]
++        uxtl            v4.8h, v18.8b           // P4[8..15]
++        mla             v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++        uxtl            v1.8h, v1.8b            // P8[0..7]
++        mla             v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++        uxtl            v2.8h, v2.8b            // P8[8..15]
++        uxtl            v16.8h, v19.8b          // P5[0..7]
++        mla             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++        uxtl            v18.8h, v23.8b          // P5[8..15]
++        dup             v19.8h, w2              // pq
++        mla             v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++        sub             v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
++        sub             v22.8h, v4.8h, v18.8h   // P4[8..15]-P5[8..15]
++        mls             v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
++        abs             v23.8h, v21.8h          // abs(P4[0..7]-P5[0..7])
++        mls             v3.8h, v4.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]
++        abs             v26.8h, v22.8h          // abs(P4[8..15]-P5[8..15])
++        sshr            v21.8h, v21.8h, #8      // clip_sign[0..7]
++        mls             v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++        sshr            v23.8h, v23.8h, #1      // clip[0..7]
++        sshr            v26.8h, v26.8h, #1      // clip[8..15]
++        mls             v6.8h, v1.8h, v0.h[0]   // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++        sshr            v1.8h, v22.8h, #8       // clip_sign[8..15]
++        cmeq            v22.8h, v23.8h, #0      // test clip[0..7] == 0
++        mls             v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++        cmeq            v28.8h, v26.8h, #0      // test clip[8..15] == 0
++        srshr           v5.8h, v5.8h, #3        // rounding >>3 of the P1..P4[0..7] sum (abs below gives a1[0..7])
++        mls             v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++        srshr           v2.8h, v6.8h, #3        // rounding >>3 of the P5..P8[0..7] sum (abs below gives a2[0..7])
++        mla             v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++        srshr           v6.8h, v24.8h, #3       // rounding >>3 of the P1..P4[8..15] sum (abs below gives a1[8..15])
++        mla             v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++        abs             v5.8h, v5.8h            // a1[0..7]
++        srshr           v24.8h, v25.8h, #3      // rounding >>3 of the P5..P8[8..15] sum (abs below gives a2[8..15])
++        mls             v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++        abs             v2.8h, v2.8h            // a2[0..7]
++        abs             v6.8h, v6.8h            // a1[8..15]
++        mls             v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++        abs             v17.8h, v24.8h          // a2[8..15]
++        cmhs            v20.8h, v5.8h, v2.8h    // test a1[0..7] >= a2[0..7]
++        srshr           v3.8h, v3.8h, #3        // rounding >>3 of the P3..P6[8..15] sum (abs below gives a0[8..15])
++        cmhs            v24.8h, v6.8h, v17.8h   // test a1[8..15] >= a2[8..15]
++        srshr           v7.8h, v7.8h, #3        // rounding >>3 of the P3..P6[0..7] sum (abs below gives a0[0..7])
++        bsl             v20.16b, v2.16b, v5.16b // a3[0..7]
++        abs             v2.8h, v3.8h            // a0[8..15]
++        sshr            v3.8h, v3.8h, #8        // a0_sign[8..15]
++        bsl             v24.16b, v17.16b, v6.16b // a3[8..15]
++        abs             v5.8h, v7.8h            // a0[0..7]
++        sshr            v6.8h, v7.8h, #8        // a0_sign[0..7]
++        cmhs            v7.8h, v2.8h, v19.8h    // test a0[8..15] >= pq
++        sub             v1.8h, v1.8h, v3.8h     // clip_sign[8..15] - a0_sign[8..15]
++        uqsub           v3.8h, v2.8h, v24.8h    // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v2.8h, v24.8h, v2.8h    // test a3[8..15] >= a0[8..15]
++        uqsub           v17.8h, v5.8h, v20.8h   // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v19.8h, v5.8h, v19.8h   // test a0[0..7] >= pq
++        orr             v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
++        sub             v6.8h, v21.8h, v6.8h    // clip_sign[0..7] - a0_sign[0..7]
++        mul             v3.8h, v3.8h, v0.h[1]   // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++        cmhs            v5.8h, v20.8h, v5.8h    // test a3[0..7] >= a0[0..7]
++        orr             v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
++        mul             v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++        orr             v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++        orr             v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++        ushr            v3.8h, v3.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++        mov             w7, v2.s[1]             // skip flags for rows 10 and 11 (bit 0 belongs to row 10)
++        mov             w8, v2.s[3]             // skip flags for rows 14 and 15 (bit 0 belongs to row 14)
++        ushr            v0.8h, v0.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++        mov             w2, v5.s[1]             // move to gp reg
++        cmhs            v2.8h, v3.8h, v26.8h    // test d[8..15] >= clip[8..15]
++        mov             w3, v5.s[3]             // skip flags for rows 6 and 7 (bit 0 belongs to row 6)
++        cmhs            v5.8h, v0.8h, v23.8h    // test d[0..7] >= clip[0..7]
++        bsl             v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
++        and             w9, w7, w8              // bit 0 set only if rows 10 and 14 are both unfiltered
++        bsl             v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
++        and             w10, w2, w3             // bit 0 set only if rows 2 and 6 are both unfiltered
++        bic             v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++        and             w9, w10, w9             // bit 0 set only if the third row of every group of 4 is unfiltered
++        bic             v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++        mls             v4.8h, v0.8h, v1.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
++        tbnz            w9, #0, 4f              // none of the 16 pixel pairs should be updated in this case
++        mls             v27.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
++        mla             v16.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
++        sqxtun          v2.8b, v4.8h            // saturate updated P4[8..15] to unsigned bytes
++        mla             v18.8h, v0.8h, v1.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
++        sqxtun          v0.8b, v27.8h           // saturate updated P4[0..7] to unsigned bytes
++        sqxtun          v1.8b, v16.8h           // saturate updated P5[0..7] to unsigned bytes
++        sqxtun          v3.8b, v18.8h           // saturate updated P5[8..15] to unsigned bytes
++        tbnz            w2, #0, 1f              // rows 0..3 are left untouched if row 2 is not filtered
++        st2             {v0.b, v1.b}[0], [x0], x1
++        st2             {v0.b, v1.b}[1], [x0], x1
++        st2             {v0.b, v1.b}[2], [x0], x1
++        st2             {v0.b, v1.b}[3], [x0]
++1:      tbnz            w3, #0, 2f              // rows 4..7 are left untouched if row 6 is not filtered
++        st2             {v0.b, v1.b}[4], [x5], x1
++        st2             {v0.b, v1.b}[5], [x5], x1
++        st2             {v0.b, v1.b}[6], [x5], x1
++        st2             {v0.b, v1.b}[7], [x5]
++2:      tbnz            w7, #0, 3f              // rows 8..11 are left untouched if row 10 is not filtered
++        st2             {v2.b, v3.b}[0], [x4], x1
++        st2             {v2.b, v3.b}[1], [x4], x1
++        st2             {v2.b, v3.b}[2], [x4], x1
++        st2             {v2.b, v3.b}[3], [x4]
++3:      tbnz            w8, #0, 4f              // rows 12..15 are left untouched if row 14 is not filtered
++        st2             {v2.b, v3.b}[4], [x6], x1
++        st2             {v2.b, v3.b}[5], [x6], x1
++        st2             {v2.b, v3.b}[6], [x6], x1
++        st2             {v2.b, v3.b}[7], [x6]
++4:      ret
++endfunc
++
++// Copy at most the specified number of bytes from source to destination buffer,
++// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
++// On entry:
++//   x0 -> source buffer
++//   w1 = max number of bytes to copy
++//   x2 -> destination buffer, optimally 8-byte aligned
++// On exit:
++//   w0 = number of bytes not copied
++function ff_vc1_unescape_buffer_helper_neon, export=1
++        // Offset by 80 to screen out cases that are too short for us to handle,
++        // and also make it easy to test for loop termination, or to determine
++        // whether we need an odd number of half-iterations of the loop.
++        subs            w1, w1, #80
++        b.mi            90f                     // fewer than 80 bytes: copy nothing, return the full size
++
++        // Set up useful constants
++        movi            v20.4s, #3, lsl #24     // 0x03000000 in each word: bits of the 4th byte that are ignored
++        movi            v21.4s, #3, lsl #16     // 0x00030000 in each word: bytes 00 00 03 when read little-endian
++
++        tst             w1, #32                 // bit 5 of the biased count selects which loop half to enter
++        b.ne            1f
++
++          ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48 // v0, v1 = 32 bytes under test, v2 = look-ahead
++          ext             v25.16b, v0.16b, v1.16b, #1 // same data viewed 1, 2 and 3 bytes later,
++          ext             v26.16b, v0.16b, v1.16b, #2 // so that every byte offset gets tested as
++          ext             v27.16b, v0.16b, v1.16b, #3 // the start of a 32-bit word
++          ext             v29.16b, v1.16b, v2.16b, #1
++          ext             v30.16b, v1.16b, v2.16b, #2
++          ext             v31.16b, v1.16b, v2.16b, #3
++          bic             v24.16b, v0.16b, v20.16b // clear the ignored bits of each word
++          bic             v25.16b, v25.16b, v20.16b
++          bic             v26.16b, v26.16b, v20.16b
++          bic             v27.16b, v27.16b, v20.16b
++          bic             v28.16b, v1.16b, v20.16b
++          bic             v29.16b, v29.16b, v20.16b
++          bic             v30.16b, v30.16b, v20.16b
++          bic             v31.16b, v31.16b, v20.16b
++          eor             v24.16b, v24.16b, v21.16b // word becomes zero where it matches the pattern
++          eor             v25.16b, v25.16b, v21.16b
++          eor             v26.16b, v26.16b, v21.16b
++          eor             v27.16b, v27.16b, v21.16b
++          eor             v28.16b, v28.16b, v21.16b
++          eor             v29.16b, v29.16b, v21.16b
++          eor             v30.16b, v30.16b, v21.16b
++          eor             v31.16b, v31.16b, v21.16b
++          cmeq            v24.4s, v24.4s, #0 // all-ones in each word where an escape sequence starts
++          cmeq            v25.4s, v25.4s, #0
++          cmeq            v26.4s, v26.4s, #0
++          cmeq            v27.4s, v27.4s, #0
++          add             w1, w1, #32 // entering mid-loop: only one 32-byte half precedes the first subs #64
++          b               3f
++
++1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48 // v3, v4 = 32 bytes under test, v5 = look-ahead
++        ext             v25.16b, v3.16b, v4.16b, #1
++        ext             v26.16b, v3.16b, v4.16b, #2
++        ext             v27.16b, v3.16b, v4.16b, #3
++        ext             v29.16b, v4.16b, v5.16b, #1
++        ext             v30.16b, v4.16b, v5.16b, #2
++        ext             v31.16b, v4.16b, v5.16b, #3
++        bic             v24.16b, v3.16b, v20.16b
++        bic             v25.16b, v25.16b, v20.16b
++        bic             v26.16b, v26.16b, v20.16b
++        bic             v27.16b, v27.16b, v20.16b
++        bic             v28.16b, v4.16b, v20.16b
++        bic             v29.16b, v29.16b, v20.16b
++        bic             v30.16b, v30.16b, v20.16b
++        bic             v31.16b, v31.16b, v20.16b
++        eor             v24.16b, v24.16b, v21.16b
++        eor             v25.16b, v25.16b, v21.16b
++        eor             v26.16b, v26.16b, v21.16b
++        eor             v27.16b, v27.16b, v21.16b
++        eor             v28.16b, v28.16b, v21.16b
++        eor             v29.16b, v29.16b, v21.16b
++        eor             v30.16b, v30.16b, v21.16b
++        eor             v31.16b, v31.16b, v21.16b
++        cmeq            v24.4s, v24.4s, #0
++        cmeq            v25.4s, v25.4s, #0
++        cmeq            v26.4s, v26.4s, #0
++        cmeq            v27.4s, v27.4s, #0
++        // Drop through...
++2:        mov             v0.16b, v5.16b // previous look-ahead becomes the start of the next 32 bytes
++          ld1             {v1.16b, v2.16b}, [x0], #32
++        cmeq            v28.4s, v28.4s, #0 // finish the match test for the 32 bytes in v3, v4
++        cmeq            v29.4s, v29.4s, #0
++        cmeq            v30.4s, v30.4s, #0
++        cmeq            v31.4s, v31.4s, #0
++        orr             v24.16b, v24.16b, v25.16b // merge the eight match masks into one
++        orr             v26.16b, v26.16b, v27.16b
++        orr             v28.16b, v28.16b, v29.16b
++        orr             v30.16b, v30.16b, v31.16b
++          ext             v25.16b, v0.16b, v1.16b, #1
++        orr             v22.16b, v24.16b, v26.16b
++          ext             v26.16b, v0.16b, v1.16b, #2
++          ext             v27.16b, v0.16b, v1.16b, #3
++          ext             v29.16b, v1.16b, v2.16b, #1
++        orr             v23.16b, v28.16b, v30.16b
++          ext             v30.16b, v1.16b, v2.16b, #2
++          ext             v31.16b, v1.16b, v2.16b, #3
++          bic             v24.16b, v0.16b, v20.16b
++          bic             v25.16b, v25.16b, v20.16b
++          bic             v26.16b, v26.16b, v20.16b
++        orr             v22.16b, v22.16b, v23.16b
++          bic             v27.16b, v27.16b, v20.16b
++          bic             v28.16b, v1.16b, v20.16b
++          bic             v29.16b, v29.16b, v20.16b
++          bic             v30.16b, v30.16b, v20.16b
++          bic             v31.16b, v31.16b, v20.16b
++        addv            s22, v22.4s // sum of the word masks: non-zero if any escape was found
++          eor             v24.16b, v24.16b, v21.16b
++          eor             v25.16b, v25.16b, v21.16b
++          eor             v26.16b, v26.16b, v21.16b
++          eor             v27.16b, v27.16b, v21.16b
++          eor             v28.16b, v28.16b, v21.16b
++        mov             w3, v22.s[0]
++          eor             v29.16b, v29.16b, v21.16b
++          eor             v30.16b, v30.16b, v21.16b
++          eor             v31.16b, v31.16b, v21.16b
++          cmeq            v24.4s, v24.4s, #0
++          cmeq            v25.4s, v25.4s, #0
++          cmeq            v26.4s, v26.4s, #0
++          cmeq            v27.4s, v27.4s, #0
++        cbnz            w3, 90f // escape starts within v3, v4: stop without storing them
++        st1             {v3.16b, v4.16b}, [x2], #32 // no escape starts in these 32 bytes: copy them
++3:          mov             v3.16b, v2.16b // previous look-ahead becomes the start of the next 32 bytes
++            ld1             {v4.16b, v5.16b}, [x0], #32
++          cmeq            v28.4s, v28.4s, #0 // finish the match test for the 32 bytes in v0, v1
++          cmeq            v29.4s, v29.4s, #0
++          cmeq            v30.4s, v30.4s, #0
++          cmeq            v31.4s, v31.4s, #0
++          orr             v24.16b, v24.16b, v25.16b
++          orr             v26.16b, v26.16b, v27.16b
++          orr             v28.16b, v28.16b, v29.16b
++          orr             v30.16b, v30.16b, v31.16b
++            ext             v25.16b, v3.16b, v4.16b, #1
++          orr             v22.16b, v24.16b, v26.16b
++            ext             v26.16b, v3.16b, v4.16b, #2
++            ext             v27.16b, v3.16b, v4.16b, #3
++            ext             v29.16b, v4.16b, v5.16b, #1
++          orr             v23.16b, v28.16b, v30.16b
++            ext             v30.16b, v4.16b, v5.16b, #2
++            ext             v31.16b, v4.16b, v5.16b, #3
++            bic             v24.16b, v3.16b, v20.16b
++            bic             v25.16b, v25.16b, v20.16b
++            bic             v26.16b, v26.16b, v20.16b
++          orr             v22.16b, v22.16b, v23.16b
++            bic             v27.16b, v27.16b, v20.16b
++            bic             v28.16b, v4.16b, v20.16b
++            bic             v29.16b, v29.16b, v20.16b
++            bic             v30.16b, v30.16b, v20.16b
++            bic             v31.16b, v31.16b, v20.16b
++          addv            s22, v22.4s // sum of the word masks: non-zero if any escape was found
++            eor             v24.16b, v24.16b, v21.16b
++            eor             v25.16b, v25.16b, v21.16b
++            eor             v26.16b, v26.16b, v21.16b
++            eor             v27.16b, v27.16b, v21.16b
++            eor             v28.16b, v28.16b, v21.16b
++          mov             w3, v22.s[0]
++            eor             v29.16b, v29.16b, v21.16b
++            eor             v30.16b, v30.16b, v21.16b
++            eor             v31.16b, v31.16b, v21.16b
++            cmeq            v24.4s, v24.4s, #0
++            cmeq            v25.4s, v25.4s, #0
++            cmeq            v26.4s, v26.4s, #0
++            cmeq            v27.4s, v27.4s, #0
++          cbnz            w3, 91f // escape starts within v0, v1: stop without storing them
++          st1             {v0.16b, v1.16b}, [x2], #32 // no escape starts in these 32 bytes: copy them
++        subs            w1, w1, #64 // two 32-byte halves accounted for per full iteration
++        b.pl            2b
++
++90:     add             w0, w1, #80 // remove the entry bias: w0 = number of bytes not copied
++        ret
++
++91:     sub             w1, w1, #32 // count the half already stored this iteration (or cancel the +32 entry adjustment)
++        b               90b
++endfunc
 --- a/libavcodec/allcodecs.c
 +++ b/libavcodec/allcodecs.c
 @@ -149,6 +149,7 @@ extern AVCodec ff_hap_decoder;
@@ -15223,6 +17087,883 @@
 +        bx          lr
 +
 +endfunc
+--- a/libavcodec/arm/vc1dsp_init_neon.c
++++ b/libavcodec/arm/vc1dsp_init_neon.c
+@@ -19,6 +19,7 @@
+ #include <stdint.h>
+ 
+ #include "libavutil/attributes.h"
++#include "libavutil/intreadwrite.h"
+ #include "libavcodec/vc1dsp.h"
+ #include "vc1dsp.h"
+ 
+@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_
+ void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+ 
++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
++
+ void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int rnd);
+ 
+@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+ 
++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
++
++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
++{
++    /* Copy src to dst, dropping the 03 byte of each 00 00 03 0x escape sequence,
++     * and return the number of bytes written. The bulk scan-and-copy is done by
++     * the assembly helper; starting, stopping and removing the escape bytes are
++     * handled here. Assumes a little-endian machine that supports unaligned loads. */
++    int dsize = 0; /* bytes written to dst so far */
++    while (size >= 4) /* the escape test below reads 4 bytes at src */
++    {
++        int found = 0; /* set once src points at the start of an escape sequence */
++        while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* bytewise until dst is 8-byte aligned */
++        {
++            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* bytes 00 00 03 followed by 00..03 */
++            if (!found)
++            {
++                *dst++ = *src++;
++                --size;
++                ++dsize;
++            }
++        }
++        if (!found)
++        {
++            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* helper returns the count NOT copied */
++            dst += skip;
++            src += skip;
++            size -= skip;
++            dsize += skip;
++            while (!found && size >= 4) /* bytewise up to the escape (or the short tail) where the helper stopped */
++            {
++                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
++                if (!found)
++                {
++                    *dst++ = *src++;
++                    --size;
++                    ++dsize;
++                }
++            }
++        }
++        if (found)
++        {
++            *dst++ = *src++; /* keep the two leading 00 bytes ... */
++            *dst++ = *src++;
++            ++src; /* ... and drop the 03 escape byte */
++            size -= 3;
++            dsize += 2;
++        }
++    }
++    while (size > 0) /* fewer than 4 bytes remain: too short for the escape test, copy verbatim */
++    {
++        *dst++ = *src++;
++        --size;
++        ++dsize;
++    }
++    return dsize;
++}
++
+ #define FN_ASSIGN(X, Y) \
+     dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+     dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
+     dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
+     dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
+ 
++    dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
++    dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
++    dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
++    dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
++    dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
++    dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
++
+     dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
+     FN_ASSIGN(1, 0);
+     FN_ASSIGN(2, 0);
+@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
+     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+     dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+     dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
++
++    dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
+ }
+--- a/libavcodec/arm/vc1dsp_neon.S
++++ b/libavcodec/arm/vc1dsp_neon.S
+@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, e
+         vst1.32         {d1[1]},  [r0,:32]
+         bx              lr
+ endfunc
++
++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of lower block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter4_neon, export=1
++        sub             r3, r0, r1, lsl #2      @ r3 -> P1 row, 4 rows above r0
++        vldr            d0, .Lcoeffs            @ d0[0] = 2, d0[1] = 5 (filter weights used below)
++        vld1.32         {d1[0]}, [r0], r1       @ P5
++        vld1.32         {d2[0]}, [r3], r1       @ P1
++        vld1.32         {d3[0]}, [r3], r1       @ P2
++        vld1.32         {d4[0]}, [r0], r1       @ P6
++        vld1.32         {d5[0]}, [r3], r1       @ P3
++        vld1.32         {d6[0]}, [r0], r1       @ P7
++        vld1.32         {d7[0]}, [r3]           @ P4
++        vld1.32         {d16[0]}, [r0]          @ P8
++        vshll.u8        q9, d1, #1              @ 2*P5
++        vdup.16         d17, r2                 @ pq
++        vshll.u8        q10, d2, #1             @ 2*P1
++        vmovl.u8        q11, d3                 @ P2
++        vmovl.u8        q1, d4                  @ P6
++        vmovl.u8        q12, d5                 @ P3
++        vmls.i16        d20, d22, d0[1]         @ 2*P1-5*P2
++        vmovl.u8        q11, d6                 @ P7
++        vmls.i16        d18, d2, d0[1]          @ 2*P5-5*P6
++        vshll.u8        q2, d5, #1              @ 2*P3
++        vmovl.u8        q3, d7                  @ P4
++        vmla.i16        d18, d22, d0[1]         @ 2*P5-5*P6+5*P7
++        vmovl.u8        q11, d16                @ P8
++        vmla.u16        d20, d24, d0[1]         @ 2*P1-5*P2+5*P3
++        vmovl.u8        q12, d1                 @ P5
++        vmls.u16        d4, d6, d0[1]           @ 2*P3-5*P4
++        vmls.u16        d18, d22, d0[0]         @ 2*P5-5*P6+5*P7-2*P8
++        vsub.i16        d1, d6, d24             @ P4-P5
++        vmls.i16        d20, d6, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
++        vmla.i16        d4, d24, d0[1]          @ 2*P3-5*P4+5*P5
++        vmls.i16        d4, d2, d0[0]           @ 2*P3-5*P4+5*P5-2*P6
++        vabs.s16        d2, d1                  @ |P4-P5|
++        vrshr.s16       d3, d18, #3             @ a2 before abs: rounding >>3
++        vrshr.s16       d5, d20, #3             @ a1 before abs: rounding >>3
++        vshr.s16        d2, d2, #1              @ clip
++        vrshr.s16       d4, d4, #3              @ a0 before abs (sign still needed below)
++        vabs.s16        d3, d3                  @ a2
++        vshr.s16        d1, d1, #8              @ clip_sign
++        vabs.s16        d5, d5                  @ a1
++        vceq.i16        d7, d2, #0              @ test clip == 0
++        vabs.s16        d16, d4                 @ a0
++        vshr.s16        d4, d4, #8              @ a0_sign
++        vcge.s16        d18, d5, d3             @ test a1 >= a2
++        vcge.s16        d17, d16, d17           @ test a0 >= pq
++        vbsl            d18, d3, d5             @ a3
++        vsub.i16        d1, d1, d4              @ clip_sign - a0_sign
++        vorr            d3, d7, d17             @ test clip == 0 || a0 >= pq
++        vqsub.u16       d4, d16, d18            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        d5, d18, d16            @ test a3 >= a0
++        vmul.i16        d0, d4, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
++        vorr            d4, d3, d5              @ test clip == 0 || a0 >= pq || a3 >= a0
++        vmov.32         r0, d4[1]               @ move mask lanes 2 and 3 to gp reg
++        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        vcge.s16        d4, d0, d2              @ test d >= clip
++        tst             r0, #1                  @ bit 0 belongs to lane 2 (third pixel pair)
++        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
++        vbsl            d4, d2, d0              @ FFMIN(d, clip)
++        vbic            d0, d4, d3              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmls.i16        d6, d0, d1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        vmla.i16        d24, d0, d1             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        vqmovun.s16     d0, q3                  @ P4 narrowed to unsigned 8 bits with saturation
++        vqmovun.s16     d1, q12                 @ P5 narrowed likewise
++        vst1.32         {d0[0]}, [r3], r1       @ store 4 pels of P4 (r3 was left on the P4 row)
++        vst1.32         {d1[0]}, [r3]           @ store 4 pels of P5
++1:      bx              lr                      @ return
++endfunc
++
++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of right block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter4_neon, export=1
++        sub             r3, r0, #4              @ where to start reading
++        vldr            d0, .Lcoeffs            @ d0[0] = 2, d0[1] = 5 (filter weights used below)
++        vld1.32         {d2}, [r3], r1          @ row 0: P1..P8
++        sub             r0, r0, #1              @ where to start writing
++        vld1.32         {d4}, [r3], r1          @ row 1: P1..P8
++        vld1.32         {d3}, [r3], r1          @ row 2: P1..P8
++        vld1.32         {d5}, [r3]              @ row 3: P1..P8
++        vdup.16         d1, r2                  @ pq
++        vtrn.8          q1, q2                  @ interleave bytes of rows (0,1) and rows (2,3)
++        vtrn.16         d2, d3                  @ P1, P5, P3, P7
++        vtrn.16         d4, d5                  @ P2, P6, P4, P8
++        vshll.u8        q3, d2, #1              @ 2*P1, 2*P5
++        vmovl.u8        q8, d4                  @ P2, P6
++        vmovl.u8        q9, d3                  @ P3, P7
++        vmovl.u8        q2, d5                  @ P4, P8
++        vmls.i16        q3, q8, d0[1]           @ 2*P1-5*P2, 2*P5-5*P6
++        vshll.u8        q10, d3, #1             @ 2*P3, 2*P7
++        vmovl.u8        q1, d2                  @ P1, P5
++        vmla.i16        q3, q9, d0[1]           @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
++        vmls.i16        q3, q2, d0[0]           @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
++        vmov            d2, d3                  @ needs to be in an even-numbered vector for when we come to narrow it later
++        vmls.i16        d20, d4, d0[1]          @ 2*P3-5*P4
++        vmla.i16        d20, d3, d0[1]          @ 2*P3-5*P4+5*P5
++        vsub.i16        d3, d4, d2              @ P4-P5
++        vmls.i16        d20, d17, d0[0]         @ 2*P3-5*P4+5*P5-2*P6
++        vrshr.s16       q3, q3, #3              @ a1, a2 before abs: rounding >>3
++        vabs.s16        d5, d3                  @ |P4-P5|
++        vshr.s16        d3, d3, #8              @ clip_sign
++        vrshr.s16       d16, d20, #3            @ a0 before abs (sign still needed below)
++        vabs.s16        q3, q3                  @ a1, a2
++        vshr.s16        d5, d5, #1              @ clip
++        vabs.s16        d17, d16                @ a0
++        vceq.i16        d18, d5, #0             @ test clip == 0
++        vshr.s16        d16, d16, #8            @ a0_sign
++        vcge.s16        d19, d6, d7             @ test a1 >= a2
++        vcge.s16        d1, d17, d1             @ test a0 >= pq
++        vsub.i16        d16, d3, d16            @ clip_sign - a0_sign
++        vbsl            d19, d7, d6             @ a3
++        vorr            d1, d18, d1             @ test clip == 0 || a0 >= pq
++        vqsub.u16       d3, d17, d19            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        d6, d19, d17            @ test a3 >= a0
++        vmul.i16        d0, d3, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
++        vorr            d3, d1, d6              @ test clip == 0 || a0 >= pq || a3 >= a0
++        vmov.32         r2, d3[1]               @ move mask lanes 2 and 3 to gp reg
++        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        vcge.s16        d3, d0, d5              @ test d >= clip
++        tst             r2, #1                  @ bit 0 belongs to lane 2 (third pixel pair, row 2)
++        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
++        vbsl            d3, d5, d0              @ FFMIN(d, clip)
++        vbic            d0, d3, d1              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmla.i16        d2, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        vmls.i16        d4, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        vqmovun.s16     d1, q1                  @ P5 narrowed to unsigned 8 bits with saturation
++        vqmovun.s16     d0, q2                  @ P4 narrowed likewise
++        vst2.8          {d0[0], d1[0]}, [r0], r1 @ row 0: write adjacent P4, P5 bytes
++        vst2.8          {d0[1], d1[1]}, [r0], r1 @ row 1
++        vst2.8          {d0[2], d1[2]}, [r0], r1 @ row 2
++        vst2.8          {d0[3], d1[3]}, [r0]    @ row 3
++1:      bx              lr                      @ return
++endfunc
++
++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of lower block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter8_neon, export=1
++        sub             r3, r0, r1, lsl #2      @ r3 -> P1 row, 4 rows above r0
++        vldr            d0, .Lcoeffs            @ d0[0] = 2, d0[1] = 5 (filter weights used below)
++        vld1.32         {d1}, [r0 :64], r1      @ P5
++        vld1.32         {d2}, [r3 :64], r1      @ P1
++        vld1.32         {d3}, [r3 :64], r1      @ P2
++        vld1.32         {d4}, [r0 :64], r1      @ P6
++        vld1.32         {d5}, [r3 :64], r1      @ P3
++        vld1.32         {d6}, [r0 :64], r1      @ P7
++        vshll.u8        q8, d1, #1              @ 2*P5
++        vshll.u8        q9, d2, #1              @ 2*P1
++        vld1.32         {d7}, [r3 :64]          @ P4
++        vmovl.u8        q1, d3                  @ P2
++        vld1.32         {d20}, [r0 :64]         @ P8
++        vmovl.u8        q11, d4                 @ P6
++        vdup.16         q12, r2                 @ pq
++        vmovl.u8        q13, d5                 @ P3
++        vmls.i16        q9, q1, d0[1]           @ 2*P1-5*P2
++        vmovl.u8        q1, d6                  @ P7
++        vshll.u8        q2, d5, #1              @ 2*P3
++        vmls.i16        q8, q11, d0[1]          @ 2*P5-5*P6
++        vmovl.u8        q3, d7                  @ P4
++        vmovl.u8        q10, d20                @ P8
++        vmla.i16        q8, q1, d0[1]           @ 2*P5-5*P6+5*P7
++        vmovl.u8        q1, d1                  @ P5
++        vmla.i16        q9, q13, d0[1]          @ 2*P1-5*P2+5*P3
++        vsub.i16        q13, q3, q1             @ P4-P5
++        vmls.i16        q2, q3, d0[1]           @ 2*P3-5*P4
++        vmls.i16        q8, q10, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
++        vabs.s16        q10, q13                @ |P4-P5|
++        vshr.s16        q13, q13, #8            @ clip_sign
++        vmls.i16        q9, q3, d0[0]           @ 2*P1-5*P2+5*P3-2*P4
++        vshr.s16        q10, q10, #1            @ clip
++        vmla.i16        q2, q1, d0[1]           @ 2*P3-5*P4+5*P5
++        vrshr.s16       q8, q8, #3              @ a2 before abs: rounding >>3
++        vmls.i16        q2, q11, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
++        vceq.i16        q11, q10, #0            @ test clip == 0
++        vrshr.s16       q9, q9, #3              @ a1 before abs: rounding >>3
++        vabs.s16        q8, q8                  @ a2
++        vabs.s16        q9, q9                  @ a1
++        vrshr.s16       q2, q2, #3              @ a0 before abs (sign still needed below)
++        vcge.s16        q14, q9, q8             @ test a1 >= a2
++        vabs.s16        q15, q2                 @ a0
++        vshr.s16        q2, q2, #8              @ a0_sign
++        vbsl            q14, q8, q9             @ a3
++        vcge.s16        q8, q15, q12            @ test a0 >= pq
++        vsub.i16        q2, q13, q2             @ clip_sign - a0_sign
++        vqsub.u16       q9, q15, q14            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q12, q14, q15           @ test a3 >= a0
++        vorr            q8, q11, q8             @ test clip == 0 || a0 >= pq
++        vmul.i16        q0, q9, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
++        vorr            q9, q8, q12             @ test clip == 0 || a0 >= pq || a3 >= a0
++        vshl.i64        q11, q9, #16            @ move lane 2 of each group of 4 to the top of its 64 bits
++        vmov.32         r0, d18[1]              @ move mask lanes 2 and 3 (pairs 0..3) to gp reg
++        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        vmov.32         r2, d19[1]              @ mask lanes 6 and 7 (pairs 4..7) to gp reg
++        vshr.s64        q9, q11, #48            @ sign-extend so the lane 2 / lane 6 mask fills its whole group of 4
++        vcge.s16        q11, q0, q10            @ test d >= clip
++        vorr            q8, q8, q9              @ per-lane skip mask || skip mask of the group it belongs to
++        and             r0, r0, r2              @ bit 0 set only if lane 2 and lane 6 masks are both set
++        vbsl            q11, q10, q0            @ FFMIN(d, clip)
++        tst             r0, #1                  @ are both groups of 4 unfiltered?
++        bne             1f                      @ none of the 8 pixel pairs should be updated in this case
++        vbic            q0, q11, q8             @ set each d to zero if it should not be filtered
++        vmls.i16        q3, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        vmla.i16        q1, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        vqmovun.s16     d0, q3                  @ P4 narrowed to unsigned 8 bits with saturation
++        vqmovun.s16     d1, q1                  @ P5 narrowed likewise
++        vst1.32         {d0}, [r3 :64], r1      @ store P4 row (r3 was left on the P4 row)
++        vst1.32         {d1}, [r3 :64]          @ store P5 row
++1:      bx              lr                      @ return
++endfunc
++
++.align  5                       @ GAS on ARM: align to 2^5 = 32 bytes
++.Lcoeffs:                       @ filter weights, fetched with vldr d0 by the loop filter functions in this file
++.quad   0x00050002              @ as 16-bit lanes of d0 the code uses d0[0] = 2 and d0[1] = 5, remaining lanes are zero
++
++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of right block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter8_neon, export=1
++        push            {lr}                    @ lr (r14) is reused as a scratch register below
++        sub             r3, r0, #4              @ where to start reading
++        vldr            d0, .Lcoeffs            @ d0[0] = 2, d0[1] = 5 (filter weights used below)
++        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
++        sub             r0, r0, #1              @ where to start writing
++        vld1.32         {d4}, [r3], r1          @ row 1
++        add             r12, r0, r1, lsl #2     @ where to start writing rows 4..7
++        vld1.32         {d3}, [r3], r1          @ row 2
++        vld1.32         {d5}, [r3], r1          @ row 3
++        vld1.32         {d6}, [r3], r1          @ row 4
++        vld1.32         {d16}, [r3], r1         @ row 5
++        vld1.32         {d7}, [r3], r1          @ row 6
++        vld1.32         {d17}, [r3]             @ row 7
++        vtrn.8          q1, q2                  @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
++        vdup.16         q9, r2                  @ pq
++        vtrn.16         d2, d3                  @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
++        vtrn.16         d4, d5                  @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
++        vtrn.8          q3, q8                  @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
++        vtrn.16         d6, d7                  @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
++        vtrn.16         d16, d17                @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
++        vtrn.32         d2, d6                  @ P1, P5
++        vtrn.32         d4, d16                 @ P2, P6
++        vtrn.32         d3, d7                  @ P3, P7
++        vtrn.32         d5, d17                 @ P4, P8
++        vshll.u8        q10, d2, #1             @ 2*P1
++        vshll.u8        q11, d6, #1             @ 2*P5
++        vmovl.u8        q12, d4                 @ P2
++        vmovl.u8        q13, d16                @ P6
++        vmovl.u8        q14, d3                 @ P3
++        vmls.i16        q10, q12, d0[1]         @ 2*P1-5*P2
++        vmovl.u8        q12, d7                 @ P7
++        vshll.u8        q1, d3, #1              @ 2*P3
++        vmls.i16        q11, q13, d0[1]         @ 2*P5-5*P6
++        vmovl.u8        q2, d5                  @ P4
++        vmovl.u8        q8, d17                 @ P8
++        vmla.i16        q11, q12, d0[1]         @ 2*P5-5*P6+5*P7
++        vmovl.u8        q3, d6                  @ P5
++        vmla.i16        q10, q14, d0[1]         @ 2*P1-5*P2+5*P3
++        vsub.i16        q12, q2, q3             @ P4-P5
++        vmls.i16        q1, q2, d0[1]           @ 2*P3-5*P4
++        vmls.i16        q11, q8, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
++        vabs.s16        q8, q12                 @ |P4-P5|
++        vshr.s16        q12, q12, #8            @ clip_sign
++        vmls.i16        q10, q2, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
++        vshr.s16        q8, q8, #1              @ clip
++        vmla.i16        q1, q3, d0[1]           @ 2*P3-5*P4+5*P5
++        vrshr.s16       q11, q11, #3            @ a2 before abs: rounding >>3
++        vmls.i16        q1, q13, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
++        vceq.i16        q13, q8, #0             @ test clip == 0
++        vrshr.s16       q10, q10, #3            @ a1 before abs: rounding >>3
++        vabs.s16        q11, q11                @ a2
++        vabs.s16        q10, q10                @ a1
++        vrshr.s16       q1, q1, #3              @ a0 before abs (sign still needed below)
++        vcge.s16        q14, q10, q11           @ test a1 >= a2
++        vabs.s16        q15, q1                 @ a0
++        vshr.s16        q1, q1, #8              @ a0_sign
++        vbsl            q14, q11, q10           @ a3
++        vcge.s16        q9, q15, q9             @ test a0 >= pq
++        vsub.i16        q1, q12, q1             @ clip_sign - a0_sign
++        vqsub.u16       q10, q15, q14           @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q11, q14, q15           @ test a3 >= a0
++        vorr            q9, q13, q9             @ test clip == 0 || a0 >= pq
++        vmul.i16        q0, q10, d0[1]          @ a0 >= a3 ? 5*(a0-a3) : 0
++        vorr            q10, q9, q11            @ test clip == 0 || a0 >= pq || a3 >= a0
++        vmov.32         r2, d20[1]              @ move mask lanes 2 and 3 (rows 0..3) to gp reg
++        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        vmov.32         r3, d21[1]              @ mask lanes 6 and 7 (rows 4..7) to gp reg
++        vcge.s16        q10, q0, q8             @ test d >= clip
++        and             r14, r2, r3             @ bit 0 set only if lane 2 and lane 6 masks are both set
++        vbsl            q10, q8, q0             @ FFMIN(d, clip)
++        tst             r14, #1                 @ are both groups of 4 rows unfiltered?
++        bne             2f                      @ none of the 8 pixel pairs should be updated in this case
++        vbic            q0, q10, q9             @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmla.i16        q3, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        vmls.i16        q2, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        vqmovun.s16     d1, q3                  @ P5 narrowed to unsigned 8 bits with saturation
++        vqmovun.s16     d0, q2                  @ P4 narrowed likewise
++        tst             r2, #1                  @ lane 2 mask decides rows 0..3
++        bne             1f                      @ none of the first 4 pixel pairs should be updated if so
++        vst2.8          {d0[0], d1[0]}, [r0], r1 @ row 0: write adjacent P4, P5 bytes
++        vst2.8          {d0[1], d1[1]}, [r0], r1 @ row 1
++        vst2.8          {d0[2], d1[2]}, [r0], r1 @ row 2
++        vst2.8          {d0[3], d1[3]}, [r0]    @ row 3
++1:      tst             r3, #1                  @ lane 6 mask decides rows 4..7
++        bne             2f                      @ none of the second 4 pixel pairs should be updated if so
++        vst2.8          {d0[4], d1[4]}, [r12], r1 @ row 4
++        vst2.8          {d0[5], d1[5]}, [r12], r1 @ row 5
++        vst2.8          {d0[6], d1[6]}, [r12], r1 @ row 6
++        vst2.8          {d0[7], d1[7]}, [r12]   @ row 7
++2:      pop             {pc}                    @ return by popping the saved lr into pc
++endfunc
++
++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of lower block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter16_neon, export=1
++        vpush           {d8-d15}                @ q4-q7 are used as temporaries below
++        sub             r3, r0, r1, lsl #2      @ r3 -> P1 row, 4 rows above r0
++        vldr            d0, .Lcoeffs            @ d0[0] = 2, d0[1] = 5 (filter weights used below)
++        vld1.64         {q1}, [r0 :128], r1     @ P5
++        vld1.64         {q2}, [r3 :128], r1     @ P1
++        vld1.64         {q3}, [r3 :128], r1     @ P2
++        vld1.64         {q4}, [r0 :128], r1     @ P6
++        vld1.64         {q5}, [r3 :128], r1     @ P3
++        vld1.64         {q6}, [r0 :128], r1     @ P7
++        vshll.u8        q7, d2, #1              @ 2*P5[0..7]
++        vshll.u8        q8, d4, #1              @ 2*P1[0..7]
++        vld1.64         {q9}, [r3 :128]         @ P4
++        vmovl.u8        q10, d6                 @ P2[0..7]
++        vld1.64         {q11}, [r0 :128]        @ P8
++        vmovl.u8        q12, d8                 @ P6[0..7]
++        vdup.16         q13, r2                 @ pq
++        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
++        vmls.i16        q8, q10, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
++        vshll.u8        q10, d3, #1             @ 2*P5[8..15]
++        vmovl.u8        q3, d7                  @ P2[8..15]
++        vmls.i16        q7, q12, d0[1]          @ 2*P5[0..7]-5*P6[0..7]
++        vmovl.u8        q4, d9                  @ P6[8..15]
++        vmovl.u8        q14, d10                @ P3[0..7]
++        vmovl.u8        q15, d12                @ P7[0..7]
++        vmls.i16        q2, q3, d0[1]           @ 2*P1[8..15]-5*P2[8..15]
++        vshll.u8        q3, d10, #1             @ 2*P3[0..7]
++        vmls.i16        q10, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
++        vmovl.u8        q6, d13                 @ P7[8..15]
++        vmla.i16        q8, q14, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++        vmovl.u8        q14, d18                @ P4[0..7]
++        vmovl.u8        q9, d19                 @ P4[8..15]
++        vmla.i16        q7, q15, d0[1]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++        vmovl.u8        q15, d11                @ P3[8..15]
++        vshll.u8        q5, d11, #1             @ 2*P3[8..15]
++        vmls.i16        q3, q14, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
++        vmla.i16        q2, q15, d0[1]          @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++        vmovl.u8        q15, d22                @ P8[0..7]
++        vmovl.u8        q11, d23                @ P8[8..15]
++        vmla.i16        q10, q6, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++        vmovl.u8        q6, d2                  @ P5[0..7]
++        vmovl.u8        q1, d3                  @ P5[8..15]
++        vmls.i16        q5, q9, d0[1]           @ 2*P3[8..15]-5*P4[8..15]
++        vmls.i16        q8, q14, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++        vmls.i16        q7, q15, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++        vsub.i16        q15, q14, q6            @ P4[0..7]-P5[0..7]
++        vmla.i16        q3, q6, d0[1]           @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++        vrshr.s16       q8, q8, #3              @ a1[0..7] before abs: rounding >>3
++        vmls.i16        q2, q9, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++        vrshr.s16       q7, q7, #3              @ a2[0..7] before abs: rounding >>3
++        vmls.i16        q10, q11, d0[0]         @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++        vabs.s16        q11, q15                @ |P4[0..7]-P5[0..7]|
++        vabs.s16        q8, q8                  @ a1[0..7]
++        vmla.i16        q5, q1, d0[1]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++        vshr.s16        q15, q15, #8            @ clip_sign[0..7]
++        vrshr.s16       q2, q2, #3              @ a1[8..15] before abs: rounding >>3
++        vmls.i16        q3, q12, d0[0]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++        vabs.s16        q7, q7                  @ a2[0..7]
++        vrshr.s16       q10, q10, #3            @ a2[8..15] before abs: rounding >>3
++        vsub.i16        q12, q9, q1             @ P4[8..15]-P5[8..15]
++        vshr.s16        q11, q11, #1            @ clip[0..7]
++        vmls.i16        q5, q4, d0[0]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++        vcge.s16        q4, q8, q7              @ test a1[0..7] >= a2[0..7]
++        vabs.s16        q2, q2                  @ a1[8..15]
++        vrshr.s16       q3, q3, #3              @ a0[0..7] before abs (sign still needed below)
++        vabs.s16        q10, q10                @ a2[8..15]
++        vbsl            q4, q7, q8              @ a3[0..7]
++        vabs.s16        q7, q12                 @ |P4[8..15]-P5[8..15]|
++        vshr.s16        q8, q12, #8             @ clip_sign[8..15]
++        vrshr.s16       q5, q5, #3              @ a0[8..15] before abs (sign still needed below)
++        vcge.s16        q12, q2, q10            @ test a1[8..15] >= a2[8..15]
++        vshr.s16        q7, q7, #1              @ clip[8..15]
++        vbsl            q12, q10, q2            @ a3[8..15]
++        vabs.s16        q2, q3                  @ a0[0..7]
++        vceq.i16        q10, q11, #0            @ test clip[0..7] == 0
++        vshr.s16        q3, q3, #8              @ a0_sign[0..7]
++        vsub.i16        q3, q15, q3             @ clip_sign[0..7] - a0_sign[0..7]
++        vcge.s16        q15, q2, q13            @ test a0[0..7] >= pq
++        vorr            q10, q10, q15           @ test clip[0..7] == 0 || a0[0..7] >= pq
++        vqsub.u16       q15, q2, q4             @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q2, q4, q2              @ test a3[0..7] >= a0[0..7]
++        vabs.s16        q4, q5                  @ a0[8..15]
++        vshr.s16        q5, q5, #8              @ a0_sign[8..15]
++        vmul.i16        q15, q15, d0[1]         @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++        vcge.s16        q13, q4, q13            @ test a0[8..15] >= pq
++        vorr            q2, q10, q2             @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++        vsub.i16        q5, q8, q5              @ clip_sign[8..15] - a0_sign[8..15]
++        vceq.i16        q8, q7, #0              @ test clip[8..15] == 0
++        vshr.u16        q15, q15, #3            @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++        vmov.32         r0, d4[1]               @ move mask lanes 2 and 3 (pairs 0..3) to gp reg
++        vorr            q8, q8, q13             @ test clip[8..15] == 0 || a0[8..15] >= pq
++        vqsub.u16       q13, q4, q12            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vmov.32         r2, d5[1]               @ mask lanes 6 and 7 (pairs 4..7) to gp reg
++        vcge.s16        q4, q12, q4             @ test a3[8..15] >= a0[8..15]
++        vshl.i64        q2, q2, #16             @ move lane 2 of each group of 4 to the top of its 64 bits [0..7]
++        vcge.s16        q12, q15, q11           @ test d[0..7] >= clip[0..7]
++        vmul.i16        q0, q13, d0[1]          @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++        vorr            q4, q8, q4              @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++        vshr.s64        q2, q2, #48             @ sign-extend so the lane 2 / lane 6 mask fills its whole group of 4
++        and             r0, r0, r2              @ bit 0 set only if lane 2 and lane 6 masks are both set
++        vbsl            q12, q11, q15           @ FFMIN(d[0..7], clip[0..7])
++        vshl.i64        q11, q4, #16            @ move lane 2 of each group of 4 to the top of its 64 bits [8..15]
++        vmov.32         r2, d8[1]               @ mask lanes 10 and 11 (pairs 8..11) to gp reg
++        vshr.u16        q0, q0, #3              @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++        vorr            q2, q10, q2             @ per-lane skip mask || skip mask of the group it belongs to [0..7]
++        vmov.32         r12, d9[1]              @ mask lanes 14 and 15 (pairs 12..15) to gp reg
++        vshr.s64        q4, q11, #48            @ sign-extend so the lane 10 / lane 14 mask fills its whole group of 4
++        vcge.s16        q10, q0, q7             @ test d[8..15] >= clip[8..15]
++        vbic            q2, q12, q2             @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++        vorr            q4, q8, q4              @ per-lane skip mask || skip mask of the group it belongs to [8..15]
++        and             r2, r2, r12             @ bit 0 set only if lane 10 and lane 14 masks are both set
++        vbsl            q10, q7, q0             @ FFMIN(d[8..15], clip[8..15])
++        vmls.i16        q14, q2, q3             @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
++        and             r0, r0, r2              @ bit 0 set only if lanes 2, 6, 10 and 14 are all masked out
++        vbic            q0, q10, q4             @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++        tst             r0, #1                  @ are all four groups of 4 unfiltered?
++        bne             1f                      @ none of the 16 pixel pairs should be updated in this case
++        vmla.i16        q6, q2, q3              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
++        vmls.i16        q9, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
++        vqmovun.s16     d4, q14                 @ P4[0..7] narrowed to unsigned 8 bits with saturation
++        vmla.i16        q1, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
++        vqmovun.s16     d0, q6                  @ P5[0..7] narrowed likewise
++        vqmovun.s16     d5, q9                  @ P4[8..15] narrowed likewise
++        vqmovun.s16     d1, q1                  @ P5[8..15] narrowed likewise
++        vst1.64         {q2}, [r3 :128], r1     @ store P4 row (r3 was left on the P4 row)
++        vst1.64         {q0}, [r3 :128]         @ store P5 row
++1:      vpop            {d8-d15}                @ restore q4-q7
++        bx              lr                      @ return
++endfunc
++
++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of right block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter16_neon, export=1
++        push            {r4-r6,lr}
++        vpush           {d8-d15}
++        sub             r3, r0, #4              @ where to start reading
++        vldr            d0, .Lcoeffs
++        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
++        sub             r0, r0, #1              @ where to start writing
++        vld1.32         {d3}, [r3], r1
++        add             r4, r0, r1, lsl #2
++        vld1.32         {d10}, [r3], r1
++        vld1.32         {d11}, [r3], r1
++        vld1.32         {d16}, [r3], r1
++        vld1.32         {d4}, [r3], r1
++        vld1.32         {d8}, [r3], r1
++        vtrn.8          d2, d3                  @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
++        vld1.32         {d14}, [r3], r1
++        vld1.32         {d5}, [r3], r1
++        vtrn.8          d10, d11                @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
++        vld1.32         {d6}, [r3], r1
++        vld1.32         {d12}, [r3], r1
++        vtrn.8          d16, d4                 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
++        vld1.32         {d13}, [r3], r1
++        vtrn.16         d2, d10                 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
++        vld1.32         {d1}, [r3], r1
++        vtrn.8          d8, d14                 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
++        vld1.32         {d7}, [r3], r1
++        vtrn.16         d3, d11                 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
++        vld1.32         {d9}, [r3], r1
++        vtrn.8          d5, d6                  @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
++        vld1.32         {d15}, [r3]
++        vtrn.16         d16, d8                 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
++        vtrn.16         d4, d14                 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
++        vtrn.8          d12, d13                @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
++        vdup.16         q9, r2                  @ pq
++        vtrn.8          d1, d7                  @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
++        vtrn.32         d2, d16                 @ P1[0..7], P5[0..7]
++        vtrn.16         d5, d12                 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
++        vtrn.16         d6, d13                 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
++        vtrn.8          d9, d15                 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
++        vtrn.32         d3, d4                  @ P2[0..7], P6[0..7]
++        vshll.u8        q10, d2, #1             @ 2*P1[0..7]
++        vtrn.32         d10, d8                 @ P3[0..7], P7[0..7]
++        vshll.u8        q11, d16, #1            @ 2*P5[0..7]
++        vtrn.32         d11, d14                @ P4[0..7], P8[0..7]
++        vtrn.16         d1, d9                  @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
++        vtrn.16         d7, d15                 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
++        vmovl.u8        q1, d3                  @ P2[0..7]
++        vmovl.u8        q12, d4                 @ P6[0..7]
++        vtrn.32         d5, d1                  @ P1[8..15], P5[8..15]
++        vtrn.32         d6, d7                  @ P2[8..15], P6[8..15]
++        vtrn.32         d12, d9                 @ P3[8..15], P7[8..15]
++        vtrn.32         d13, d15                @ P4[8..15], P8[8..15]
++        vmls.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
++        vmovl.u8        q1, d10                 @ P3[0..7]
++        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
++        vshll.u8        q13, d1, #1             @ 2*P5[8..15]
++        vmls.i16        q11, q12, d0[1]         @ 2*P5[0..7]-5*P6[0..7]
++        vmovl.u8        q14, d6                 @ P2[8..15]
++        vmovl.u8        q3, d7                  @ P6[8..15]
++        vmovl.u8        q15, d8                 @ P7[0..7]
++        vmla.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++        vmovl.u8        q1, d12                 @ P3[8..15]
++        vmls.i16        q2, q14, d0[1]          @ 2*P1[8..15]-5*P2[8..15]
++        vmovl.u8        q4, d9                  @ P7[8..15]
++        vshll.u8        q14, d10, #1            @ 2*P3[0..7]
++        vmls.i16        q13, q3, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
++        vmovl.u8        q5, d11                 @ P4[0..7]
++        vmla.i16        q11, q15, d0[1]         @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++        vshll.u8        q15, d12, #1            @ 2*P3[8..15]
++        vmovl.u8        q6, d13                 @ P4[8..15]
++        vmla.i16        q2, q1, d0[1]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++        vmovl.u8        q1, d14                 @ P8[0..7]
++        vmovl.u8        q7, d15                 @ P8[8..15]
++        vmla.i16        q13, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++        vmovl.u8        q4, d16                 @ P5[0..7]
++        vmovl.u8        q8, d1                  @ P5[8..15]
++        vmls.i16        q14, q5, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
++        vmls.i16        q15, q6, d0[1]          @ 2*P3[8..15]-5*P4[8..15]
++        vmls.i16        q10, q5, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++        vmls.i16        q11, q1, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++        vsub.i16        q1, q5, q4              @ P4[0..7]-P5[0..7]
++        vmls.i16        q2, q6, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++        vrshr.s16       q10, q10, #3
++        vmls.i16        q13, q7, d0[0]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++        vsub.i16        q7, q6, q8              @ P4[8..15]-P5[8..15]
++        vrshr.s16       q11, q11, #3
++        vmla.s16        q14, q4, d0[1]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++        vrshr.s16       q2, q2, #3
++        vmla.i16        q15, q8, d0[1]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++        vabs.s16        q10, q10                @ a1[0..7]
++        vrshr.s16       q13, q13, #3
++        vmls.i16        q15, q3, d0[0]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++        vabs.s16        q3, q11                 @ a2[0..7]
++        vabs.s16        q2, q2                  @ a1[8..15]
++        vmls.i16        q14, q12, d0[0]         @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++        vabs.s16        q11, q1
++        vabs.s16        q12, q13                @ a2[8..15]
++        vcge.s16        q13, q10, q3            @ test a1[0..7] >= a2[0..7]
++        vshr.s16        q1, q1, #8              @ clip_sign[0..7]
++        vrshr.s16       q15, q15, #3
++        vshr.s16        q11, q11, #1            @ clip[0..7]
++        vrshr.s16       q14, q14, #3
++        vbsl            q13, q3, q10            @ a3[0..7]
++        vcge.s16        q3, q2, q12             @ test a1[8..15] >= a2[8..15]
++        vabs.s16        q10, q15                @ a0[8..15]
++        vshr.s16        q15, q15, #8            @ a0_sign[8..15]
++        vbsl            q3, q12, q2             @ a3[8..15]
++        vabs.s16        q2, q14                 @ a0[0..7]
++        vabs.s16        q12, q7
++        vshr.s16        q7, q7, #8              @ clip_sign[8..15]
++        vshr.s16        q14, q14, #8            @ a0_sign[0..7]
++        vshr.s16        q12, q12, #1            @ clip[8..15]
++        vsub.i16        q7, q7, q15             @ clip_sign[8..15] - a0_sign[8..15]
++        vqsub.u16       q15, q10, q3            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q3, q3, q10             @ test a3[8..15] >= a0[8..15]
++        vcge.s16        q10, q10, q9            @ test a0[8..15] >= pq
++        vcge.s16        q9, q2, q9              @ test a0[0..7] >= pq
++        vsub.i16        q1, q1, q14             @ clip_sign[0..7] - a0_sign[0..7]
++        vqsub.u16       q14, q2, q13            @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q2, q13, q2             @ test a3[0..7] >= a0[0..7]
++        vmul.i16        q13, q15, d0[1]         @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++        vceq.i16        q15, q11, #0            @ test clip[0..7] == 0
++        vmul.i16        q0, q14, d0[1]          @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++        vorr            q9, q15, q9             @ test clip[0..7] == 0 || a0[0..7] >= pq
++        vceq.i16        q14, q12, #0            @ test clip[8..15] == 0
++        vshr.u16        q13, q13, #3            @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++        vorr            q2, q9, q2              @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++        vshr.u16        q0, q0, #3              @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++        vorr            q10, q14, q10           @ test clip[8..15] == 0 || a0[8..15] >= pq
++        vcge.s16        q14, q13, q12
++        vmov.32         r2, d4[1]               @ move to gp reg
++        vorr            q3, q10, q3             @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++        vmov.32         r3, d5[1]
++        vcge.s16        q2, q0, q11
++        vbsl            q14, q12, q13           @ FFMIN(d[8..15], clip[8..15])
++        vbsl            q2, q11, q0             @ FFMIN(d[0..7], clip[0..7])
++        vmov.32         r5, d6[1]
++        vbic            q0, q14, q10            @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmov.32         r6, d7[1]
++        and             r12, r2, r3
++        vbic            q2, q2, q9              @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmls.i16        q6, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
++        vmls.i16        q5, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
++        and             r14, r5, r6
++        vmla.i16        q4, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
++        and             r12, r12, r14
++        vqmovun.s16     d4, q6
++        vmla.i16        q8, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
++        tst             r12, #1
++        bne             4f                      @ none of the 16 pixel pairs should be updated in this case
++        vqmovun.s16     d2, q5
++        vqmovun.s16     d3, q4
++        vqmovun.s16     d5, q8
++        tst             r2, #1
++        bne             1f
++        vst2.8          {d2[0], d3[0]}, [r0], r1
++        vst2.8          {d2[1], d3[1]}, [r0], r1
++        vst2.8          {d2[2], d3[2]}, [r0], r1
++        vst2.8          {d2[3], d3[3]}, [r0]
++1:      add             r0, r4, r1, lsl #2
++        tst             r3, #1
++        bne             2f
++        vst2.8          {d2[4], d3[4]}, [r4], r1
++        vst2.8          {d2[5], d3[5]}, [r4], r1
++        vst2.8          {d2[6], d3[6]}, [r4], r1
++        vst2.8          {d2[7], d3[7]}, [r4]
++2:      add             r4, r0, r1, lsl #2
++        tst             r5, #1
++        bne             3f
++        vst2.8          {d4[0], d5[0]}, [r0], r1
++        vst2.8          {d4[1], d5[1]}, [r0], r1
++        vst2.8          {d4[2], d5[2]}, [r0], r1
++        vst2.8          {d4[3], d5[3]}, [r0]
++3:      tst             r6, #1
++        bne             4f
++        vst2.8          {d4[4], d5[4]}, [r4], r1
++        vst2.8          {d4[5], d5[5]}, [r4], r1
++        vst2.8          {d4[6], d5[6]}, [r4], r1
++        vst2.8          {d4[7], d5[7]}, [r4]
++4:      vpop            {d8-d15}
++        pop             {r4-r6,pc}
++endfunc
++
++@ Copy at most the specified number of bytes from source to destination buffer,
++@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
++@ On entry:
++@   r0 -> source buffer
++@   r1 = max number of bytes to copy (fewer than 48 means nothing is copied)
++@   r2 -> destination buffer, optimally 8-byte aligned
++@ On exit:
++@   r0 = number of bytes not copied (r1, r2, r3, r12 and q0, q1, q3, q8-q15 are also modified)
++function ff_vc1_unescape_buffer_helper_neon, export=1
++        @ Offset by 48 to screen out cases that are too short for us to handle,
++        @ and also make it easy to test for loop termination, or to determine
++        @ whether we need an odd number of half-iterations of the loop.
++        subs    r1, r1, #48                     @ r1 = count - 48 from here on
++        bmi     90f                             @ under 48 bytes: return the count unchanged
++
++        @ Set up useful constants
++        vmov.i32        q0, #0x3000000          @ bits 24-25 of every 32-bit lane
++        vmov.i32        q1, #0x30000            @ bits 16-17 of every 32-bit lane
++
++        tst             r1, #16                 @ bit 4 of the biased count selects the loop entry point
++        bne             1f
++
++          vld1.8          {q8, q9}, [r0]!       @ bit 4 clear: preload 32 bytes, then join the loop at 3:
++          vbic            q12, q8, q0           @ clear bits 24-25 of each lane
++          vext.8          q13, q8, q9, #1       @ the 16 bytes starting 1 byte into q8:q9
++          vext.8          q14, q8, q9, #2       @ ... 2 bytes in
++          vext.8          q15, q8, q9, #3       @ ... 3 bytes in
++          veor            q12, q12, q1          @ flip bits 16-17
++          vbic            q13, q13, q0
++          vbic            q14, q14, q0
++          vbic            q15, q15, q0
++          vceq.i32        q12, q12, #0          @ lane = all ones iff (lane & ~0x3000000) == 0x30000
++          veor            q13, q13, q1
++          veor            q14, q14, q1
++          veor            q15, q15, q1
++          vceq.i32        q13, q13, #0
++          vceq.i32        q14, q14, #0
++          vceq.i32        q15, q15, #0
++          add             r1, r1, #16           @ this entry handles one block, not two, before the first "subs #32"
++          b               3f
++
++1:      vld1.8          {q10, q11}, [r0]!       @ bit 4 set: preload 32 bytes, then fall into the loop at 2:
++        vbic            q12, q10, q0            @ same four-offset match test as above, applied to q10
++        vext.8          q13, q10, q11, #1
++        vext.8          q14, q10, q11, #2
++        vext.8          q15, q10, q11, #3
++        veor            q12, q12, q1
++        vbic            q13, q13, q0
++        vbic            q14, q14, q0
++        vbic            q15, q15, q0
++        vceq.i32        q12, q12, #0
++        veor            q13, q13, q1
++        veor            q14, q14, q1
++        veor            q15, q15, q1
++        vceq.i32        q13, q13, #0
++        vceq.i32        q14, q14, #0
++        vceq.i32        q15, q15, #0
++        @ Drop through...
++2:        vmov            q8, q11
++          vld1.8          {q9}, [r0]!           @ next 16 source bytes (look-ahead for q8's windows)
++        vorr            q13, q12, q13           @ merge the four match masks computed for q10
++        vorr            q15, q14, q15
++          vbic            q12, q8, q0           @ (interleaved) start the same test on q8
++        vorr            q3, q13, q15
++          vext.8          q13, q8, q9, #1
++          vext.8          q14, q8, q9, #2
++          vext.8          q15, q8, q9, #3
++          veor            q12, q12, q1
++        vorr            d6, d6, d7              @ fold the two halves of q3 together
++          vbic            q13, q13, q0
++          vbic            q14, q14, q0
++          vbic            q15, q15, q0
++          vceq.i32        q12, q12, #0
++        vmov            r3, r12, d6             @ move the folded mask to core registers
++          veor            q13, q13, q1
++          veor            q14, q14, q1
++          veor            q15, q15, q1
++          vceq.i32        q13, q13, #0
++          vceq.i32        q14, q14, #0
++          vceq.i32        q15, q15, #0
++        orrs            r3, r3, r12             @ non-zero iff some window starting in q10 matched
++        bne             90f                     @ match: return without storing q10
++        vst1.64         {q10}, [r2]!            @ no match: copy q10 out
++3:          vmov            q10, q9
++            vld1.8          {q11}, [r0]!        @ next 16 source bytes (look-ahead for q10's windows)
++          vorr            q13, q12, q13         @ merge the four match masks computed for q8
++          vorr            q15, q14, q15
++            vbic            q12, q10, q0        @ (interleaved) start the same test on q10
++          vorr            q3, q13, q15
++            vext.8          q13, q10, q11, #1
++            vext.8          q14, q10, q11, #2
++            vext.8          q15, q10, q11, #3
++            veor            q12, q12, q1
++          vorr            d6, d6, d7
++            vbic            q13, q13, q0
++            vbic            q14, q14, q0
++            vbic            q15, q15, q0
++            vceq.i32        q12, q12, #0
++          vmov            r3, r12, d6
++            veor            q13, q13, q1
++            veor            q14, q14, q1
++            veor            q15, q15, q1
++            vceq.i32        q13, q13, #0
++            vceq.i32        q14, q14, #0
++            vceq.i32        q15, q15, #0
++          orrs            r3, r3, r12           @ non-zero iff some window starting in q8 matched
++          bne             91f                   @ match: return without storing q8
++          vst1.64         {q8}, [r2]!           @ no match: copy q8 out
++        subs            r1, r1, #32             @ one full iteration covers two 16-byte blocks
++        bpl             2b
++
++90:     add             r0, r1, #48             @ remove the entry bias to form the return value
++        bx              lr
++
++91:     sub             r1, r1, #16             @ either q10 was stored this iteration or r1 was bumped by 16 on entry; both need 16 taken off
++        b               90b
++endfunc
 --- a/libavcodec/avcodec.h
 +++ b/libavcodec/avcodec.h
 @@ -2567,6 +2567,17 @@ typedef struct AVHWAccel {
@@ -15773,6 +18514,264 @@
 +};
 +
 +#endif
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v3.h
+@@ -0,0 +1,255 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,	/* = 0 (first enumerator, no explicit value) */
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,	/* = 1 */
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,		/* = 0 (first enumerator, no explicit value) */
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,	/* = 1 */
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B	0
++#define V4L2_HEVC_SLICE_TYPE_P	1
++#define V4L2_HEVC_SLICE_TYPE_I	2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++	__u16	pic_width_in_luma_samples;
++	__u16	pic_height_in_luma_samples;
++	__u8	bit_depth_luma_minus8;
++	__u8	bit_depth_chroma_minus8;
++	__u8	log2_max_pic_order_cnt_lsb_minus4;
++	__u8	sps_max_dec_pic_buffering_minus1;
++	__u8	sps_max_num_reorder_pics;
++	__u8	sps_max_latency_increase_plus1;
++	__u8	log2_min_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_luma_coding_block_size;
++	__u8	log2_min_luma_transform_block_size_minus2;
++	__u8	log2_diff_max_min_luma_transform_block_size;
++	__u8	max_transform_hierarchy_depth_inter;
++	__u8	max_transform_hierarchy_depth_intra;
++	__u8	pcm_sample_bit_depth_luma_minus1;
++	__u8	pcm_sample_bit_depth_chroma_minus1;
++	__u8	log2_min_pcm_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
++	__u8	num_short_term_ref_pic_sets;
++	__u8	num_long_term_ref_pics_sps;
++	__u8	chroma_format_idc;
++	__u8	sps_max_sub_layers_minus1;
++
++	__u64	flags;	/* presumably a mask of the V4L2_HEVC_SPS_FLAG_* bits defined above - confirm against the driver */
++};	/* NOTE(review): two __u16 + twenty __u8 = 24 bytes precede flags, so no padding is needed if __uN have their usual widths */
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
++
++struct v4l2_ctrl_hevc_pps {
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++	__u8	num_extra_slice_header_bits;
++	__u8	num_ref_idx_l0_default_active_minus1;
++	__u8	num_ref_idx_l1_default_active_minus1;
++	__s8	init_qp_minus26;
++	__u8	diff_cu_qp_delta_depth;
++	__s8	pps_cb_qp_offset;
++	__s8	pps_cr_qp_offset;
++	__u8	num_tile_columns_minus1;
++	__u8	num_tile_rows_minus1;
++	__u8	column_width_minus1[20];
++	__u8	row_height_minus1[22];
++	__s8	pps_beta_offset_div2;
++	__s8	pps_tc_offset_div2;
++	__u8	log2_parallel_merge_level_minus2;
++
++	__u8	padding[4];	/* NOTE(review): 54 bytes precede this, so it ends at byte 58 - not a multiple of 8; flags below likely relies on implicit compiler padding. Confirm the layout matches the kernel's. */
++	__u64	flags;	/* presumably a mask of the V4L2_HEVC_PPS_FLAG_* bits defined above - confirm against the driver */
++};
++
++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
++
++struct v4l2_hevc_dpb_entry {
++	__u64	timestamp;
++	__u8	flags;	/* V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE (0x01) is the only DPB entry flag defined in this header */
++	__u8	field_pic;
++	__u16	pic_order_cnt[2];
++	__u8	padding[2];	/* 8 + 1 + 1 + 2*2 + 2 = 16 bytes in total, assuming the usual __uN widths */
++};
++
++struct v4l2_hevc_pred_weight_table {
++	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];	/* every table here has V4L2_HEVC_DPB_ENTRIES_NUM_MAX (16) rows */
++	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];	/* [2]: presumably one entry per chroma component - confirm */
++	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];	/* the l1 tables repeat the l0 shapes */
++	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__u8	padding[6];	/* 192 bytes of tables precede this; with the two fields below the struct totals 200 bytes (assuming 1-byte __s8/__u8) */
++
++	__u8	luma_log2_weight_denom;
++	__s8	delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++	__u32	bit_size;
++	__u32	data_bit_offset;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u32	slice_segment_addr;
++	__u32	num_entry_point_offsets;	/* presumably the number of valid entries in entry_point_offset_minus1[] below - confirm */
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++	__u8	nal_unit_type;
++	__u8	nuh_temporal_id_plus1;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u8	slice_type;	/* presumably one of V4L2_HEVC_SLICE_TYPE_B/P/I (0/1/2) defined above - confirm */
++	__u8	colour_plane_id;
++	__u16	slice_pic_order_cnt;
++	__u8	num_ref_idx_l0_active_minus1;
++	__u8	num_ref_idx_l1_active_minus1;
++	__u8	collocated_ref_idx;
++	__u8	five_minus_max_num_merge_cand;
++	__s8	slice_qp_delta;
++	__s8	slice_cb_qp_offset;
++	__s8	slice_cr_qp_offset;
++	__s8	slice_act_y_qp_offset;
++	__s8	slice_act_cb_qp_offset;
++	__s8	slice_act_cr_qp_offset;
++	__s8	slice_beta_offset_div2;
++	__s8	slice_tc_offset_div2;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++	__u8	pic_struct;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++	__u8	padding[5];	/* 67 bytes precede this; 5 more reach offset 72, so the __u32 array below is naturally aligned (assuming usual __uN widths) */
++
++	__u32	entry_point_offset_minus1[256];	/* fixed capacity of 256 entries */
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++	struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++	__u64	flags;	/* presumably a mask of the V4L2_HEVC_SLICE_PARAMS_FLAG_* bits defined above - confirm against the driver */
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
++
++struct v4l2_ctrl_hevc_decode_params {
++	__s32	pic_order_cnt_val;
++	__u8	num_active_dpb_entries;	/* presumably the number of valid entries in dpb[] - confirm */
++	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];	/* NOTE(review): only 5 bytes precede this and each entry holds a __u64, so the compiler likely pads implicitly here - confirm against the kernel layout */
++	__u8	num_poc_st_curr_before;
++	__u8	num_poc_st_curr_after;
++	__u8	num_poc_lt_curr;
++	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];	/* the three poc_* arrays each hold V4L2_HEVC_DPB_ENTRIES_NUM_MAX (16) bytes */
++	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u64	flags;	/* presumably a mask of V4L2_HEVC_DECODE_PARAM_FLAG_* (0x1/0x2/0x4) - confirm; 51 one-byte fields precede it with no explicit padding */
++};
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++	__u8	scaling_list_4x4[6][16];	/* 6 lists of 16 coefficients */
++	__u8	scaling_list_8x8[6][64];	/* 6 lists of 64 coefficients */
++	__u8	scaling_list_16x16[6][64];	/* 64 coefficients per list, same as 8x8 */
++	__u8	scaling_list_32x32[2][64];	/* only 2 lists at this size */
++	__u8	scaling_list_dc_coef_16x16[6];	/* one value per 16x16 list */
++	__u8	scaling_list_dc_coef_32x32[2];	/* one value per 32x32 list */
++};	/* all members are __u8: 96 + 384 + 384 + 128 + 6 + 2 = 1000 bytes, assuming 1-byte __u8 */
++
++/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
++#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
++/*
++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
++ * the amount of data (in bits) to skip in the
++ * slice segment header.
++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
++ * to before syntax element "slice_temporal_mvp_enabled_flag".
++ * If IDR, the skipped bits are just "pic_output_flag"
++ * (separate_colour_plane_flag is not supported).
++ */
++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
++
++#endif
 --- a/libavcodec/hevc_parser.c
 +++ b/libavcodec/hevc_parser.c
 @@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCod
@@ -15795,6 +18794,63 @@
      if (ps->vps->vps_timing_info_present_flag) {
          num = ps->vps->vps_num_units_in_tick;
          den = ps->vps->vps_time_scale;
+--- a/libavcodec/hevc_refs.c
++++ b/libavcodec/hevc_refs.c
+@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContex
+         if (!frame->rpl_buf)
+             goto fail;
+ 
+-        frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
+-        if (!frame->tab_mvf_buf)
+-            goto fail;
+-        frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++        if (s->tab_mvf_pool) {
++            frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
++            if (!frame->tab_mvf_buf)
++                goto fail;
++            frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++        }
+ 
+-        frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
+-        if (!frame->rpl_tab_buf)
+-            goto fail;
+-        frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
+-        frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
+-        for (j = 0; j < frame->ctb_count; j++)
+-            frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++        if (s->rpl_tab_pool) {
++            frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
++            if (!frame->rpl_tab_buf)
++                goto fail;
++            frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
++            frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
++            for (j = 0; j < frame->ctb_count; j++)
++                frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++        }
+ 
+         frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
+         frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
+@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s
+     int ctb_count    = frame->ctb_count;
+     int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+     int i;
++    RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
+ 
+     if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
+         return AVERROR_INVALIDDATA;
+ 
+-    for (i = ctb_addr_ts; i < ctb_count; i++)
+-        frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
++    if (frame->rpl_tab) {
++        for (i = ctb_addr_ts; i < ctb_count; i++)
++            frame->rpl_tab[i] = tab;
++    }
+ 
+-    frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
++    frame->refPicList = tab->refPicList;
+ 
+     return 0;
+ }
 --- a/libavcodec/hevcdec.c
 +++ b/libavcodec/hevcdec.c
 @@ -332,6 +332,19 @@ static void export_stream_params(HEVCCon
@@ -15863,7 +18919,43 @@
          break;
      case AV_PIX_FMT_YUV444P:
  #if CONFIG_HEVC_VDPAU_HWACCEL
-@@ -3230,7 +3258,14 @@ static int hevc_decode_frame(AVCodecCont
+@@ -459,6 +487,16 @@ static int set_sps(HEVCContext *s, const
+     if (!sps)
+         return 0;
+ 
++    // If hwaccel then we don't need all the s/w decode helper arrays
++    if (s->avctx->hwaccel) {
++        export_stream_params(s, sps);
++
++        s->avctx->pix_fmt = pix_fmt;
++        s->ps.sps = sps;
++        s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++        return 0;
++    }
++
+     ret = pic_arrays_init(s, sps);
+     if (ret < 0)
+         goto fail;
+@@ -2809,11 +2847,13 @@ static int hevc_frame_start(HEVCContext
+                            ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
+     int ret;
+ 
+-    memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+-    memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
+-    memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
+-    memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+-    memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++    if (s->horizontal_bs) {
++        memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
++        memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
++        memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
++        memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
++        memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++    }
+ 
+     s->is_decoded        = 0;
+     s->first_nal_type    = s->nal_unit_type;
+@@ -3230,7 +3270,14 @@ static int hevc_decode_frame(AVCodecCont
      s->ref = NULL;
      ret    = decode_nal_units(s, avpkt->data, avpkt->size);
      if (ret < 0)
@@ -15878,7 +18970,35 @@
  
      if (avctx->hwaccel) {
          if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
-@@ -3585,6 +3620,15 @@ AVCodec ff_hevc_decoder = {
+@@ -3273,15 +3320,19 @@ static int hevc_ref_frame(HEVCContext *s
+     if (ret < 0)
+         return ret;
+ 
+-    dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
+-    if (!dst->tab_mvf_buf)
+-        goto fail;
+-    dst->tab_mvf = src->tab_mvf;
++    if (src->tab_mvf_buf) {
++        dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
++        if (!dst->tab_mvf_buf)
++            goto fail;
++        dst->tab_mvf = src->tab_mvf;
++    }
+ 
+-    dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
+-    if (!dst->rpl_tab_buf)
+-        goto fail;
+-    dst->rpl_tab = src->rpl_tab;
++    if (src->rpl_tab_buf) {
++        dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
++        if (!dst->rpl_tab_buf)
++            goto fail;
++        dst->rpl_tab = src->rpl_tab;
++    }
+ 
+     dst->rpl_buf = av_buffer_ref(src->rpl_buf);
+     if (!dst->rpl_buf)
+@@ -3585,6 +3636,15 @@ AVCodec ff_hevc_decoder = {
  #if CONFIG_HEVC_NVDEC_HWACCEL
                                 HWACCEL_NVDEC(hevc),
  #endif
@@ -46121,9 +49241,11 @@
  #include <linux/videodev2.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>
-@@ -30,12 +31,14 @@
+@@ -29,57 +30,82 @@
+ #include <poll.h>
  #include "libavcodec/avcodec.h"
  #include "libavcodec/internal.h"
++#include "libavutil/avassert.h"
  #include "libavutil/pixdesc.h"
 +#include "libavutil/hwcontext.h"
  #include "v4l2_context.h"
@@ -46135,16 +49257,29 @@
 -static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
 +static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
  
- static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
+-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
++static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
  {
-@@ -52,34 +55,44 @@ static inline AVCodecContext *logger(V4L
- static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
+     return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
+         container_of(buf->context, V4L2m2mContext, output) :
+         container_of(buf->context, V4L2m2mContext, capture);
+ }
+ 
+-static inline AVCodecContext *logger(V4L2Buffer *buf)
++static inline AVCodecContext *logger(const V4L2Buffer * const buf)
  {
-     V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+     return buf_to_m2mctx(buf)->avctx;
+ }
+ 
+-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
++static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
+ {
+-    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
 -
 -    if (s->avctx->pkt_timebase.num)
 -        return s->avctx->pkt_timebase;
 -    return s->avctx->time_base;
++    const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
 +    const AVRational tb = s->avctx->pkt_timebase.num ?
 +        s->avctx->pkt_timebase :
 +        s->avctx->time_base;
@@ -46152,40 +49287,53 @@
  }
  
 -static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
-+static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale)
++static inline struct timeval tv_from_int(const int64_t t)
  {
 -    int64_t v4l2_pts;
--
++    return (struct timeval){
++        .tv_usec = t % USEC_PER_SEC,
++        .tv_sec  = t / USEC_PER_SEC
++    };
++}
+ 
 -    if (pts == AV_NOPTS_VALUE)
 -        pts = 0;
--
++static inline int64_t int_from_tv(const struct timeval t)
++{
++    return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec;
++}
+ 
++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
++{
      /* convert pts to v4l2 timebase */
 -    v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
+-    out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
+-    out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
 +    const int64_t v4l2_pts =
-+        no_rescale ? pts :
 +        pts == AV_NOPTS_VALUE ? 0 :
 +            av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
-     out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
-     out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
++    out->buf.timestamp = tv_from_int(v4l2_pts);
  }
  
 -static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
-+static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale)
++static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
  {
 -    int64_t v4l2_pts;
 -
++    const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
++    return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
++#if 0
      /* convert pts back to encoder timebase */
 -    v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
-+    const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
-                         avbuf->buf.timestamp.tv_usec;
- 
--    return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
+-                        avbuf->buf.timestamp.tv_usec;
 +    return
-+        no_rescale ? v4l2_pts :
++        avbuf->context->no_pts_rescale ? v4l2_pts :
 +        v4l2_pts == 0 ? AV_NOPTS_VALUE :
 +            av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
++#endif
 +}
-+
+ 
+-    return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
 +static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
 +{
 +    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
@@ -46198,7 +49346,7 @@
  }
  
  static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
-@@ -116,6 +129,105 @@ static enum AVColorPrimaries v4l2_get_co
+@@ -116,6 +142,105 @@ static enum AVColorPrimaries v4l2_get_co
      return AVCOL_PRI_UNSPECIFIED;
  }
  
@@ -46304,7 +49452,7 @@
  static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
  {
      enum v4l2_quantization qt;
-@@ -134,6 +246,20 @@ static enum AVColorRange v4l2_get_color_
+@@ -134,6 +259,20 @@ static enum AVColorRange v4l2_get_color_
       return AVCOL_RANGE_UNSPECIFIED;
  }
  
@@ -46325,7 +49473,7 @@
  static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
  {
      enum v4l2_ycbcr_encoding ycbcr;
-@@ -210,73 +336,165 @@ static enum AVColorTransferCharacteristi
+@@ -210,73 +349,165 @@ static enum AVColorTransferCharacteristi
      return AVCOL_TRC_UNSPECIFIED;
  }
  
@@ -46472,7 +49620,7 @@
 -    ret = v4l2_buf_increase_ref(in);
 -    if (ret)
 -        av_buffer_unref(buf);
-+        avbuf->status = V4L2BUF_AVAILABLE;
++        ff_v4l2_buffer_set_avail(avbuf);
  
 -    return ret;
 +        if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
@@ -46494,8 +49642,9 @@
 +
 +    ff_weak_link_unlock(avbuf->context_wl);
 +    av_buffer_unref(&bufref);
-+}
-+
+ }
+ 
+-static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref)
 +static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
 +{
 +    struct v4l2_exportbuffer expbuf;
@@ -46526,9 +49675,8 @@
 +    }
 +
 +    return 0;
- }
- 
--static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref)
++}
++
 +static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
  {
      unsigned int bytesused, length;
@@ -46536,7 +49684,7 @@
  
      if (plane >= out->num_planes)
          return AVERROR(EINVAL);
-@@ -284,32 +502,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer
+@@ -284,32 +515,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer
      length = out->plane_info[plane].length;
      bytesused = FFMIN(size+offset, length);
  
@@ -46591,7 +49739,7 @@
 +    frame->buf[0] = wrap_avbuf(avbuf);
 +    if (frame->buf[0] == NULL)
 +        return AVERROR(ENOMEM);
- 
++
 +    if (buf_to_m2mctx(avbuf)->output_drm) {
 +        /* 1. get references to the actual data */
 +        frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
@@ -46599,7 +49747,7 @@
 +        frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
 +        return 0;
 +    }
-+
+ 
 +
 +    /* 1. get references to the actual data */
 +    for (i = 0; i < avbuf->num_planes; i++) {
@@ -46609,7 +49757,7 @@
      }
  
      /* fixup special cases */
-@@ -318,17 +561,17 @@ static int v4l2_buffer_buf_to_swframe(AV
+@@ -318,17 +574,17 @@ static int v4l2_buffer_buf_to_swframe(AV
      case AV_PIX_FMT_NV21:
          if (avbuf->num_planes > 1)
              break;
@@ -46633,7 +49781,7 @@
          break;
  
      default:
-@@ -338,68 +581,95 @@ static int v4l2_buffer_buf_to_swframe(AV
+@@ -338,68 +594,127 @@ static int v4l2_buffer_buf_to_swframe(AV
      return 0;
  }
  
@@ -46655,6 +49803,38 @@
 +{
 +    return i != 0  && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA));
 +}
++
++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
++{
++    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++    if (frame->format != AV_PIX_FMT_DRM_PRIME || !src)
++        return AVERROR(EINVAL);
++
++    av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
++        // Only currently cope with single buffer types
++        if (out->buf.length != 1)
++            return AVERROR_PATCHWELCOME;
++        if (src->nb_objects != 1)
++            return AVERROR(EINVAL);
++
++        out->planes[0].m.fd = src->objects[0].fd;
++    }
++    else {
++        if (src->nb_objects != 1)
++            return AVERROR(EINVAL);
++
++        out->buf.m.fd      = src->objects[0].fd;
++    }
++
++    // No need to copy src AVDescriptor and if we did then we may confuse
++    // fd close on free
++    out->ref_buf = av_buffer_ref(frame->buf[0]);
++
++    return 0;
++}
 +
  static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
  {
@@ -46783,37 +49963,55 @@
      return 0;
  }
  
-@@ -411,14 +681,22 @@ static int v4l2_buffer_swframe_to_buf(co
+@@ -409,16 +724,31 @@ static int v4l2_buffer_swframe_to_buf(co
+  *
+  ******************************************************************************/
  
- int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
  {
 -    v4l2_set_pts(out, frame->pts);
-+    out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME);
+-
+-    return v4l2_buffer_swframe_to_buf(frame, out);
++    out->buf.flags = frame->key_frame ?
++        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
 +    // Beware that colour info is held in format rather than the actual
 +    // v4l2 buffer struct so this may not be as useful as you might hope
 +    v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
 +    v4l2_set_color_range(out, frame->color_range);
 +    // PTS & interlace are buffer vars
-+    v4l2_set_pts(out, frame->pts, 0);
++    if (track_ts)
++        out->buf.timestamp = tv_from_int(track_ts);
++    else
++        v4l2_set_pts(out, frame->pts);
 +    v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
- 
-     return v4l2_buffer_swframe_to_buf(frame, out);
++
++    return frame->format == AV_PIX_FMT_DRM_PRIME ?
++        v4l2_buffer_primeframe_to_buf(frame, out) :
++        v4l2_buffer_swframe_to_buf(frame, out);
  }
  
--int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
-+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts)
+ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
  {
      int ret;
 +    V4L2Context * const ctx = avbuf->context;
  
      av_frame_unref(frame);
  
-@@ -433,13 +711,24 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
+@@ -429,17 +759,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
+ 
+     /* 2. get frame information */
+     frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
++    frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
++        (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
++        (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
++            AV_PICTURE_TYPE_NONE;
+     frame->color_primaries = v4l2_get_color_primaries(avbuf);
      frame->colorspace = v4l2_get_color_space(avbuf);
      frame->color_range = v4l2_get_color_range(avbuf);
      frame->color_trc = v4l2_get_color_trc(avbuf);
--    frame->pts = v4l2_get_pts(avbuf);
-+    frame->pts = v4l2_get_pts(avbuf, no_rescale_pts);
+     frame->pts = v4l2_get_pts(avbuf);
      frame->pkt_dts = AV_NOPTS_VALUE;
 +    frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
 +    frame->top_field_first = v4l2_buf_is_top_first(avbuf);
@@ -46837,13 +50035,12 @@
  
      /* 3. report errors upstream */
      if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
-@@ -452,15 +741,16 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
+@@ -452,15 +797,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
  
  int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
  {
 -    int ret;
-+    av_log(logger(avbuf), AV_LOG_INFO, "%s\n", __func__);
- 
+-
      av_packet_unref(pkt);
 -    ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
 -    if (ret)
@@ -46856,22 +50053,18 @@
      pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
 -    pkt->data = pkt->buf->data;
 +    pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
++    pkt->flags = 0;
  
      if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
          pkt->flags |= AV_PKT_FLAG_KEY;
-@@ -470,36 +760,89 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
-         pkt->flags |= AV_PKT_FLAG_CORRUPT;
-     }
- 
--    pkt->dts = pkt->pts = v4l2_get_pts(avbuf);
-+    pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0);
- 
+@@ -475,31 +820,91 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
      return 0;
  }
  
 -int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
-+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
-+                                    const void *extdata, size_t extlen, int no_rescale_pts)
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++                                    const void *extdata, size_t extlen,
++                                    const int64_t timestamp)
  {
      int ret;
  
@@ -46888,16 +50081,21 @@
          return ret;
  
 -    v4l2_set_pts(out, pkt->pts);
-+    v4l2_set_pts(out, pkt->pts, no_rescale_pts);
++    if (timestamp)
++        out->buf.timestamp = tv_from_int(timestamp);
++    else
++        v4l2_set_pts(out, pkt->pts);
++
++    out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
++        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
  
-     if (pkt->flags & AV_PKT_FLAG_KEY)
-         out->flags = V4L2_BUF_FLAG_KEYFRAME;
+-    if (pkt->flags & AV_PKT_FLAG_KEY)
+-        out->flags = V4L2_BUF_FLAG_KEYFRAME;
++    return ret;
++}
  
 -    return 0;
-+    return ret;
- }
- 
--int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
 +int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
 +{
 +    return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
@@ -46920,13 +50118,16 @@
 +            close(avbuf->drm_frame.objects[i].fd);
 +    }
 +
++    av_buffer_unref(&avbuf->ref_buf);
++
 +    ff_weak_link_unref(&avbuf->context_wl);
 +
 +    av_free(avbuf);
-+}
+ }
+ 
+-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
 +
-+
-+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx)
++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
  {
 -    V4L2Context *ctx = avbuf->context;
      int ret, i;
@@ -46936,15 +50137,16 @@
 +    *pbufref = NULL;
 +    if (avbuf == NULL)
 +        return AVERROR(ENOMEM);
- 
++
 +    bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
 +    if (bufref == NULL) {
 +        av_free(avbuf);
 +        return AVERROR(ENOMEM);
 +    }
-+
+ 
+-    avbuf->buf.memory = V4L2_MEMORY_MMAP;
 +    avbuf->context = ctx;
-     avbuf->buf.memory = V4L2_MEMORY_MMAP;
++    avbuf->buf.memory = mem;
      avbuf->buf.type = ctx->type;
      avbuf->buf.index = index;
  
@@ -46957,7 +50159,7 @@
      if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
          avbuf->buf.length = VIDEO_MAX_PLANES;
          avbuf->buf.m.planes = avbuf->planes;
-@@ -507,7 +850,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+@@ -507,7 +912,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
  
      ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
      if (ret < 0)
@@ -46966,7 +50168,16 @@
  
      if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
          avbuf->num_planes = 0;
-@@ -527,25 +870,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+@@ -520,6 +925,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+         avbuf->num_planes = 1;
+ 
+     for (i = 0; i < avbuf->num_planes; i++) {
++        const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
++            (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
+ 
+         avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+             ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
+@@ -527,25 +934,29 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
  
          if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
              avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
@@ -46974,24 +50185,20 @@
 -                                           PROT_READ | PROT_WRITE, MAP_SHARED,
 -                                           buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
 +
-+            if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
-+                !buf_to_m2mctx(avbuf)->output_drm) {
++            if (want_mmap)
 +                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
 +                                               PROT_READ | PROT_WRITE, MAP_SHARED,
 +                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
-+            }
          } else {
              avbuf->plane_info[i].length = avbuf->buf.length;
 -            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
 -                                          PROT_READ | PROT_WRITE, MAP_SHARED,
 -                                          buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
 +
-+            if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
-+                !buf_to_m2mctx(avbuf)->output_drm) {
++            if (want_mmap)
 +                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
 +                                               PROT_READ | PROT_WRITE, MAP_SHARED,
 +                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
-+            }
          }
  
 -        if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
@@ -47011,7 +50218,7 @@
      if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
          avbuf->buf.m.planes = avbuf->planes;
          avbuf->buf.length   = avbuf->num_planes;
-@@ -555,7 +906,20 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+@@ -555,20 +966,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
          avbuf->buf.length    = avbuf->planes[0].length;
      }
  
@@ -47033,17 +50240,18 @@
  }
  
  int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
-@@ -564,9 +928,27 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* a
- 
-     avbuf->buf.flags = avbuf->flags;
+ {
+     int ret;
++    int qc;
  
+-    avbuf->buf.flags = avbuf->flags;
 +    if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
 +        av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
 +               avbuf->context->name, avbuf->buf.index,
 +               avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
 +               avbuf->context->q_count);
 +    }
-+
+ 
      ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
 -    if (ret < 0)
 -        return AVERROR(errno);
@@ -47054,18 +50262,21 @@
 +               err, strerror(err));
 +        return AVERROR(err);
 +    }
+ 
++    // Lock not wanted - if called from buffer free then lock already obtained
++    qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
+     avbuf->status = V4L2BUF_IN_DRIVER;
++    pthread_cond_broadcast(&avbuf->context->cond);
 +
-+    ++avbuf->context->q_count;
 +    av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
 +           avbuf->context->name, avbuf->buf.index,
-+           avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
-+           avbuf->context->q_count);
- 
-     avbuf->status = V4L2BUF_IN_DRIVER;
++           avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
  
+     return 0;
+ }
 --- a/libavcodec/v4l2_buffers.h
 +++ b/libavcodec/v4l2_buffers.h
-@@ -27,25 +27,34 @@
+@@ -27,25 +27,38 @@
  #include <stdatomic.h>
  #include <linux/videodev2.h>
  
@@ -47102,42 +50313,63 @@
 -    atomic_uint context_refcount;
 +    /* DRM descriptor */
 +    AVDRMFrameDescriptor drm_frame;
++    /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
++     * are done
++     */
++    AVBufferRef * ref_buf;
  
      /* keep track of the mmap address and mmap length */
      struct V4L2Plane_info {
-@@ -70,11 +79,12 @@ typedef struct V4L2Buffer {
-  *
-  * @param[in] frame The AVFRame to push the information to
-  * @param[in] buf The V4L2Buffer to get the information from
-+ * @param[in] no_rescale_pts If non-zero do not rescale PTS
-  *
-  * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect,
-  * AVERROR(ENOMEM) if the AVBufferRef can't be created.
-  */
--int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf);
-+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts);
+@@ -60,7 +73,6 @@ typedef struct V4L2Buffer {
+     struct v4l2_buffer buf;
+     struct v4l2_plane planes[VIDEO_MAX_PLANES];
  
- /**
-  * Extracts the data from a V4L2Buffer to an AVPacket
-@@ -98,6 +108,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
+-    int flags;
+     enum V4L2Buffer_status status;
+ 
+ } V4L2Buffer;
+@@ -98,6 +110,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
   */
  int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
  
-+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
-+                                    const void *extdata, size_t extlen, int no_rescale_pts);
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++                                    const void *extdata, size_t extlen,
++                                    const int64_t timestamp);
 +
  /**
   * Extracts the data from an AVFrame to a V4L2Buffer
   *
-@@ -116,7 +129,7 @@ int ff_v4l2_buffer_avframe_to_buf(const
+@@ -106,7 +122,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AV
+  *
+  * @returns 0 in case of success, a negative AVERROR code otherwise
+  */
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
+ 
+ /**
+  * Initializes a V4L2Buffer
+@@ -116,7 +132,7 @@ int ff_v4l2_buffer_avframe_to_buf(const
   *
   * @returns 0 in case of success, a negative AVERROR code otherwise
   */
 -int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
-+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx);
++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
  
  /**
   * Enqueues a V4L2Buffer
+@@ -127,5 +143,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+  */
+ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
+ 
++static inline void
++ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf)
++{
++    avbuf->status = V4L2BUF_AVAILABLE;
++    av_buffer_unref(&avbuf->ref_buf);
++}
++
+ 
+ #endif // AVCODEC_V4L2_BUFFERS_H
 --- a/libavcodec/v4l2_context.c
 +++ b/libavcodec/v4l2_context.c
 @@ -27,11 +27,13 @@
@@ -47154,35 +50386,227 @@
  
  struct v4l2_format_update {
      uint32_t v4l2_fmt;
-@@ -53,16 +55,6 @@ static inline AVCodecContext *logger(V4L
-     return ctx_to_m2mctx(ctx)->avctx;
+@@ -41,26 +43,168 @@ struct v4l2_format_update {
+     int update_avfmt;
+ };
+ 
+-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
++
++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
+ {
+-    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
+-        container_of(ctx, V4L2m2mContext, output) :
+-        container_of(ctx, V4L2m2mContext, capture);
++    return (int64_t)n;
+ }
+ 
+-static inline AVCodecContext *logger(V4L2Context *ctx)
++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
+ {
+-    return ctx_to_m2mctx(ctx)->avctx;
++    return (unsigned int)pts;
  }
  
 -static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
--{
--    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
--}
--
--static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
--{
--    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
--}
--
- static AVRational v4l2_get_sar(V4L2Context *ctx)
++// FFmpeg requires us to propagate a number of vars from the coded pkt into
++// the decoded frame. The only thing that tracks like that in V4L2 stateful
++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
++// guarantees about PTS being unique or specified for every frame so replace
++// the supplied PTS with a simple incrementing number and keep a circular
++// buffer of all the things we want preserved (including the original PTS)
++// indexed by the tracking no.
++static int64_t
++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
  {
-     struct AVRational sar = { 0, 1 };
-@@ -94,8 +86,8 @@ static inline unsigned int v4l2_resoluti
+-    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++    int64_t track_pts;
++
++    // Avoid 0
++    if (++x->track_no == 0)
++        x->track_no = 1;
++
++    track_pts = track_to_pts(avctx, x->track_no);
++
++    av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
++    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++        .discard          = 0,
++        .pending          = 1,
++        .pkt_size         = avpkt->size,
++        .pts              = avpkt->pts,
++        .dts              = avpkt->dts,
++        .reordered_opaque = avctx->reordered_opaque,
++        .pkt_pos          = avpkt->pos,
++        .pkt_duration     = avpkt->duration,
++        .track_pts        = track_pts
++    };
++    return track_pts;
+ }
+ 
+-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
++static int64_t
++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
+ {
+-    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++    int64_t track_pts;
++
++    // Avoid 0
++    if (++x->track_no == 0)
++        x->track_no = 1;
++
++    track_pts = track_to_pts(avctx, x->track_no);
++
++    av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
++    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++        .discard          = 0,
++        .pending          = 1,
++        .pkt_size         = 0,
++        .pts              = frame->pts,
++        .dts              = AV_NOPTS_VALUE,
++        .reordered_opaque = frame->reordered_opaque,
++        .pkt_pos          = frame->pkt_pos,
++        .pkt_duration     = frame->pkt_duration,
++        .track_pts        = track_pts
++    };
++    return track_pts;
++}
++
++
++// Returns -1 if we should discard the frame
++static int
++xlat_pts_frame_out(AVCodecContext *const avctx,
++             xlat_track_t * const x,
++             AVFrame *const frame)
++{
++    unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
++    V4L2m2mTrackEl *const t = x->track_els + n;
++    if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
++    {
++        av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
++               "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++        frame->pts              = AV_NOPTS_VALUE;
++        frame->pkt_dts          = AV_NOPTS_VALUE;
++        frame->reordered_opaque = x->last_opaque;
++        frame->pkt_pos          = -1;
++        frame->pkt_duration     = 0;
++        frame->pkt_size         = -1;
++    }
++    else if (!t->discard)
++    {
++        frame->pts              = t->pending ? t->pts : AV_NOPTS_VALUE;
++        frame->pkt_dts          = t->dts;
++        frame->reordered_opaque = t->reordered_opaque;
++        frame->pkt_pos          = t->pkt_pos;
++        frame->pkt_duration     = t->pkt_duration;
++        frame->pkt_size         = t->pkt_size;
++
++        x->last_opaque = x->track_els[n].reordered_opaque;
++        if (frame->pts != AV_NOPTS_VALUE)
++            x->last_pts = frame->pts;
++        t->pending = 0;
++    }
++    else
++    {
++        av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++        return -1;
++    }
++
++    av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++           frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
++    return 0;
++}
++
++// Returns -1 if we should discard the frame
++static int
++xlat_pts_pkt_out(AVCodecContext *const avctx,
++             xlat_track_t * const x,
++             AVPacket *const pkt)
++{
++    unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE;
++    V4L2m2mTrackEl *const t = x->track_els + n;
++    if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts)
++    {
++        av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
++               "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
++        pkt->pts                = AV_NOPTS_VALUE;
++    }
++    else if (!t->discard)
++    {
++        pkt->pts                = t->pending ? t->pts : AV_NOPTS_VALUE;
++
++        x->last_opaque = x->track_els[n].reordered_opaque;
++        if (pkt->pts != AV_NOPTS_VALUE)
++            x->last_pts = pkt->pts;
++        t->pending = 0;
++    }
++    else
++    {
++        av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
++        return -1;
++    }
++
++    // * Would like something much better than this...xlat(offset + out_count)?
++    pkt->dts = pkt->pts;
++    av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++           pkt->pts, t->track_pts, n);
++    return 0;
++}
++
++
++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
++{
++    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
++        container_of(ctx, V4L2m2mContext, output) :
++        container_of(ctx, V4L2m2mContext, capture);
++}
++
++static inline AVCodecContext *logger(const V4L2Context *ctx)
++{
++    return ctx_to_m2mctx(ctx)->avctx;
+ }
+ 
+ static AVRational v4l2_get_sar(V4L2Context *ctx)
+@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Conte
+     return sar;
+ }
+ 
+-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
++static inline int ctx_buffers_alloced(const V4L2Context * const ctx)
++{
++    return ctx->bufrefs != NULL;
++}
++
++// Width/Height changed or we don't have an alloc in the first place?
++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
+ {
+-    struct v4l2_format *fmt1 = &ctx->format;
+-    int ret =  V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+-        fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
+-        fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
+-        :
+-        fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
+-        fmt1->fmt.pix.height != fmt2->fmt.pix.height;
++    const struct v4l2_format *fmt1 = &ctx->format;
++    int ret = !ctx_buffers_alloced(ctx) ||
++        (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
++            fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
++            fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
++            :
++            fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
++            fmt1->fmt.pix.height != fmt2->fmt.pix.height);
+ 
      if (ret)
-         av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
+-        av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
++        av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
              ctx->name,
 -            v4l2_get_width(fmt1), v4l2_get_height(fmt1),
 -            v4l2_get_width(fmt2), v4l2_get_height(fmt2));
++            ctx_buffers_alloced(ctx),
 +            ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
 +            ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
  
      return ret;
  }
-@@ -153,58 +145,67 @@ static inline void v4l2_save_to_context(
+@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(
      }
  }
  
@@ -47227,23 +50651,18 @@
 +static int do_source_change(V4L2m2mContext * const s)
 +{
 +    AVCodecContext *const avctx = s->avctx;
-+
+ 
+-    ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt);
+-    if (ret) {
+-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name);
+-        return 0;
+-    }
 +    int ret;
 +    int reinit;
-+    int full_reinit;
 +    struct v4l2_format cap_fmt = s->capture.format;
-+    struct v4l2_format out_fmt = s->output.format;
 +
-+    s->resize_pending = 0;
 +    s->capture.done = 0;
  
-     ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt);
-     if (ret) {
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name);
-+        av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->output.name);
-         return 0;
-     }
- 
      ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
      if (ret) {
 -        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
@@ -47251,19 +50670,20 @@
          return 0;
      }
  
-     full_reinit = v4l2_resolution_changed(&s->output, &out_fmt);
-     if (full_reinit) {
+-    full_reinit = v4l2_resolution_changed(&s->output, &out_fmt);
+-    if (full_reinit) {
 -        s->output.height = v4l2_get_height(&out_fmt);
 -        s->output.width = v4l2_get_width(&out_fmt);
 -        s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
-+        s->output.height = ff_v4l2_get_format_height(&out_fmt);
-+        s->output.width = ff_v4l2_get_format_width(&out_fmt);
-     }
-+    s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
-+
+-    }
 +    get_default_selection(&s->capture, &s->capture.selection);
++
++    reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
++    if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
++        reinit = 1;
  
-     reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
+-    reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
++    s->capture.format = cap_fmt;
      if (reinit) {
 -        s->capture.height = v4l2_get_height(&cap_fmt);
 -        s->capture.width = v4l2_get_width(&cap_fmt);
@@ -47271,30 +50691,38 @@
 +        s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
 +        s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
      }
-+    s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n",
-+           s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
-+           s->capture.selection.width, s->capture.selection.height,
-+           s->capture.selection.left, s->capture.selection.top);
  
-     if (full_reinit || reinit)
-         s->reinit = 1;
-@@ -212,34 +213,88 @@ static int v4l2_handle_event(V4L2Context
-     if (full_reinit) {
-         ret = ff_v4l2_m2m_codec_full_reinit(s);
-         if (ret) {
+-    if (full_reinit || reinit)
+-        s->reinit = 1;
+-
+-    if (full_reinit) {
+-        ret = ff_v4l2_m2m_codec_full_reinit(s);
+-        if (ret) {
 -            av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n");
-+            av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit failed\n");
-             return AVERROR(EINVAL);
-         }
-         goto reinit_run;
+-            return AVERROR(EINVAL);
+-        }
+-        goto reinit_run;
++    // If we don't support selection (or it is bust) and we obviously have HD then kludge
++    if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
++        (s->capture.height == 1088 && s->capture.width == 1920)) {
++        s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
      }
  
++    s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
++
++    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
++           s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
++           s->capture.width, s->capture.height,
++           s->capture.selection.width, s->capture.selection.height,
++           s->capture.selection.left, s->capture.selection.top, reinit);
++
      if (reinit) {
 -        if (s->avctx)
+-            ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
 +        if (avctx)
-             ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
++            ret = ff_set_dimensions(s->avctx,
++                                    s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
++                                    s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
          if (ret < 0)
 -            av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
 +            av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
@@ -47305,13 +50733,32 @@
 +            av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
              return AVERROR(EINVAL);
          }
++
++        if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
++            s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
++            av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
++                   s->capture.width, s->capture.height,
++                   ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
++            return AVERROR(EINVAL);
++        }
++
++        // Update pixel format - should only actually do something on initial change
++        s->capture.av_pix_fmt =
++            ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
++        if (s->output_drm) {
++            avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
++            avctx->sw_pix_fmt = s->capture.av_pix_fmt;
++        }
++        else
++            avctx->pix_fmt = s->capture.av_pix_fmt;
++
          goto reinit_run;
      }
  
 -    /* dummy event received */
 -    return 0;
 +    /* Buffers are OK so just stream off to ack */
-+    av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__);
++    av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
 +
 +    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
 +    if (ret)
@@ -47324,228 +50771,275 @@
      return 1;
  }
  
-+static int ctx_done(V4L2Context * const ctx)
-+{
-+    int rv = 0;
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-+
-+    ctx->done = 1;
-+
-+    if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type))
-+        rv = do_source_change(s);
-+
-+    return rv;
-+}
-+
-+/**
-+ * handle resolution change event and end of stream event
-+ * returns 1 if reinit was successful, negative if it failed
-+ * returns 0 if reinit was not executed
-+ */
-+static int v4l2_handle_event(V4L2Context *ctx)
-+{
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-+    struct v4l2_event evt = { 0 };
-+    int ret;
-+
-+    ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
-+    if (ret < 0) {
-+        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
-+        return 0;
-+    }
-+
-+    av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type);
-+
-+    if (evt.type == V4L2_EVENT_EOS) {
-+//        ctx->done = 1;
-+        av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name);
-+        return 0;
-+    }
-+
-+    if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
-+        return 0;
-+
-+    s->resize_pending = 1;
-+    if (!ctx->done)
-+        return 0;
-+
-+    return do_source_change(s);
-+}
-+
- static int v4l2_stop_decode(V4L2Context *ctx)
- {
-     struct v4l2_decoder_cmd cmd = {
-@@ -280,8 +335,26 @@ static int v4l2_stop_encode(V4L2Context
+@@ -280,171 +452,277 @@ static int v4l2_stop_encode(V4L2Context
      return 0;
  }
  
-+static int count_in_driver(const V4L2Context * const ctx)
+-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
+-{
+-    struct v4l2_plane planes[VIDEO_MAX_PLANES];
+-    struct v4l2_buffer buf = { 0 };
+-    V4L2Buffer *avbuf;
+-    struct pollfd pfd = {
+-        .events =  POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
+-        .fd = ctx_to_m2mctx(ctx)->fd,
++// DQ a buffer
++// Amalgamates all the various ways there are of signalling EOS/Event to
++// generate a consistent EPIPE.
++//
++// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
++//
++// Returns:
++//  0               Success
++//  AVERROR(EPIPE)  Nothing more to read
++//  AVERROR(ENOSPC) No buffers in Q to put result in
++//  *               AVERROR(..)
++
++ static int
++dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
 +{
-+    int i;
-+    int n = 0;
++    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++    AVCodecContext * const avctx = m->avctx;
++    V4L2Buffer * avbuf;
++    const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
 +
-+    if (!ctx->bufrefs)
-+        return -1;
++    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
 +
-+    for (i = 0; i < ctx->num_buffers; ++i) {
-+        V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+        if (avbuf->status == V4L2BUF_IN_DRIVER)
-+            ++n;
-+    }
-+    return n;
-+}
-+
- static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
- {
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-+    const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type);
-     struct v4l2_plane planes[VIDEO_MAX_PLANES];
-     struct v4l2_buffer buf = { 0 };
-     V4L2Buffer *avbuf;
-@@ -290,50 +363,84 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(
-         .fd = ctx_to_m2mctx(ctx)->fd,
++    struct v4l2_buffer buf = {
++        .type = ctx->type,
++        .memory = V4L2_MEMORY_MMAP,
      };
-     int i, ret;
-+    int no_rx_means_done = 0;
+-    int i, ret;
  
 -    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
-+    if (is_capture && ctx->bufrefs) {
-         for (i = 0; i < ctx->num_buffers; i++) {
+-        for (i = 0; i < ctx->num_buffers; i++) {
 -            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
-+            avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+            if (avbuf->status == V4L2BUF_IN_DRIVER)
-                 break;
-         }
-         if (i == ctx->num_buffers)
+-                break;
+-        }
+-        if (i == ctx->num_buffers)
 -            av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
-+            av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to "
-                                                 "userspace. Increase num_capture_buffers "
-                                                 "to prevent device deadlock or dropped "
+-                                                "userspace. Increase num_capture_buffers "
+-                                                "to prevent device deadlock or dropped "
 -                                                "packets/frames.\n");
-+                                                "packets/frames.\n", i);
-     }
- 
-+#if 0
-+    // I think this is true but pointless
-+    // we will get some other form of EOF signal
-+
-     /* if we are draining and there are no more capture buffers queued in the driver we are done */
+-    }
+-
+-    /* if we are draining and there are no more capture buffers queued in the driver we are done */
 -    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
-+    if (is_capture && ctx_to_m2mctx(ctx)->draining) {
-         for (i = 0; i < ctx->num_buffers; i++) {
-             /* capture buffer initialization happens during decode hence
-              * detection happens at runtime
-              */
+-        for (i = 0; i < ctx->num_buffers; i++) {
+-            /* capture buffer initialization happens during decode hence
+-             * detection happens at runtime
+-             */
 -            if (!ctx->buffers)
-+            if (!ctx->bufrefs)
-                 break;
- 
+-                break;
+-
 -            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
-+            avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+            if (avbuf->status == V4L2BUF_IN_DRIVER)
-                 goto start;
-         }
-         ctx->done = 1;
-         return NULL;
-     }
-+#endif
- 
- start:
+-                goto start;
+-        }
+-        ctx->done = 1;
+-        return NULL;
+-    }
+-
+-start:
 -    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
 -        pfd.events =  POLLOUT | POLLWRNORM;
 -    else {
-+    if (is_capture) {
-         /* no need to listen to requests for more input while draining */
+-        /* no need to listen to requests for more input while draining */
 -        if (ctx_to_m2mctx(ctx)->draining)
-+        if (ctx_to_m2mctx(ctx)->draining || timeout > 0)
-             pfd.events =  POLLIN | POLLRDNORM | POLLPRI;
-+    } else {
-+        pfd.events =  POLLOUT | POLLWRNORM;
+-            pfd.events =  POLLIN | POLLRDNORM | POLLPRI;
++    *ppavbuf = NULL;
++
++    if (ctx->flag_last)
++        return AVERROR(EPIPE);
++
++    if (is_mp) {
++        buf.length = VIDEO_MAX_PLANES;
++        buf.m.planes = planes;
      }
-+    no_rx_means_done = s->resize_pending && is_capture;
  
-     for (;;) {
+-    for (;;) {
 -        ret = poll(&pfd, 1, timeout);
-+        // If we have a resize pending then all buffers should be Qed
-+        // With a resize pending we should be in drain but evidence suggests
-+        // that not all decoders do this so poll to clear
-+        int t2 = no_rx_means_done ? 0 : timeout < 0 ? 3000 : timeout;
-+        const int e = pfd.events;
-+
-+        ret = poll(&pfd, 1, t2);
-+
-         if (ret > 0)
-             break;
+-        if (ret > 0)
+-            break;
 -        if (errno == EINTR)
 -            continue;
+-        return NULL;
++    while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
++        const int err = errno;
++        av_assert0(AVERROR(err) < 0);
++        if (err != EINTR) {
++            av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
++                ctx->name, av_err2str(AVERROR(err)));
 +
-+        if (ret < 0) {
-+            int err = errno;
-+            if (err == EINTR)
-+                continue;
-+            av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n",
-+                   err, strerror(err),
-+                   e, count_in_driver(ctx));
-+            return NULL;
-+        }
++            if (err == EPIPE)
++                ctx->flag_last = 1;
 +
-+        // ret == 0 (timeout)
-+        if (no_rx_means_done) {
-+            av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n");
-+            ret = ctx_done(ctx);
-+            if (ret > 0)
-+                goto start;
++            return AVERROR(err);
 +        }
-+        if (timeout == -1)
-+            av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));;
-         return NULL;
+     }
++    atomic_fetch_sub(&ctx->q_count, 1);
+ 
+-    /* 0. handle errors */
+-    if (pfd.revents & POLLERR) {
+-        /* if we are trying to get free buffers but none have been queued yet
+-           no need to raise a warning */
+-        if (timeout == 0) {
+-            for (i = 0; i < ctx->num_buffers; i++) {
+-                if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
+-                    av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+-            }
++    avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
++    ff_v4l2_buffer_set_avail(avbuf);
++    avbuf->buf = buf;
++    if (is_mp) {
++        memcpy(avbuf->planes, planes, sizeof(planes));
++        avbuf->buf.m.planes = avbuf->planes;
++    }
++    // Done with any attached buffer
++    av_buffer_unref(&avbuf->ref_buf);
++
++    if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
++        // Zero length cap buffer return == EOS
++        if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
++            av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
++
++            // Must reQ so we don't leak
++            // May not matter if the next thing we do is release all the
++            // buffers but better to be tidy.
++            ff_v4l2_buffer_enqueue(avbuf);
++
++            ctx->flag_last = 1;
++            return AVERROR(EPIPE);
+         }
+-        else
+-            av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+ 
+-        return NULL;
++#ifdef V4L2_BUF_FLAG_LAST
++        // If flag_last set then this contains data but is the last frame
++        // so remember that but return OK
++        if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
++            ctx->flag_last = 1;
++#endif
      }
  
-@@ -343,7 +450,8 @@ start:
-            no need to raise a warning */
-         if (timeout == 0) {
-             for (i = 0; i < ctx->num_buffers; i++) {
--                if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
-+                avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+                if (avbuf->status != V4L2BUF_AVAILABLE)
-                     av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
-             }
-         }
-@@ -361,22 +469,25 @@ start:
-             ctx->done = 1;
-             return NULL;
-         }
+-    /* 1. handle resolution changes */
+-    if (pfd.revents & POLLPRI) {
+-        ret = v4l2_handle_event(ctx);
+-        if (ret < 0) {
+-            /* if re-init failed, abort */
+-            ctx->done = 1;
+-            return NULL;
+-        }
 -        if (ret) {
 -            /* if re-init was successful drop the buffer (if there was one)
 -             * since we had to reconfigure capture (unmap all buffers)
 -             */
 -            return NULL;
--        }
-+        if (ret > 0)
-+            goto start;
++    *ppavbuf = avbuf;
++    return 0;
++}
++
++/**
++ * handle resolution change event and end of stream event
++ * Expects to be called after the stream has stopped
++ *
++ * returns 1 if reinit was successful, negative if it failed
++ * returns 0 if reinit was not executed
++ */
++static int
++get_event(V4L2m2mContext * const m)
++{
++    AVCodecContext * const avctx = m->avctx;
++    struct v4l2_event evt = { 0 };
++
++    while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
++        const int rv = AVERROR(errno);
++        if (rv == AVERROR(EINTR))
++            continue;
++        if (rv == AVERROR(EAGAIN)) {
++            av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
++            return AVERROR_EOF;
+         }
++        av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
++        return rv;
      }
  
-     /* 2. dequeue the buffer */
-     if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
+-    /* 2. dequeue the buffer */
+-    if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
++    av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
  
 -        if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
-+        if (is_capture) {
-             /* there is a capture buffer ready */
-             if (pfd.revents & (POLLIN | POLLRDNORM))
-                 goto dequeue;
- 
-+            // CAPTURE Q drained
-+            if (no_rx_means_done) {
-+                if (ctx_done(ctx) > 0)
-+                    goto start;
-+                return NULL;
-+            }
+-            /* there is a capture buffer ready */
+-            if (pfd.revents & (POLLIN | POLLRDNORM))
+-                goto dequeue;
++    if (evt.type == V4L2_EVENT_EOS) {
++        av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n");
++        return AVERROR_EOF;
++    }
 +
-             /* the driver is ready to accept more input; instead of waiting for the capture
-              * buffer to complete we return NULL so input can proceed (we are single threaded)
-              */
-@@ -394,37 +505,58 @@ dequeue:
-             buf.m.planes = planes;
++    if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
++        return do_source_change(m);
++
++    return 0;
++}
++
++
++// Get a buffer
++// If output then just gets the buffer in the expected way
++// If capture then runs the capture state m/c to deal with res change etc.
++// If return value == 0 then *ppavbuf != NULL
++
++static int
++get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
++{
++    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++    AVCodecContext * const avctx = m->avctx;
++    const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
++
++    const unsigned int poll_cap = (POLLIN | POLLRDNORM);
++    const unsigned int poll_out = (POLLOUT | POLLWRNORM);
++    const unsigned int poll_event = POLLPRI;
++
++    *ppavbuf = NULL;
+ 
+-            /* the driver is ready to accept more input; instead of waiting for the capture
+-             * buffer to complete we return NULL so input can proceed (we are single threaded)
+-             */
+-            if (pfd.revents & (POLLOUT | POLLWRNORM))
+-                return NULL;
++    for (;;) {
++        struct pollfd pfd = {
++            .fd = m->fd,
++            // If capture && stream not started then assume we are waiting for the initial event
++            .events = !is_cap ? poll_out :
++                !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
++                    poll_event,
++        };
++        int ret;
++
++        if (ctx->done) {
++            av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
++            return AVERROR_EOF;
+         }
+ 
+-dequeue:
+-        memset(&buf, 0, sizeof(buf));
+-        buf.memory = V4L2_MEMORY_MMAP;
+-        buf.type = ctx->type;
+-        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+-            memset(planes, 0, sizeof(planes));
+-            buf.length = VIDEO_MAX_PLANES;
+-            buf.m.planes = planes;
++        // If capture && timeout == -1 then also wait for rx buffer free
++        if (is_cap && timeout == -1 && m->output.streamon && !m->draining)
++            pfd.events |= poll_out;
++
++        // If nothing Qed all we will get is POLLERR - avoid that
++        if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) ||
++            (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
++            (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
++            av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
++            return AVERROR(ENOSPC);
          }
  
 -        ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
@@ -47553,61 +51047,72 @@
 -            if (errno != EAGAIN) {
 -                ctx->done = 1;
 -                if (errno != EPIPE)
-+        while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) {
-+            const int err = errno;
-+            if (err == EINTR)
-+                continue;
-+            if (err != EAGAIN) {
-+                // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST
-+                if (err != EPIPE || !is_capture)
-                     av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
+-                    av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
 -                        ctx->name, av_err2str(AVERROR(errno)));
-+                        ctx->name, av_err2str(AVERROR(err)));
-+                if (ctx_done(ctx) > 0)
-+                    goto start;
-             }
-             return NULL;
-         }
-+        --ctx->q_count;
-+        av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n",
-+               ctx->name, buf.index,
-+               buf.timestamp.tv_sec, buf.timestamp.tv_usec,
-+               ctx->q_count, ++ctx->dq_count, buf.field);
-+
-+        avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
-+        avbuf->status = V4L2BUF_AVAILABLE;
-+        avbuf->buf = buf;
-+        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-+            memcpy(avbuf->planes, planes, sizeof(planes));
-+            avbuf->buf.m.planes = avbuf->planes;
++        // Timeout kludged s.t. "forever" eventually gives up & produces logging
++        // If waiting for an event when we have seen a last_frame then we expect
++        //   it to be ready already so force a short timeout
++        ret = poll(&pfd, 1,
++                   ff_v4l2_ctx_eos(ctx) ? 10 :
++                   timeout == -1 ? 3000 : timeout);
++        if (ret < 0) {
++            ret = AVERROR(errno);  // Remember errno before logging etc.
++            av_assert0(ret < 0);
 +        }
++
++        av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
++               ctx->name, ret, timeout, pfd.events, pfd.revents);
++
++        if (ret < 0) {
++            if (ret == AVERROR(EINTR))
++                continue;
++            av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
++            return ret;
++        }
++
++        if (ret == 0) {
++            if (timeout == -1)
++                av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
++            if (ff_v4l2_ctx_eos(ctx)) {
++                av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
++                ret = get_event(m);
++                if (ret < 0) {
++                    ctx->done = 1;
++                    return ret;
++                }
+             }
+-            return NULL;
++            return AVERROR(EAGAIN);
+         }
  
 -        if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
-+        if (ctx_to_m2mctx(ctx)->draining && is_capture) {
-             int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
-                             buf.m.planes[0].bytesused : buf.bytesused;
-             if (bytesused == 0) {
--                ctx->done = 1;
-+                av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n");
+-            int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
+-                            buf.m.planes[0].bytesused : buf.bytesused;
+-            if (bytesused == 0) {
++        if ((pfd.revents & POLLERR) != 0) {
++            av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
++            return AVERROR_UNKNOWN;
++        }
 +
-+                // Must reQ so we don't leak
-+                // May not matter if the next thing we do is release all the
-+                // buffers but better to be tidy.
-+                ff_v4l2_buffer_enqueue(avbuf);
-+
-+                if (ctx_done(ctx) > 0)
-+                    goto start;
-                 return NULL;
++        if ((pfd.revents & poll_event) != 0) {
++            ret = get_event(m);
++            if (ret < 0) {
+                 ctx->done = 1;
+-                return NULL;
++                return ret;
              }
- #ifdef V4L2_BUF_FLAG_LAST
+-#ifdef V4L2_BUF_FLAG_LAST
 -            if (buf.flags & V4L2_BUF_FLAG_LAST)
 -                ctx->done = 1;
-+            if (buf.flags & V4L2_BUF_FLAG_LAST) {
-+                av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n");
-+                avbuf->status = V4L2BUF_IN_USE;  // Avoid flushing this buffer
-+                ctx_done(ctx);
-+            }
- #endif
+-#endif
++            continue;
++        }
++
++        if ((pfd.revents & poll_cap) != 0) {
++            ret = dq_buf(ctx, ppavbuf);
++            if (ret == AVERROR(EPIPE))
++                continue;
++            return ret;
          }
  
 -        avbuf = &ctx->buffers[buf.index];
@@ -47616,11 +51121,48 @@
 -        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
 -            memcpy(avbuf->planes, planes, sizeof(planes));
 -            avbuf->buf.m.planes = avbuf->planes;
--        }
-         return avbuf;
++        if ((pfd.revents & poll_out) != 0) {
++            if (is_cap)
++                return AVERROR(EAGAIN);
++            return dq_buf(ctx, ppavbuf);
+         }
+-        return avbuf;
++
++        av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
++        return AVERROR_UNKNOWN;
      }
++}
  
-@@ -443,8 +575,9 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(
+-    return NULL;
++// Clear out flags and timestamps that should be set by the user
++// Returns the passed avbuf
++static V4L2Buffer *
++clean_v4l2_buffer(V4L2Buffer * const avbuf)
++{
++    struct v4l2_buffer *const buf = &avbuf->buf;
++
++    buf->flags = 0;
++    buf->field = V4L2_FIELD_ANY;
++    buf->timestamp = (struct timeval){0};
++    buf->timecode = (struct v4l2_timecode){0};
++    buf->sequence = 0;
++
++    return avbuf;
+ }
+ 
+ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
+ {
+-    int timeout = 0; /* return when no more buffers to dequeue */
+     int i;
+ 
+     /* get back as many output buffers as possible */
+     if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+-          do {
+-          } while (v4l2_dequeue_v4l2buf(ctx, timeout));
++        V4L2Buffer * avbuf;
++        do {
++            get_qbuf(ctx, &avbuf, 0);
++        } while (avbuf);
      }
  
      for (i = 0; i < ctx->num_buffers; i++) {
@@ -47628,11 +51170,11 @@
 -            return &ctx->buffers[i];
 +        V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
 +        if (avbuf->status == V4L2BUF_AVAILABLE)
-+            return avbuf;
++            return clean_v4l2_buffer(avbuf);
      }
  
      return NULL;
-@@ -452,25 +585,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(
+@@ -452,25 +730,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(
  
  static int v4l2_release_buffers(V4L2Context* ctx)
  {
@@ -47685,14 +51227,14 @@
 +                    "  2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
          }
      }
-+    ctx->q_count = 0;
++    atomic_store(&ctx->q_count, 0);
  
 -    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
 +    return ret;
  }
  
  static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
-@@ -499,6 +652,8 @@ static inline int v4l2_try_raw_format(V4
+@@ -499,6 +797,8 @@ static inline int v4l2_try_raw_format(V4
  
  static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
  {
@@ -47701,7 +51243,7 @@
      enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
      struct v4l2_fmtdesc fdesc;
      int ret;
-@@ -517,6 +672,13 @@ static int v4l2_get_raw_format(V4L2Conte
+@@ -517,6 +817,13 @@ static int v4l2_get_raw_format(V4L2Conte
          if (ret)
              return AVERROR(EINVAL);
  
@@ -47715,7 +51257,7 @@
          pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
          ret = v4l2_try_raw_format(ctx, pixfmt);
          if (ret){
-@@ -569,18 +731,77 @@ static int v4l2_get_coded_format(V4L2Con
+@@ -569,30 +876,99 @@ static int v4l2_get_coded_format(V4L2Con
    *
    *****************************************************************************/
  
@@ -47730,9 +51272,9 @@
 +    for (i = 0; i < ctx->num_buffers; ++i) {
 +        struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
 +        if (buf->status == V4L2BUF_IN_DRIVER)
-+            buf->status = V4L2BUF_AVAILABLE;
++            ff_v4l2_buffer_set_avail(buf);
 +    }
-+    ctx->q_count = 0;
++    atomic_store(&ctx->q_count, 0);
 +}
 +
 +static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
@@ -47762,18 +51304,25 @@
  int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
  {
      int type = ctx->type;
-     int ret;
+-    int ret;
++    int ret = 0;
 +    AVCodecContext * const avctx = logger(ctx);
-+
-+    ff_mutex_lock(&ctx->lock);
-+
-+    if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
-+        stuff_all_buffers(avctx, ctx);
  
-     ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
+-    ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
 -    if (ret < 0)
 -        return AVERROR(errno);
-+    if (ret < 0) {
++    // Avoid doing anything if there is nothing we can do
++    if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
++        return 0;
+ 
+-    ctx->streamon = (cmd == VIDIOC_STREAMON);
++    ff_mutex_lock(&ctx->lock);
+ 
+-    return 0;
++    if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
++        stuff_all_buffers(avctx, ctx);
++
++    if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
 +        const int err = errno;
 +        av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
 +               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
@@ -47783,48 +51332,81 @@
 +    {
 +        if (cmd == VIDIOC_STREAMOFF)
 +            flush_all_buffers_status(ctx);
- 
--    ctx->streamon = (cmd == VIDIOC_STREAMON);
++        else
++            ctx->first_buf = 1;
++
 +        ctx->streamon = (cmd == VIDIOC_STREAMON);
 +        av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
 +               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
 +    }
- 
--    return 0;
++
++    // Both stream off & on effectively clear flag_last
++    ctx->flag_last = 0;
++
 +    ff_mutex_unlock(&ctx->lock);
 +
 +    return ret;
  }
  
  int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
-@@ -608,7 +829,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Co
+ {
+-    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++    V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
++    AVCodecContext *const avctx = s->avctx;
++    int64_t track_ts;
+     V4L2Buffer* avbuf;
+     int ret;
+ 
+     if (!frame) {
+         ret = v4l2_stop_encode(ctx);
+         if (ret)
+-            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
++            av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
+         s->draining= 1;
+         return 0;
+     }
+@@ -601,23 +977,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Co
+     if (!avbuf)
+         return AVERROR(ENOMEM);
+ 
+-    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
++    track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
++
++    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
+     if (ret)
+         return ret;
+ 
      return ff_v4l2_buffer_enqueue(avbuf);
  }
  
 -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
 +int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
-+                                   const void * extdata, size_t extlen, int no_rescale_pts)
++                                   const void * extdata, size_t extlen)
  {
      V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++    AVCodecContext *const avctx = s->avctx;
      V4L2Buffer* avbuf;
-@@ -616,8 +838,9 @@ int ff_v4l2_context_enqueue_packet(V4L2C
+     int ret;
++    int64_t track_ts;
  
      if (!pkt->size) {
          ret = v4l2_stop_decode(ctx);
 +        // Log but otherwise ignore stop failure
          if (ret)
 -            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
-+            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
++            av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
          s->draining = 1;
          return 0;
      }
-@@ -626,14 +849,17 @@ int ff_v4l2_context_enqueue_packet(V4L2C
+@@ -626,8 +1008,13 @@ int ff_v4l2_context_enqueue_packet(V4L2C
      if (!avbuf)
          return AVERROR(EAGAIN);
  
 -    ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
 -    if (ret)
-+    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts);
++    track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
++
++    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
 +    if (ret == AVERROR(ENOMEM))
 +        av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
 +               __func__, pkt->size, avbuf->planes[0].length);
@@ -47832,23 +51414,70 @@
          return ret;
  
      return ff_v4l2_buffer_enqueue(avbuf);
- }
+@@ -635,42 +1022,36 @@ int ff_v4l2_context_enqueue_packet(V4L2C
  
--int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
-+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts)
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
  {
++    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++    AVCodecContext *const avctx = s->avctx;
      V4L2Buffer *avbuf;
++    int rv;
  
-@@ -650,7 +876,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co
-         return AVERROR(EAGAIN);
-     }
+-    /*
+-     * timeout=-1 blocks until:
+-     *  1. decoded frame available
+-     *  2. an input buffer is ready to be dequeued
+-     */
+-    avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
+-    if (!avbuf) {
+-        if (ctx->done)
+-            return AVERROR_EOF;
+-
+-        return AVERROR(EAGAIN);
+-    }
++    do {
++        if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
++            return rv;
++        if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
++            return rv;
++    } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
  
 -    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
-+    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts);
++   return 0;
  }
  
  int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
-@@ -702,78 +928,155 @@ int ff_v4l2_context_get_format(V4L2Conte
+ {
++    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++    AVCodecContext *const avctx = s->avctx;
+     V4L2Buffer *avbuf;
++    int rv;
+ 
+-    /*
+-     * blocks until:
+-     *  1. encoded packet available
+-     *  2. an input buffer ready to be dequeued
+-     */
+-    avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
+-    if (!avbuf) {
+-        if (ctx->done)
+-            return AVERROR_EOF;
++    do {
++        if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
++            return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv;  // Caller not currently expecting ENOSPC
++        if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
++            return rv;
++    } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
+ 
+-        return AVERROR(EAGAIN);
+-    }
+-
+-    return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
++    return 0;
+ }
+ 
+ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
+@@ -702,78 +1083,160 @@ int ff_v4l2_context_get_format(V4L2Conte
  
  int ff_v4l2_context_set_format(V4L2Context* ctx)
  {
@@ -47895,11 +51524,12 @@
 +    av_buffer_unref(&ctx->frames_ref);
 +
 +    ff_mutex_destroy(&ctx->lock);
++    pthread_cond_destroy(&ctx->cond);
  }
  
 -int ff_v4l2_context_init(V4L2Context* ctx)
 +
-+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers)
++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
  {
 -    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
 +    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
@@ -47910,17 +51540,19 @@
 -        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
 -        return AVERROR_PATCHWELCOME;
 -    }
--
--    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
--    if (ret)
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
 +    int ret;
 +    int i;
  
+-    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
+-    if (ret)
+-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
++    av_assert0(ctx->bufrefs == NULL);
+ 
      memset(&req, 0, sizeof(req));
 -    req.count = ctx->num_buffers;
+-    req.memory = V4L2_MEMORY_MMAP;
 +    req.count = req_buffers;
-     req.memory = V4L2_MEMORY_MMAP;
++    req.memory = mem;
      req.type = ctx->type;
 -    ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
 -    if (ret < 0) {
@@ -47955,7 +51587,7 @@
 +    }
 +
 +    for (i = 0; i < ctx->num_buffers; i++) {
-+        ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx);
++        ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
 +        if (ret) {
              av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
 -            goto error;
@@ -47989,14 +51621,16 @@
 +
 +    // It is not valid to reinit a context without a previous release
 +    av_assert0(ctx->bufrefs == NULL);
-+
+ 
+-    av_freep(&ctx->buffers);
 +    if (!v4l2_type_supported(ctx)) {
 +        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
 +        return AVERROR_PATCHWELCOME;
 +    }
- 
--    av_freep(&ctx->buffers);
++
 +    ff_mutex_init(&ctx->lock, NULL);
++    pthread_cond_init(&ctx->cond, NULL);
++    atomic_init(&ctx->q_count, 0);
 +
 +    if (s->output_drm) {
 +        AVHWFramesContext *hwframes;
@@ -48010,8 +51644,8 @@
 +        hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
 +        hwframes->format = AV_PIX_FMT_DRM_PRIME;
 +        hwframes->sw_format = ctx->av_pix_fmt;
-+        hwframes->width = ctx->width;
-+        hwframes->height = ctx->height;
++        hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
++        hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
 +        ret = av_hwframe_ctx_init(ctx->frames_ref);
 +        if (ret < 0)
 +            goto fail_unref_hwframes;
@@ -48024,7 +51658,7 @@
 +        goto fail_unref_hwframes;
 +    }
 +
-+    ret = create_buffers(ctx, ctx->num_buffers);
++    ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
 +    if (ret < 0)
 +        goto fail_unref_hwframes;
 +
@@ -48067,45 +51701,93 @@
  
      /**
       * Readonly after init.
-@@ -92,6 +100,12 @@ typedef struct V4L2Context {
+@@ -82,16 +90,38 @@ typedef struct V4L2Context {
+     int num_buffers;
+ 
+     /**
++     * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
++     */
++    enum v4l2_memory buf_mem;
++
++    /**
+      * Whether the stream has been started (VIDIOC_STREAMON has been sent).
+      */
+     int streamon;
+ 
++    /* 1st buffer after stream on */
++    int first_buf;
++
+     /**
+      *  Either no more buffers available or an unrecoverable error was notified
+      *  by the V4L2 kernel driver: once set the context has to be exited.
       */
      int done;
  
++    int flag_last;
++
++    /**
++     * If NZ then when Qing frame/pkt use this rather than the
++     * "real" PTS
++     */
++    uint64_t track_ts;
++
 +    AVBufferRef *frames_ref;
-+    int q_count;
-+    int dq_count;
++    atomic_int q_count;
 +    struct ff_weak_link_master *wl_master;
 +
 +    AVMutex lock;
++    pthread_cond_t cond;
  } V4L2Context;
  
  /**
-@@ -156,9 +170,12 @@ int ff_v4l2_context_dequeue_packet(V4L2C
+@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2C
   * @param[in] ctx The V4L2Context to dequeue from.
   * @param[inout] f The AVFrame to dequeue to.
   * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
-+ * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as
-+ *       timestamp directly)
 + *
   * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
++ *                AVERROR(ENOSPC) if no buffer available to put
++ *                the frame in
   */
--int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
-+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts);
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
  
- /**
-  * Enqueues a buffer to a V4L2Context from an AVPacket
-@@ -170,7 +187,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co
+@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co
   * @param[in] pkt A pointer to an AVPacket.
   * @return 0 in case of success, a negative error otherwise.
   */
 -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
-+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts);
++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
  
  /**
   * Enqueues a buffer to a V4L2Context from an AVFrame
 --- a/libavcodec/v4l2_m2m.c
 +++ b/libavcodec/v4l2_m2m.c
-@@ -215,13 +215,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
+@@ -36,6 +36,14 @@
+ #include "v4l2_fmt.h"
+ #include "v4l2_m2m.h"
+ 
++static void
++xlat_init(xlat_track_t * const x)
++{
++    memset(x, 0, sizeof(*x));
++    x->last_pts = AV_NOPTS_VALUE;
++}
++
++
+ static inline int v4l2_splane_video(struct v4l2_capability *cap)
+ {
+     if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) &&
+@@ -68,7 +76,9 @@ static int v4l2_prepare_contexts(V4L2m2m
+ 
+     s->capture.done = s->output.done = 0;
+     s->capture.name = "capture";
++    s->capture.buf_mem = V4L2_MEMORY_MMAP;
+     s->output.name = "output";
++    s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
+     atomic_init(&s->refcount, 0);
+     sem_init(&s->refsync, 0, 0);
+ 
+@@ -215,13 +225,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
          av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
  
      /* 2. unmap the capture buffers (v4l2 and ffmpeg):
@@ -48119,7 +51801,23 @@
      ff_v4l2_context_release(&s->capture);
  
      /* 3. get the new capture format */
-@@ -328,7 +322,10 @@ static void v4l2_m2m_destroy_context(voi
+@@ -240,7 +244,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
+ 
+     /* 5. complete reinit */
+     s->draining = 0;
+-    s->reinit = 0;
+ 
+     return 0;
+ }
+@@ -274,7 +277,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2
+ 
+     /* start again now that we know the stream dimensions */
+     s->draining = 0;
+-    s->reinit = 0;
+ 
+     ret = ff_v4l2_context_get_format(&s->output, 0);
+     if (ret) {
+@@ -328,7 +330,13 @@ static void v4l2_m2m_destroy_context(voi
      ff_v4l2_context_release(&s->capture);
      sem_destroy(&s->refsync);
  
@@ -48127,11 +51825,14 @@
 +    if (s->fd != -1)
 +        close(s->fd);
 +
++    av_packet_unref(&s->buf_pkt);
++    av_freep(&s->extdata_data);
++
 +    av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
  
      av_free(s);
  }
-@@ -338,17 +335,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p
+@@ -338,17 +346,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p
      V4L2m2mContext *s = priv->context;
      int ret;
  
@@ -48172,6 +51873,51 @@
      av_buffer_unref(&priv->context_ref);
  
      return 0;
+@@ -392,28 +417,33 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *
+     return v4l2_configure_contexts(s);
+ }
+ 
+-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
+ {
+-    *s = av_mallocz(sizeof(V4L2m2mContext));
+-    if (!*s)
++    V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
++
++    *pps = NULL;
++    if (!s)
+         return AVERROR(ENOMEM);
+ 
+-    priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
++    priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
+                                          &v4l2_m2m_destroy_context, NULL, 0);
+     if (!priv->context_ref) {
+-        av_freep(s);
++        av_free(s);
+         return AVERROR(ENOMEM);
+     }
+ 
+     /* assign the context */
+-    priv->context = *s;
+-    (*s)->priv = priv;
++    priv->context = s;
++    s->priv = priv;
+ 
+     /* populate it */
+-    priv->context->capture.num_buffers = priv->num_capture_buffers;
+-    priv->context->output.num_buffers  = priv->num_output_buffers;
+-    priv->context->self_ref = priv->context_ref;
+-    priv->context->fd = -1;
++    s->capture.num_buffers = priv->num_capture_buffers;
++    s->output.num_buffers  = priv->num_output_buffers;
++    s->self_ref = priv->context_ref;
++    s->fd = -1;
++
++    xlat_init(&s->xlat);
+ 
++    *pps = s;
+     return 0;
+ }
 --- a/libavcodec/v4l2_m2m.h
 +++ b/libavcodec/v4l2_m2m.h
 @@ -30,6 +30,7 @@
@@ -48182,7 +51928,7 @@
  #include "v4l2_context.h"
  
  #define container_of(ptr, type, member) ({ \
-@@ -38,7 +39,28 @@
+@@ -38,7 +39,37 @@
  
  #define V4L_M2M_DEFAULT_OPTS \
      { "num_output_buffers", "Number of buffers in the output context",\
@@ -48192,8 +51938,10 @@
 +#define FF_V4L2_M2M_TRACK_SIZE 128
 +typedef struct V4L2m2mTrackEl {
 +    int     discard;   // If we see this buffer its been flushed, so discard
++    int     pending;
 +    int     pkt_size;
 +    int64_t pts;
++    int64_t dts;
 +    int64_t reordered_opaque;
 +    int64_t pkt_pos;
 +    int64_t pkt_duration;
@@ -48209,18 +51957,25 @@
 +    int64_t last_pts;
 +    int64_t guess;
 +} pts_stats_t;
++
++typedef struct xlat_track_s {
++    unsigned int track_no;
++    int64_t last_pts;
++    int64_t last_opaque;
++    V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
++} xlat_track_t;
  
  typedef struct V4L2m2mContext {
      char devname[PATH_MAX];
-@@ -53,6 +75,7 @@ typedef struct V4L2m2mContext {
+@@ -52,7 +83,6 @@ typedef struct V4L2m2mContext {
+     AVCodecContext *avctx;
      sem_t refsync;
      atomic_uint refcount;
-     int reinit;
-+    int resize_pending;
+-    int reinit;
  
      /* null frame/packet received */
      int draining;
-@@ -63,6 +86,25 @@ typedef struct V4L2m2mContext {
+@@ -63,6 +93,36 @@ typedef struct V4L2m2mContext {
  
      /* reference back to V4L2m2mPriv */
      void *priv;
@@ -48230,11 +51985,13 @@
 +    /* generate DRM frames */
 +    int output_drm;
 +
++    /* input frames are drmprime */
++    int input_drm;
++
 +    /* Frame tracking */
-+    int64_t last_pkt_dts;
-+    int64_t last_opaque;
-+    unsigned int track_no;
-+    V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
++    xlat_track_t xlat;
++    int pending_hw;
++    int pending_n;
 +
 +    pts_stats_t pts_stat;
 +
@@ -48243,10 +52000,19 @@
 +
 +    /* Ext data sent */
 +    int extdata_sent;
++    /* Ext data sent in packet - overrides ctx */
++    uint8_t * extdata_data;
++    size_t extdata_size;
++
++#define FF_V4L2_QUIRK_REINIT_ALWAYS             1
++#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN    2
++    /* Quirks */
++    unsigned int quirks;
++
  } V4L2m2mContext;
  
  typedef struct V4L2m2mPriv {
-@@ -73,6 +115,7 @@ typedef struct V4L2m2mPriv {
+@@ -73,6 +133,7 @@ typedef struct V4L2m2mPriv {
  
      int num_output_buffers;
      int num_capture_buffers;
@@ -48254,21 +52020,31 @@
  } V4L2m2mPriv;
  
  /**
-@@ -126,4 +169,16 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
+@@ -126,4 +187,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
   */
  int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
  
 +
-+static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt)
++static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
 +{
 +    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
 +}
 +
-+static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt)
++static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
 +{
 +    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
 +}
 +
++static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
++{
++    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
++}
++
++static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
++{
++    return ctx->flag_last;
++}
++
 +
  #endif /* AVCODEC_V4L2_M2M_H */
 --- a/libavcodec/v4l2_m2m_dec.c
@@ -48284,7 +52060,7 @@
  #include "libavutil/pixfmt.h"
  #include "libavutil/pixdesc.h"
  #include "libavutil/opt.h"
-@@ -30,26 +34,107 @@
+@@ -30,75 +34,107 @@
  #include "libavcodec/decode.h"
  #include "libavcodec/internal.h"
  
@@ -48296,19 +52072,31 @@
  #include "v4l2_m2m.h"
  #include "v4l2_fmt.h"
  
+-static int v4l2_try_start(AVCodecContext *avctx)
 +// Pick 64 for max last count - that is >1sec at 60fps
 +#define STATS_LAST_COUNT_MAX 64
 +#define STATS_INTERVAL_MAX (1 << 30)
 +
 +static int64_t pts_stats_guess(const pts_stats_t * const stats)
-+{
+ {
+-    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+-    V4L2Context *const capture = &s->capture;
+-    V4L2Context *const output = &s->output;
+-    struct v4l2_selection selection = { 0 };
+-    int ret;
 +    if (stats->last_pts == AV_NOPTS_VALUE ||
 +            stats->last_interval == 0 ||
 +            stats->last_count >= STATS_LAST_COUNT_MAX)
 +        return AV_NOPTS_VALUE;
 +    return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
 +}
-+
+ 
+-    /* 1. start the output process */
+-    if (!output->streamon) {
+-        ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
+-        if (ret < 0) {
+-            av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
+-            return ret;
 +static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
 +{
 +    if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
@@ -48334,9 +52122,10 @@
 +                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
 +                       __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
 +            stats->last_interval = frame_time;
-+        }
-+    }
-+
+         }
+     }
+ 
+-    if (capture->streamon)
 +    stats->last_pts = pts;
 +    stats->last_count = 1;
 +}
@@ -48361,81 +52150,43 @@
 +    };
 +
 +    if (s->output.streamon)
-+        return 0;
-+
-+    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
-+    if (ret < 0)
-+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n");
-+
-+    if (!s->capture.streamon || ret < 0)
-+        return ret;
-+
-+    ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd);
-+    if (ret < 0)
-+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno);
-+    else
-+        av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n");
-+
-+    return ret;
-+}
-+
- static int v4l2_try_start(AVCodecContext *avctx)
- {
-     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-     V4L2Context *const capture = &s->capture;
--    V4L2Context *const output = &s->output;
-     struct v4l2_selection selection = { 0 };
-     int ret;
- 
-     /* 1. start the output process */
--    if (!output->streamon) {
--        ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
--        if (ret < 0) {
--            av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
--            return ret;
--        }
--    }
-+    if ((ret = check_output_streamon(avctx, s)) != 0)
-+        return ret;
- 
-     if (capture->streamon)
          return 0;
-@@ -63,15 +148,29 @@ static int v4l2_try_start(AVCodecContext
+ 
+-    /* 2. get the capture format */
+-    capture->format.type = capture->type;
+-    ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
+-    if (ret) {
+-        av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
++    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
++    if (ret != 0) {
++        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
+         return ret;
      }
  
-     /* 2.1 update the AVCodecContext */
+-    /* 2.1 update the AVCodecContext */
 -    avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
 -    capture->av_pix_fmt = avctx->pix_fmt;
-+    capture->av_pix_fmt =
-+        ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
-+    if (s->output_drm) {
-+        avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-+        avctx->sw_pix_fmt = capture->av_pix_fmt;
-+    }
-+    else
-+        avctx->pix_fmt = capture->av_pix_fmt;
- 
-     /* 3. set the crop parameters */
-+#if 1
-+    selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-+    selection.target = V4L2_SEL_TGT_CROP_DEFAULT;
-+    ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
-+    av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
-+#else
-     selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-     selection.r.height = avctx->coded_height;
-     selection.r.width = avctx->coded_width;
-+    av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height);
-     ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
+-
+-    /* 3. set the crop parameters */
+-    selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+-    selection.r.height = avctx->coded_height;
+-    selection.r.width = avctx->coded_width;
+-    ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
 -    if (!ret) {
-+    av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
-+    if (1) {
-         ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
-         if (ret) {
-             av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
-@@ -82,15 +181,7 @@ static int v4l2_try_start(AVCodecContext
-             capture->width  = selection.r.width;
-         }
+-        ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
+-        if (ret) {
+-            av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
+-        } else {
+-            av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
+-            /* update the size of the resulting frame */
+-            capture->height = selection.r.height;
+-            capture->width  = selection.r.width;
+-        }
++    // STREAMON should do implicit START so this is just for those that don't.
++    // It is optional so don't worry if it fails
++    if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
++        ret = AVERROR(errno);
++        av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
      }
 -
 -    /* 4. init the capture context now that we have the capture format */
@@ -48445,126 +52196,126 @@
 -            av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
 -            return AVERROR(ENOMEM);
 -        }
--    }
-+#endif
++    else {
++        av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
+     }
++    return 0;
++}
  
-     /* 5. start the capture process */
-     ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
-@@ -133,52 +224,314 @@ static int v4l2_prepare_decoder(V4L2m2mC
+-    /* 5. start the capture process */
+-    ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
+-    if (ret) {
+-        av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
+-        return ret;
+-    }
++static int v4l2_try_start(AVCodecContext *avctx)
++{
++    V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
++    int ret;
+ 
++    /* 1. start the output process */
++    if ((ret = check_output_streamon(avctx, s)) != 0)
++        return ret;
+     return 0;
+ }
+ 
+@@ -133,52 +169,522 @@ static int v4l2_prepare_decoder(V4L2m2mC
      return 0;
  }
  
 -static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
-+{
-+    return (int64_t)n;
-+}
-+
-+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
-+{
-+    return (unsigned int)pts;
-+}
-+
-+// FFmpeg requires us to propagate a number of vars from the coded pkt into
-+// the decoded frame. The only thing that tracks like that in V4L2 stateful
-+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
-+// guarantees about PTS being unique or specified for every frame so replace
-+// the supplied PTS with a simple incrementing number and keep a circular
-+// buffer of all the things we want preserved (including the original PTS)
-+// indexed by the tracking no.
 +static void
-+xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt)
++set_best_effort_pts(AVCodecContext *const avctx,
++             pts_stats_t * const ps,
++             AVFrame *const frame)
 +{
-+    int64_t track_pts;
-+
-+    // Avoid 0
-+    if (++s->track_no == 0)
-+        s->track_no = 1;
-+
-+    track_pts = track_to_pts(avctx, s->track_no);
-+
-+    av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no);
-+    s->last_pkt_dts = avpkt->dts;
-+    s->track_els[s->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-+        .discard          = 0,
-+        .pkt_size         = avpkt->size,
-+        .pts              = avpkt->pts,
-+        .reordered_opaque = avctx->reordered_opaque,
-+        .pkt_pos          = avpkt->pos,
-+        .pkt_duration     = avpkt->duration,
-+        .track_pts        = track_pts
-+    };
-+    avpkt->pts = track_pts;
-+}
-+
-+// Returns -1 if we should discard the frame
-+static int
-+xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame)
-+{
-+    unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
-+    const V4L2m2mTrackEl *const t = s->track_els + n;
-+    if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
-+    {
-+        av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-+        frame->pts              = AV_NOPTS_VALUE;
-+        frame->pkt_dts          = s->last_pkt_dts;
-+        frame->reordered_opaque = s->last_opaque;
-+        frame->pkt_pos          = -1;
-+        frame->pkt_duration     = 0;
-+        frame->pkt_size         = -1;
-+    }
-+    else if (!t->discard)
-+    {
-+        frame->pts              = t->pts;
-+        frame->pkt_dts          = s->last_pkt_dts;
-+        frame->reordered_opaque = t->reordered_opaque;
-+        frame->pkt_pos          = t->pkt_pos;
-+        frame->pkt_duration     = t->pkt_duration;
-+        frame->pkt_size         = t->pkt_size;
-+
-+        s->last_opaque = s->track_els[n].reordered_opaque;
-+        s->track_els[n].pts = AV_NOPTS_VALUE;  // If we hit this again deny accurate knowledge of PTS
-+    }
-+    else
-+    {
-+        av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-+        return -1;
-+    }
-+
-+    pts_stats_add(&s->pts_stat, frame->pts);
++    pts_stats_add(ps, frame->pts);
 +
 +#if FF_API_PKT_PTS
 +FF_DISABLE_DEPRECATION_WARNINGS
 +    frame->pkt_pts = frame->pts;
 +FF_ENABLE_DEPRECATION_WARNINGS
 +#endif
-+    frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat);
-+    frame->pkt_dts               = frame->pts;  // We can't emulate what s/w does in a useful manner?
-+    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
-+    return 0;
++    frame->best_effort_timestamp = pts_stats_guess(ps);
++    // If we can't guess from just PTS - try DTS
++    if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
++        frame->best_effort_timestamp = frame->pkt_dts;
++
++    // We can't emulate what s/w does in a useful manner and using the
++    // "correct" answer seems to just confuse things.
++    frame->pkt_dts               = frame->pts;
++    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
++           frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
++}
++
++static void
++xlat_flush(xlat_track_t * const x)
++{
++    unsigned int i;
++    for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
++        x->track_els[i].pending = 0;
++        x->track_els[i].discard = 1;
++    }
++    x->last_pts = AV_NOPTS_VALUE;
++}
++
++static int
++xlat_pending(const xlat_track_t * const x)
++{
++    unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
++    unsigned int i;
++    int r = 0;
++    int64_t now = AV_NOPTS_VALUE;
++
++    for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) {
++        const V4L2m2mTrackEl * const t = x->track_els + n;
++
++        if (!t->pending)
++            continue;
++
++        if (now == AV_NOPTS_VALUE)
++            now = t->dts;
++
++        if (t->pts == AV_NOPTS_VALUE ||
++            ((now == AV_NOPTS_VALUE || t->pts <= now) &&
++             (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts)))
++            ++r;
++    }
++
++    // If we never get any ideas about PTS vs DTS allow a lot more buffer
++    if (now == AV_NOPTS_VALUE)
++        r -= 16;
++
++    return r;
 +}
 +
 +static inline int stream_started(const V4L2m2mContext * const s) {
-+    return s->capture.streamon && s->output.streamon;
++    return s->output.streamon;
 +}
 +
 +#define NQ_OK        0
 +#define NQ_Q_FULL    1
 +#define NQ_SRC_EMPTY 2
-+#define NQ_DRAINING  3
-+#define NQ_DEAD      4
++#define NQ_NONE      3
++#define NQ_DRAINING  4
++#define NQ_DEAD      5
 +
 +#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
++#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
++
++// do_not_get      If true then no new packet will be got but status will
++//                  be set appropriately
 +
 +// AVERROR_EOF     Flushing an already flushed stream
 +// -ve             Error (all errors except EOF are unexpected)
 +// NQ_OK (0)       OK
 +// NQ_Q_FULL       Dst full (retry if we think V4L2 Q has space now)
 +// NQ_SRC_EMPTY    Src empty (do not retry)
++// NQ_NONE         Enqueue not attempted
 +// NQ_DRAINING     At EOS, dQ dest until EOS there too
 +// NQ_DEAD         Not running (do not retry, do not attempt capture dQ)
 +
-+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s)
++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get)
  {
 -    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
 -    V4L2Context *const capture = &s->capture;
@@ -48581,8 +52332,47 @@
 +    // If we don't already have a coded packet - get a new one
 +    // We will already have a coded pkt if the output Q was full last time we
 +    // tried to Q it
-+    if (!s->buf_pkt.size) {
-+        ret = ff_decode_get_packet(avctx, &s->buf_pkt);
++    if (!s->buf_pkt.size && !do_not_get) {
++        unsigned int i;
++
++        for (i = 0; i < 256; ++i) {
++            uint8_t * side_data;
++            size_t side_size;
++
++            ret = ff_decode_get_packet(avctx, &s->buf_pkt);
++            if (ret != 0)
++                break;
++
++            // New extradata is the only side-data we understand
++            side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
++            if (side_data) {
++                av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
++                av_freep(&s->extdata_data);
++                if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) {
++                    av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size);
++                    return AVERROR(ENOMEM);
++                }
++                memcpy(s->extdata_data, side_data, side_size);
++                s->extdata_size = side_size;
++                s->extdata_sent = 0;
++            }
++
++            if (s->buf_pkt.size != 0)
++                break;
++
++            if (s->buf_pkt.side_data_elems == 0) {
++                av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
++                ret = AVERROR_EOF;
++                break;
++            }
++
++            // Retry a side-data only pkt
++        }
++        // If i >= 256 something has gone wrong
++        if (i >= 256) {
++            av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
++            return AVERROR(EIO);
++        }
 +
 +        if (ret == AVERROR(EAGAIN)) {
 +            if (!stream_started(s)) {
@@ -48606,7 +52396,7 @@
 +            if (!s->draining) {
 +                // Calling enqueue with an empty pkt starts drain
 +                av_assert0(s->buf_pkt.size == 0);
-+                ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1);
++                ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
 +                if (ret) {
 +                    av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
 +                    return ret;
@@ -48619,22 +52409,37 @@
 +            av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
              return ret;
 +        }
-+
-+        xlat_pts_in(avctx, s, &s->buf_pkt);
      }
  
 -    if (s->draining)
 -        goto dequeue;
-+    if ((ret = check_output_streamon(avctx, s)) != 0)
-+        return ret;
++    if (s->draining) {
++        if (s->buf_pkt.size) {
++            av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
++            av_packet_unref(&s->buf_pkt);
++        }
++        return NQ_DRAINING;
++    }
  
 -    ret = ff_v4l2_context_enqueue_packet(output, &avpkt);
 -    if (ret < 0) {
 -        if (ret != AVERROR(EAGAIN))
 -           return ret;
-+    ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt,
-+                                         avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size,
-+                                         1);
++    if (!s->buf_pkt.size)
++        return NQ_NONE;
+ 
+-        s->buf_pkt = avpkt;
+-        /* no input buffers available, continue dequeing */
+-    }
++    if ((ret = check_output_streamon(avctx, s)) != 0)
++        return ret;
++
++    if (s->extdata_sent)
++        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
++    else if (s->extdata_data)
++        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
++    else
++        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size);
 +
 +    if (ret == AVERROR(EAGAIN)) {
 +        // Out of input buffers - keep packet
@@ -48645,18 +52450,18 @@
 +        av_packet_unref(&s->buf_pkt);
 +        s->extdata_sent = 1;
  
--        s->buf_pkt = avpkt;
--        /* no input buffers available, continue dequeing */
-+        if (ret) {
+-    if (avpkt.size) {
+-        ret = v4l2_try_start(avctx);
+         if (ret) {
+-            av_packet_unref(&avpkt);
 +            av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
 +            return ret;
 +        }
-     }
++    }
  
--    if (avpkt.size) {
--        ret = v4l2_try_start(avctx);
--        if (ret) {
--            av_packet_unref(&avpkt);
+-            /* cant recover */
+-            if (ret == AVERROR(ENOMEM))
+-                return ret;
 +    // Start if we haven't
 +    {
 +        const int ret2 = v4l2_try_start(avctx);
@@ -48665,62 +52470,139 @@
 +            ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD;
 +        }
 +    }
- 
--            /* cant recover */
--            if (ret == AVERROR(ENOMEM))
--                return ret;
++
 +    return ret;
 +}
 +
++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
++{
++    int rv = 0;  // 0 on success, AVERROR code if the condition wait fails
+ 
+-            return 0;
++    ff_mutex_lock(&ctx->lock);
++    // Block on ctx->cond until q_count becomes non-zero or streamon is cleared
++    while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) {
++        if ((rv = pthread_cond_wait(&ctx->cond, &ctx->lock)) != 0) {
++            rv = AVERROR(rv);  // pthread_cond_wait returns the error number; it does not set errno
++            av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
++            break;
+         }
+     }
+ 
+-dequeue:
+-    if (!s->buf_pkt.size)
+-        av_packet_unref(&avpkt);
+-    return ff_v4l2_context_dequeue_frame(capture, frame, -1);
++    ff_mutex_unlock(&ctx->lock);
++    return rv;
++}
++
++// Number of frames over what xlat_pending returns that we keep *16
++// This is a min value - if it appears to be too small the threshold should
++// adjust dynamically.
++#define PENDING_HW_MIN      (3 * 16)
++// Offset to use when setting dynamically
++// Set to %16 == 15 to avoid the threshold changing immediately as we relax
++#define PENDING_HW_OFFSET   (PENDING_HW_MIN - 1)
++// Number of consecutive times we've failed to get a frame when we prefer it
++// before we increase the prefer threshold (5ms * N = max expected decode
++// time)
++#define PENDING_N_THRESHOLD 6
++
 +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
 +{
 +    V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-+    int src_rv;
++    int src_rv = NQ_OK;
 +    int dst_rv = 1;  // Non-zero (done), non-negative (error) number
++    unsigned int i = 0;
 +
 +    do {
-+        src_rv = try_enqueue_src(avctx, s);
++        const int pending = xlat_pending(&s->xlat);
++        const int prefer_dq = (pending > s->pending_hw / 16);
++        const int last_src_rv = src_rv;
 +
-+        // If we got a frame last time and we have nothing to enqueue then
-+        // return now. rv will be AVERROR(EAGAIN) indicating that we want more input
++        // Enqueue another pkt for decode if
++        // (a) We don't have a lot of stuff in the buffer already OR
++        // (b) ... we (think we) do but we've failed to get a frame already OR
++        // (c) We've dequeued a lot of frames without asking for input
++        src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
++
++        // If we got a frame last time or we've already tried to get a frame and
++        // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
++        // indicating that we want more input.
 +        // This should mean that once decode starts we enter a stable state where
 +        // we alternately ask for input and produce output
-+        if (s->req_pkt && src_rv == NQ_SRC_EMPTY)
++        if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
 +            break;
 +
-+        if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) {
-+            av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail");
-+            src_rv = NQ_SRC_EMPTY;  // If we can't enqueue pretend that there is nothing to enqueue
++        if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
++            av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
++            break;
 +        }
 +
 +        // Try to get a new frame if
 +        // (a) we haven't already got one AND
 +        // (b) enqueue returned a status indicating that decode should be attempted
 +        if (dst_rv != 0 && TRY_DQ(src_rv)) {
-+            do {
-+                // Dequeue frame will unref any previous contents of frame
-+                // if it returns success so we don't need an explicit unref
-+                // when discarding
-+                // This returns AVERROR(EAGAIN) if there isn't a frame ready yet
-+                // but there is room in the input Q
-+                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1);
++            // Pick a timeout depending on state
++            const int t =
++                src_rv == NQ_DRAINING ? 300 :
++                prefer_dq ? 5 :
++                src_rv == NQ_Q_FULL ? -1 : 0;
 +
-+                if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
-+                    av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
-+                           s->draining, s->capture.done);
-+                else if (dst_rv && dst_rv != AVERROR(EAGAIN))
-+                    av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
-+                           s->draining, s->capture.done, dst_rv);
++            // Dequeue frame will unref any previous contents of frame
++            // if it returns success so we don't need an explicit unref
++            // when discarding
++            // This returns AVERROR(EAGAIN) on timeout or if
++            // there is room in the input Q and timeout == -1
++            dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
 +
-+                // Go again if we got a frame that we need to discard
-+            } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame));
++            // Failure due to no buffer in Q?
++            if (dst_rv == AVERROR(ENOSPC)) {
++                // Wait & retry
++                if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
++                    dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
++                }
++            }
++
++            // Adjust dynamic pending threshold
++            if (dst_rv == 0) {
++                if (--s->pending_hw < PENDING_HW_MIN)
++                    s->pending_hw = PENDING_HW_MIN;
++                s->pending_n = 0;
++
++                set_best_effort_pts(avctx, &s->pts_stat, frame);
++            }
++            else if (dst_rv == AVERROR(EAGAIN)) {
++                if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
++                    s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
++                    s->pending_n = 0;
++                }
++            }
++
++            if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
++                av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
++                dst_rv = AVERROR_EOF;
++                s->capture.done = 1;
++            }
++            else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
++                av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
++                       s->draining, s->capture.done);
++            else if (dst_rv && dst_rv != AVERROR(EAGAIN))
++                av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
++                       s->draining, s->capture.done, dst_rv);
 +        }
- 
--            return 0;
++
++        ++i;
++        if (i >= 256) {
++            av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i);
++            src_rv = AVERROR(EIO);
++        }
++
 +        // Continue trying to enqueue packets if either
 +        // (a) we succeeded last time OR
-+        // (b) enqueue failed due to input Q full AND there is now room
-+    } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) );
++        // (b) we didn't get a frame and we can retry the input
++    } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
 +
 +    // Ensure that the frame contains nothing if we aren't returning a frame
 +    // (might happen when discarding)
@@ -48728,7 +52610,7 @@
 +        av_frame_unref(frame);
 +
 +    // If we got a frame this time ask for a pkt next time
-+    s->req_pkt = (dst_rv == 0);
++    s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
 +
 +#if 0
 +    if (dst_rv == 0)
@@ -48738,14 +52620,10 @@
 +            av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
 +            ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
 +            return -1;
-         }
-     }
++        }
++    }
 +#endif
- 
--dequeue:
--    if (!s->buf_pkt.size)
--        av_packet_unref(&avpkt);
--    return ff_v4l2_context_dequeue_frame(capture, frame, -1);
++
 +    return dst_rv == 0 ? 0 :
 +        src_rv < 0 ? src_rv :
 +        dst_rv < 0 ? dst_rv :
@@ -48774,15 +52652,114 @@
 +}
 +#endif
 +
++static int
++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++    unsigned int i;
++    const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
++    const uint32_t w = avctx->coded_width;
++    const uint32_t h = avctx->coded_height;
++    // Check coded size against VIDIOC_ENUM_FRAMESIZES for the capture format: 0 = usable (or check skipped), <0 = AVERROR
++    if (w == 0 || h == 0 || fcc == 0) {
++        av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
++        return 0;
++    }
++    if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) {
++        av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
++        return 0;
++    }
++
++    for (i = 0;; ++i) {
++        struct v4l2_frmsizeenum fs = {
++            .index = i,
++            .pixel_format = fcc,
++        };
++
++        while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
++            const int err = AVERROR(errno);
++            if (err == AVERROR(EINTR))
++                continue;
++            if (i == 0 && err == AVERROR(ENOTTY)) {
++                av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
++                return 0;
++            }
++            if (err != AVERROR(EINVAL)) {
++                av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s\n", av_err2str(err));
++                return err;
++            }
++            av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
++                   w, h, av_fourcc2str(fcc), i);
++            return err;
++        }
++
++        switch (fs.type) {
++            case V4L2_FRMSIZE_TYPE_DISCRETE:
++                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
++                       fs.discrete.width,fs.discrete.height);
++                if (w == fs.discrete.width && h == fs.discrete.height)
++                    return 0;
++                break;
++            case V4L2_FRMSIZE_TYPE_STEPWISE:
++                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++                       fs.stepwise.min_width, fs.stepwise.min_height,
++                       fs.stepwise.max_width, fs.stepwise.max_height,
++                       fs.stepwise.step_width,fs.stepwise.step_height);
++                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
++                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
++                    (fs.stepwise.step_width == 0 || (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0) &&  // zero step: no step constraint, avoids % 0
++                    (fs.stepwise.step_height == 0 || (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0))
++                    return 0;
++                break;
++            case V4L2_FRMSIZE_TYPE_CONTINUOUS:
++                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++                       fs.stepwise.min_width, fs.stepwise.min_height,
++                       fs.stepwise.max_width, fs.stepwise.max_height,
++                       fs.stepwise.step_width,fs.stepwise.step_height);
++                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
++                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
++                    return 0;
++                break;
++            default:
++                av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d\n", fs.type);
++                return AVERROR(EINVAL);
++        }
++    }
++}
++
++static int
++get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++    struct v4l2_capability cap;
++    // Query the driver name via VIDIOC_QUERYCAP and OR any matching quirk flags into s->quirks; 0 on success, AVERROR on ioctl failure
++    memset(&cap, 0, sizeof(cap));
++    while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
++        int err = errno;
++        if (err == EINTR)
++            continue;
++        av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", av_err2str(AVERROR(err)));
++        return AVERROR(err);
++    }
++
++    // Could be made table driven if we have a few more but right now there
++    // seems no point
++
++    // Meson (amlogic) always gives a resolution changed event after output
++    // streamon and userspace must (re)allocate capture buffers and streamon
++    // capture to clear the event even if the capture buffers were the right
++    // size in the first place.
++    if (strcmp(cap.driver, "meson-vdec") == 0)
++        s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN;
++
++    av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
++    return 0;
++}
++
++// This heuristic is for H264 but use for everything
 +static uint32_t max_coded_size(const AVCodecContext * const avctx)
 +{
 +    uint32_t wxh = avctx->coded_width * avctx->coded_height;
 +    uint32_t size;
 +
-+    // Currently the only thing we try to set our own limits for is H264
-+    if (avctx->codec_id != AV_CODEC_ID_H264)
-+        return 0;
-+
 +    size = wxh * 3 / 2;
 +    // H.264 Annex A table A-1 gives minCR which is either 2 or 4
 +    // unfortunately that doesn't yield an actually useful limit
@@ -48796,7 +52773,7 @@
  }
  
  static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-@@ -186,12 +539,28 @@ static av_cold int v4l2_decode_init(AVCo
+@@ -186,12 +692,29 @@ static av_cold int v4l2_decode_init(AVCo
      V4L2Context *capture, *output;
      V4L2m2mContext *s;
      V4L2m2mPriv *priv = avctx->priv_data;
@@ -48821,11 +52798,21 @@
          return ret;
  
 +    pts_stats_init(&s->pts_stat, avctx, "decoder");
++    s->pending_hw = PENDING_HW_MIN;
 +
      capture = &s->capture;
      output = &s->output;
  
-@@ -204,17 +573,43 @@ static av_cold int v4l2_decode_init(AVCo
+@@ -199,34 +722,127 @@ static av_cold int v4l2_decode_init(AVCo
+      * by the v4l2 driver; this event will trigger a full pipeline reconfig and
+      * the proper values will be retrieved from the kernel driver.
+      */
+-    output->height = capture->height = avctx->coded_height;
+-    output->width = capture->width = avctx->coded_width;
++//    output->height = capture->height = avctx->coded_height;
++//    output->width = capture->width = avctx->coded_width;
++    output->height = capture->height = 0;
++    output->width = capture->width = 0;
  
      output->av_codec_id = avctx->codec_id;
      output->av_pix_fmt  = AV_PIX_FMT_NONE;
@@ -48842,15 +52829,21 @@
 +     *       check the v4l2_get_drm_frame function.
 +     */
 +
++    avctx->sw_pix_fmt = avctx->pix_fmt;
 +    gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
-+    av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n",
-+           avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
++    av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
++           avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
++           avctx->coded_width, avctx->coded_height,
++           gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
 +
-+    s->output_drm = 0;
 +    if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
 +        avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
 +        s->output_drm = 1;
 +    }
++    else {
++        capture->av_pix_fmt = gf_pix_fmt;
++        s->output_drm = 0;
++    }
 +
 +    s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
 +    if (!s->device_ref) {
@@ -48872,14 +52865,23 @@
          return ret;
      }
  
-@@ -223,10 +618,53 @@ static av_cold int v4l2_decode_init(AVCo
+-    return v4l2_prepare_decoder(s);
++    if ((ret = v4l2_prepare_decoder(s)) < 0)
++        return ret;
++
++    if ((ret = get_quirks(avctx, s)) != 0)
++        return ret;
++
++    if ((ret = check_size(avctx, s)) != 0)
++        return ret;
++
++    return 0;
+ }
  
  static av_cold int v4l2_decode_close(AVCodecContext *avctx)
  {
 -    V4L2m2mPriv *priv = avctx->priv_data;
 -    V4L2m2mContext *s = priv->context;
--    av_packet_unref(&s->buf_pkt);
--    return ff_v4l2_m2m_codec_end(priv);
 +    int rv;
 +    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
 +    rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
@@ -48901,7 +52903,6 @@
 +    V4L2m2mContext * const s = priv->context;
 +    V4L2Context * const output = &s->output;
 +    V4L2Context * const capture = &s->capture;
-+    int ret, i;
 +
 +    av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
 +
@@ -48909,14 +52910,23 @@
 +    // states like EOS processing so don't try to optimize out (having got it
 +    // wrong once)
 +
-+    ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
-+    if (ret < 0)
-+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret);
++    ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
++
++    // Clear any buffered input packet
+     av_packet_unref(&s->buf_pkt);
+-    return ff_v4l2_m2m_codec_end(priv);
++
++    // Clear a pending EOS
++    if (ff_v4l2_ctx_eos(capture)) {
++        // Arguably we could delay this but this is easy and doesn't require
++        // thought or extra vars
++        ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
++        ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
++    }
 +
 +    // V4L2 makes no guarantees about whether decoded frames are flushed or not
 +    // so mark all frames we are tracking to be discarded if they appear
-+    for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i)
-+        s->track_els[i].discard = 1;
++    xlat_flush(&s->xlat);
 +
 +    // resend extradata
 +    s->extdata_sent = 0;
@@ -48930,7 +52940,7 @@
  }
  
  #define OFFSET(x) offsetof(V4L2m2mPriv, x)
-@@ -235,10 +673,16 @@ static av_cold int v4l2_decode_close(AVC
+@@ -235,10 +851,16 @@ static av_cold int v4l2_decode_close(AVC
  static const AVOption options[] = {
      V4L_M2M_DEFAULT_OPTS,
      { "num_capture_buffers", "Number of buffers in the capture context",
@@ -48948,7 +52958,7 @@
  #define M2MDEC_CLASS(NAME) \
      static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
          .class_name = #NAME "_v4l2m2m_decoder", \
-@@ -259,9 +703,15 @@ static const AVOption options[] = {
+@@ -259,9 +881,15 @@ static const AVOption options[] = {
          .init           = v4l2_decode_init, \
          .receive_frame  = v4l2_receive_frame, \
          .close          = v4l2_decode_close, \
@@ -48965,6 +52975,365 @@
          .wrapper_name   = "v4l2m2m", \
      }
  
+--- a/libavcodec/v4l2_m2m_enc.c
++++ b/libavcodec/v4l2_m2m_enc.c
+@@ -24,6 +24,8 @@
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
+ #include <search.h>
++#include <drm_fourcc.h>
++
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
+ #include "libavutil/pixdesc.h"
+@@ -37,6 +39,34 @@
+ #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
+ #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
+ 
++// P030 should be defined in drm_fourcc.h and hopefully will be sometime
++// in the future but until then...
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
++#endif
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++#ifndef V4L2_CID_CODEC_BASE
++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in videodev2.h and hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
+ static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
+ {
+     struct v4l2_streamparm parm = { 0 };
+@@ -147,15 +177,14 @@ static inline int v4l2_mpeg4_profile_fro
+ static int v4l2_check_b_frame_support(V4L2m2mContext *s)
+ {
+     if (s->avctx->max_b_frames)
+-        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
++        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
+ 
+-    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
++    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);
+     v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
+     if (s->avctx->max_b_frames == 0)
+         return 0;
+ 
+     avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
+-
+     return AVERROR_PATCHWELCOME;
+ }
+ 
+@@ -270,13 +299,184 @@ static int v4l2_prepare_encoder(V4L2m2mC
+     return 0;
+ }
+ 
++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame)
++{
++    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++    const uint32_t drm_fmt = src->layers[0].format;
++    // Treat INVALID as LINEAR
++    const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
++        DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
++    uint32_t pix_fmt = 0;
++    uint32_t w = 0;
++    uint32_t h = 0;
++    uint32_t bpl = src->layers[0].planes[0].pitch;
++    // Fills width/height/pixelformat/bytesperline of *format from the DRM descriptor; 0 on success, AVERROR(EINVAL) if unsupported
++    // We really don't expect multiple layers
++    // All formats that we currently cope with are single object
++    // A zero pitch is invalid and would divide by zero in the linear cases below
++    if (src->nb_layers != 1 || src->nb_objects != 1 || bpl == 0)
++        return AVERROR(EINVAL);
++
++    switch (drm_fmt) {
++        case DRM_FORMAT_YUV420:
++            if (mod == DRM_FORMAT_MOD_LINEAR) {
++                if (src->layers[0].nb_planes != 3)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_YUV420;
++                h = src->layers[0].planes[1].offset / bpl;
++                w = bpl;
++            }
++            break;
++
++        case DRM_FORMAT_NV12:
++            if (mod == DRM_FORMAT_MOD_LINEAR) {
++                if (src->layers[0].nb_planes != 2)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_NV12;
++                h = src->layers[0].planes[1].offset / bpl;
++                w = bpl;
++            }
++            else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++                if (src->layers[0].nb_planes != 2)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_NV12_COL128;
++                w = bpl;
++                h = src->layers[0].planes[1].offset / 128;
++                bpl = fourcc_mod_broadcom_param(mod);
++            }
++            break;
++
++        case DRM_FORMAT_P030:
++            if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++                if (src->layers[0].nb_planes != 2)
++                    break;
++                pix_fmt =  V4L2_PIX_FMT_NV12_10_COL128;
++                w = bpl / 2;  // Matching lie to how we construct this
++                h = src->layers[0].planes[1].offset / 128;
++                bpl = fourcc_mod_broadcom_param(mod);
++            }
++            break;
++
++        default:
++            break;
++    }
++
++    if (!pix_fmt)
++        return AVERROR(EINVAL);
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
++        struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
++
++        pix->width = w;
++        pix->height = h;
++        pix->pixelformat = pix_fmt;
++        pix->plane_fmt[0].bytesperline = bpl;
++        pix->num_planes = 1;
++    }
++    else {
++        struct v4l2_pix_format *const pix = &format->fmt.pix;
++
++        pix->width = w;
++        pix->height = h;
++        pix->pixelformat = pix_fmt;
++        pix->bytesperline = bpl;
++    }
++
++    return 0;
++}
++
++// Do we have similar enough formats to be usable? Returns 1 if type, pixelformat and bytesperline all match, else 0 (width/height are not compared)
++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b)
++{
++    if (a->type != b->type)
++        return 0;
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
++        const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp;
++        const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp;
++        unsigned int i;
++        if (pa->pixelformat != pb->pixelformat ||
++            pa->num_planes != pb->num_planes)
++            return 0;
++        for (i = 0; i != pa->num_planes; ++i) {  // every plane's stride must match
++            if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline)
++                return 0;
++        }
++    }
++    else {
++        const struct v4l2_pix_format *const pa = &a->fmt.pix;
++        const struct v4l2_pix_format *const pb = &b->fmt.pix;
++        if (pa->pixelformat != pb->pixelformat ||
++            pa->bytesperline != pb->bytesperline)
++            return 0;
++    }
++    return 1;
++}
++
++
+ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
+ {
+     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+     V4L2Context *const output = &s->output;
+ 
++    // Signal EOF if needed
++    if (!frame) {
++        return ff_v4l2_context_enqueue_frame(output, frame);
++    }
++
++    if (s->input_drm && !output->streamon) {
++        int rv;
++        struct v4l2_format req_format = {.type = output->format.type};
++
++        // Set format when we first get a buffer
++        if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
++            av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
++            return rv;
++        }
++
++        ff_v4l2_context_release(output);
++
++        output->format = req_format;
++
++        if ((rv = ff_v4l2_context_set_format(output)) != 0) {
++            av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
++            return rv;
++        }
++
++        if (!fmt_eq(&req_format, &output->format)) {
++            av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
++            return AVERROR(EINVAL);
++        }
++
++        output->selection.top = frame->crop_top;
++        output->selection.left = frame->crop_left;
++        output->selection.width = av_frame_cropped_width(frame);
++        output->selection.height = av_frame_cropped_height(frame);
++
++        if ((rv = ff_v4l2_context_init(output)) != 0) {
++            av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
++            return rv;
++        }
++
++        {
++            struct v4l2_selection selection = {
++                .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
++                .target = V4L2_SEL_TGT_CROP,
++                .r = output->selection
++            };
++            if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) {
++                av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
++                       selection.r.width, selection.r.height, selection.r.left, selection.r.top,
++                       av_err2str(AVERROR(errno)));
++            }
++            av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n",
++                   selection.r.width, selection.r.height, selection.r.left, selection.r.top);
++        }
++    }
++
+ #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
+-    if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
++    if (frame->pict_type == AV_PICTURE_TYPE_I)
+         v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
+ #endif
+ 
+@@ -310,7 +510,70 @@ static int v4l2_receive_packet(AVCodecCo
+     }
+ 
+ dequeue:
+-    return ff_v4l2_context_dequeue_packet(capture, avpkt);
++    if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
++        return ret;
++
++    if (capture->first_buf == 1) {
++        uint8_t * data;
++        const int len = avpkt->size;
++
++        // 1st buffer after streamon should be SPS/PPS
++        capture->first_buf = 2;
++
++        // Clear both possible stores so there is no chance of confusion
++        av_freep(&s->extdata_data);
++        s->extdata_size = 0;
++        av_freep(&avctx->extradata);
++        avctx->extradata_size = 0;
++
++        if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL)
++            memcpy(data, avpkt->data, len);
++
++        av_packet_unref(avpkt);
++
++        if (data == NULL)
++            return AVERROR(ENOMEM);
++
++        // We need to copy the header, but keep local if not global
++        if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
++            avctx->extradata = data;
++            avctx->extradata_size = len;
++        }
++        else {
++            s->extdata_data = data;
++            s->extdata_size = len;
++        }
++
++        if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
++            return ret;
++    }
++
++    // First frame must be key so mark as such even if encoder forgot
++    if (capture->first_buf == 2)
++        avpkt->flags |= AV_PKT_FLAG_KEY;
++
++    // Add SPS/PPS to the start of every key frame if non-global headers
++    if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
++        const size_t newlen = s->extdata_size + avpkt->size;
++        AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
++
++        if (buf == NULL) {
++            av_packet_unref(avpkt);
++            return AVERROR(ENOMEM);
++        }
++
++        memcpy(buf->data, s->extdata_data, s->extdata_size);
++        memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
++
++        av_buffer_unref(&avpkt->buf);
++        avpkt->buf = buf;
++        avpkt->data = buf->data;
++        avpkt->size = newlen;
++    }
++
++//    av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
++    capture->first_buf = 0;
++    return 0;
+ }
+ 
+ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+@@ -322,6 +585,8 @@ static av_cold int v4l2_encode_init(AVCo
+     uint32_t v4l2_fmt_output;
+     int ret;
+ 
++    av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
++
+     ret = ff_v4l2_m2m_create_context(priv, &s);
+     if (ret < 0)
+         return ret;
+@@ -329,13 +594,17 @@ static av_cold int v4l2_encode_init(AVCo
+     capture = &s->capture;
+     output  = &s->output;
+ 
++    s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
++
+     /* common settings output/capture */
+     output->height = capture->height = avctx->height;
+     output->width = capture->width = avctx->width;
+ 
+     /* output context */
+     output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+-    output->av_pix_fmt = avctx->pix_fmt;
++    output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
++            avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
++            AV_PIX_FMT_YUV420P;
+ 
+     /* capture context */
+     capture->av_codec_id = avctx->codec_id;
+@@ -354,7 +623,7 @@ static av_cold int v4l2_encode_init(AVCo
+         v4l2_fmt_output = output->format.fmt.pix.pixelformat;
+ 
+     pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
+-    if (pix_fmt_output != avctx->pix_fmt) {
++    if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
+         const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
+         av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
+         return AVERROR(EINVAL);
 --- /dev/null
 +++ b/libavcodec/v4l2_req_decode_q.c
 @@ -0,0 +1,84 @@
@@ -49879,8 +54248,14 @@
 +#include "v4l2_req_hevc_vx.c"
 +
 --- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v3.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 3
++#include "v4l2_req_hevc_vx.c"
++
+--- /dev/null
 +++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -0,0 +1,1211 @@
+@@ -0,0 +1,1228 @@
 +// File included by v4l2_req_hevc_v* - not compiled on its own
 +
 +#include "decode.h"
@@ -49897,6 +54272,8 @@
 +
 +#elif HEVC_CTRLS_VERSION == 2
 +#include "hevc-ctrls-v2.h"
++#elif HEVC_CTRLS_VERSION == 3
++#include "hevc-ctrls-v3.h"
 +#else
 +#error Unknown HEVC_CTRLS_VERSION
 +#endif
@@ -50028,6 +54405,7 @@
 +    }
 +}
 +
++#if HEVC_CTRLS_VERSION <= 2
 +static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
 +{
 +    const HEVCFrame *frame;
@@ -50053,6 +54431,7 @@
 +
 +    return 0;
 +}
++#endif
 +
 +static unsigned int
 +get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
@@ -50128,7 +54507,12 @@
 +            struct v4l2_hevc_dpb_entry * const entry = entries + n++;
 +
 +            entry->timestamp = frame_capture_dpb(frame->frame);
++#if HEVC_CTRLS_VERSION <= 2
 +            entry->rps = find_frame_rps_type(h, entry->timestamp);
++#else
++            entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
++                V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
++#endif
 +            entry->field_pic = frame->frame->interlaced_frame;
 +
 +            /* TODO: Interleaved: Get the POC for each field. */
@@ -50892,14 +55276,22 @@
 +    };
 +    const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
 +
++#if HEVC_CTRLS_VERSION == 2
++    if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0))
++        return AVERROR(EINVAL);
++#elif HEVC_CTRLS_VERSION == 3
++    if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0))
++        return AVERROR(EINVAL);
++#endif
++
 +    if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) {
 +        av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION);
 +        return AVERROR(EINVAL);
 +    }
 +    for (i = 0; i != noof_ctrls; ++i) {
-+        if (ctrl_sizes[i] != qc[i].elem_size) {
-+            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %u != %u\n",
-+                   HEVC_CTRLS_VERSION, i, ctrl_sizes[i], qc[i].elem_size);
++        if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
++            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
++                   HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
 +            return AVERROR(EINVAL);
 +        }
 +    }
@@ -51094,7 +55486,7 @@
 +
 --- /dev/null
 +++ b/libavcodec/v4l2_req_media.c
-@@ -0,0 +1,1596 @@
+@@ -0,0 +1,1601 @@
 +/*
 + * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
 + *
@@ -51701,6 +56093,7 @@
 +
 +    struct v4l2_format src_fmt;
 +    struct v4l2_format dst_fmt;
++    struct v4l2_capability capability;
 +};
 +
 +static int qe_v4l2_queue(struct qent_base *const be,
@@ -51871,13 +56264,13 @@
 +{
 +    if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
 +        size_t newsize = round_up_size(len);
-+        request_log("%s: Overrun %d > %d; trying %d\n", __func__, len, dmabuf_size(be->dh[0]), newsize);
++        request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize);
 +        if (!dbsc) {
 +            request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
 +            return -ENOMEM;
 +        }
 +        if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
-+            request_log("%s: Realloc %d failed\n", __func__, newsize);
++            request_log("%s: Realloc %zd failed\n", __func__, newsize);
 +            return -ENOMEM;
 +        }
 +    }
@@ -52595,20 +56988,24 @@
 +    mediabufs_ctl_delete(mbc);
 +}
 +
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
++{
++    return mbc->capability.version; /* as filled in by VIDIOC_QUERYCAP; compare with MEDIABUFS_DRIVER_VERSION(a, b, c) */
++}
++
 +static int set_capabilities(struct mediabufs_ctl *const mbc)
 +{
-+    struct v4l2_capability capability = { 0 };
 +    uint32_t caps;
 +
-+    if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) {
++    if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) {
 +        int err = errno;
 +        request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
 +        return -err;
 +    }
 +
-+    caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
-+            capability.device_caps :
-+            capability.capabilities;
++    caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
++            mbc->capability.device_caps :
++            mbc->capability.capabilities;
 +
 +    if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
 +        mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
@@ -52693,7 +57090,7 @@
 +
 --- /dev/null
 +++ b/libavcodec/v4l2_req_media.h
-@@ -0,0 +1,151 @@
+@@ -0,0 +1,154 @@
 +/*
 +e.h
 +*
@@ -52838,6 +57235,9 @@
 +                  struct dmabufs_ctl * const dbsc,
 +                  unsigned int n);
 +
++#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
++
 +struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
 +                     const char *vpath, struct pollqueue *const pq);
 +void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
@@ -53257,7 +57657,7 @@
 +
 --- /dev/null
 +++ b/libavcodec/v4l2_request_hevc.c
-@@ -0,0 +1,296 @@
+@@ -0,0 +1,311 @@
 +/*
 + * This file is part of FFmpeg.
 + *
@@ -53406,6 +57806,17 @@
 +
 +    av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
 +
++    // Give up immediately if this is something that we have no code to deal with
++    if (h->ps.sps->chroma_format_idc != 1) {
++        av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
++        return AVERROR_PATCHWELCOME;
++    }
++    if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
++        h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
++        av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
++        return AVERROR_PATCHWELCOME;
++    }
++
 +    if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
 +        av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
 +        return (AVERROR(-ret));
@@ -53458,7 +57869,11 @@
 +        goto fail4;
 +    }
 +
-+    if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
++    if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
++        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
++        ctx->fns = &V2(ff_v4l2_req_hevc, 3);
++    }
++    else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
 +        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
 +        ctx->fns = &V2(ff_v4l2_req_hevc, 2);
 +    }
@@ -53556,7 +57971,7 @@
 +};
 --- /dev/null
 +++ b/libavcodec/v4l2_request_hevc.h
-@@ -0,0 +1,101 @@
+@@ -0,0 +1,102 @@
 +#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
 +#define AVCODEC_V4L2_REQUEST_HEVC_H
 +
@@ -53656,8 +58071,111 @@
 +
 +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
 +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
 +
 +#endif
+--- a/libavcodec/vc1dec.c
++++ b/libavcodec/vc1dec.c
+@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCod
+             size = next - start - 4;
+             if (size <= 0)
+                 continue;
+-            buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
++            buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+             init_get_bits(&gb, buf2, buf2_size * 8);
+             switch (AV_RB32(start)) {
+             case VC1_CODE_SEQHDR:
+@@ -689,7 +689,7 @@ static int vc1_decode_frame(AVCodecConte
+                 case VC1_CODE_FRAME:
+                     if (avctx->hwaccel)
+                         buf_start = start;
+-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
++                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+                     break;
+                 case VC1_CODE_FIELD: {
+                     int buf_size3;
+@@ -706,8 +706,8 @@ static int vc1_decode_frame(AVCodecConte
+                         ret = AVERROR(ENOMEM);
+                         goto err;
+                     }
+-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
+-                                                    slices[n_slices].buf);
++                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
++                                                              slices[n_slices].buf);
+                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+                                   buf_size3 << 3);
+                     slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
+@@ -718,7 +718,7 @@ static int vc1_decode_frame(AVCodecConte
+                     break;
+                 }
+                 case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
+-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
++                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+                     init_get_bits(&s->gb, buf2, buf_size2 * 8);
+                     ff_vc1_decode_entry_point(avctx, v, &s->gb);
+                     break;
+@@ -735,8 +735,8 @@ static int vc1_decode_frame(AVCodecConte
+                         ret = AVERROR(ENOMEM);
+                         goto err;
+                     }
+-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
+-                                                    slices[n_slices].buf);
++                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
++                                                              slices[n_slices].buf);
+                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+                                   buf_size3 << 3);
+                     slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
+@@ -770,7 +770,7 @@ static int vc1_decode_frame(AVCodecConte
+                     ret = AVERROR(ENOMEM);
+                     goto err;
+                 }
+-                buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
++                buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
+                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+                               buf_size3 << 3);
+                 slices[n_slices].mby_start = s->mb_height + 1 >> 1;
+@@ -779,9 +779,9 @@ static int vc1_decode_frame(AVCodecConte
+                 n_slices1 = n_slices - 1;
+                 n_slices++;
+             }
+-            buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
++            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
+         } else {
+-            buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
++            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
+         }
+         init_get_bits(&s->gb, buf2, buf_size2*8);
+     } else
+--- a/libavcodec/vc1dsp.c
++++ b/libavcodec/vc1dsp.c
+@@ -32,6 +32,7 @@
+ #include "rnd_avg.h"
+ #include "vc1dsp.h"
+ #include "startcode.h"
++#include "vc1_common.h"
+ 
+ /* Apply overlap transform to horizontal edge */
+ static void vc1_v_overlap_c(uint8_t *src, int stride)
+@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContex
+ #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
+ 
+     dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
++    dsp->vc1_unescape_buffer      = vc1_unescape_buffer;
+ 
+     if (ARCH_AARCH64)
+         ff_vc1dsp_init_aarch64(dsp);
+--- a/libavcodec/vc1dsp.h
++++ b/libavcodec/vc1dsp.h
+@@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
+      * one or more further zero bytes and a one byte.
+      */
+     int (*startcode_find_candidate)(const uint8_t *buf, int size);
++
++    /* Copy a buffer, removing startcode emulation escape bytes as we go */
++    int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
+ } VC1DSPContext;
+ 
+ void ff_vc1dsp_init(VC1DSPContext* c);
 --- /dev/null
 +++ b/libavcodec/weak_link.c
 @@ -0,0 +1,102 @@
@@ -54461,7 +58979,7 @@
 +
 --- /dev/null
 +++ b/libavdevice/egl_vout.c
-@@ -0,0 +1,825 @@
+@@ -0,0 +1,816 @@
 +/*
 + * Copyright (c) 2020 John Cox for Raspberry Pi Trading
 + *
@@ -54504,16 +59022,8 @@
 +#include <stdatomic.h>
 +#include <unistd.h>
 +
-+#include "drm_fourcc.h"
-+#include <drm.h>
-+#include <drm_mode.h>
-+#include <xf86drm.h>
-+#include <xf86drmMode.h>
 +#include <X11/Xlib.h>
 +#include <X11/Xutil.h>
-+#include <X11/Xlib-xcb.h>
-+#include <xcb/xcb.h>
-+#include <xcb/dri3.h>
 +
 +#include "libavutil/rpi_sand_fns.h"
 +
@@ -54725,8 +59235,7 @@
 +   XMapWindow(dpy, win);
 +
 +   {
-+      EGLSurface surf = eglCreateWindowSurface(egl_dpy, config,
-+                                               (void *)(uintptr_t)win, NULL);
++      EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
 +      if (!surf) {
 +         av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
 +         return -1;
@@ -57662,7 +62171,7 @@
 +
 --- /dev/null
 +++ b/libavutil/aarch64/rpi_sand_neon.S
-@@ -0,0 +1,676 @@
+@@ -0,0 +1,781 @@
 +/*
 +Copyright (c) 2021 Michael Eiler
 +
@@ -57913,228 +62422,6 @@
 +    ret
 +endfunc
 +
-+//void ff_rpi_sand30_lines_to_planar_y16(
-+//  uint8_t * dest,             // [x0]
-+//  unsigned int dst_stride,    // [w1] -> assumed to be equal to _w
-+//  const uint8_t * src,        // [x2]
-+//  unsigned int src_stride1,   // [w3] -> 128
-+//  unsigned int src_stride2,   // [w4]
-+//  unsigned int _x,            // [w5]
-+//  unsigned int y,             // [w6]
-+//  unsigned int _w,            // [w7]
-+//  unsigned int h);            // [sp, #0]
-+
-+function ff_rpi_sand30_lines_to_planar_y16, export=1
-+    stp x19, x20, [sp, #-48]!
-+    stp x21, x22, [sp, #16]
-+    stp x23, x24, [sp, #32]
-+
-+    // w6 = argument h
-+    ldr w6, [sp, #48]
-+
-+    // slice_inc = ((stride2 - 1) * stride1)
-+    mov w5, w4
-+    sub w5, w5, #1
-+    lsl w5, w5, #7
-+
-+    // total number of bytes per row = (width / 3) * 4
-+    mov w8, w7
-+    mov w9, #3
-+    udiv w8, w8, w9
-+    lsl w8, w8, #2
-+
-+    // number of full 128 byte blocks to be processed
-+    mov w9, #96
-+    udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96
-+
-+    // w10 = number of full integers to process (4 bytes)
-+    // w11 = remaning zero to two 10bit values still to copy over
-+    mov w12, #96
-+    mul w12, w9, w12
-+    sub w12, w7, w12  // width - blocks*96 = remaining points per row
-+    mov w11, #3
-+    udiv w10, w12, w11 // full integers to process = w12 / 3 
-+    mul w11, w10, w11  // #integers *3
-+    sub w11, w12, w11  // remaining 0-2 points = remaining points - integers*3
-+
-+    // increase w9 by one if w10+w11 is not zero, and decrease the row count by one
-+    // this is to efficiently copy incomplete blocks at the end of the rows
-+    // the last row is handled explicitly to avoid writing out of bounds
-+    add w22, w10, w11
-+    cmp w22, #0
-+    cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise
-+    add w9, w9, w22
-+    sub w6, w6, #1
-+
-+    // store the number of bytes in w20 which we copy too much for every row
-+    // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values)
-+    mov w20, #96*2
-+    mul w20, w20, w9
-+    sub w20, w1, w20
-+
-+    mov w23, #0 // flag to check whether the last line had already been processed
-+    
-+    // bitmask to clear the uppper 6bits of the result values
-+    mov x19, #0x03ff03ff03ff03ff
-+    dup v22.2d, x19
-+
-+    // row counter = 0
-+    eor w12, w12, w12
-+row_loop_y16:
-+    cmp w12, w6               // jump to row_loop_y16_fin if we processed all rows
-+    bge row_loop_y16_fin
-+
-+    mov x13, x2               // row src
-+    eor w14, w14, w14         // full block counter
-+block_loop_y16:
-+    cmp w14, w9
-+    bge block_loop_y16_fin
-+
-+    // load 64 bytes
-+    ld1 { v0.4s,  v1.4s, v2.4s, v3.4s }, [x13], #64
-+   
-+    // process v0 and v1
-+    xtn v16.4h, v0.4s
-+    ushr v0.4s, v0.4s, #10
-+    xtn v17.4h, v0.4s
-+    ushr v0.4s, v0.4s, #10
-+    xtn v18.4h, v0.4s
-+   
-+    xtn2 v16.8h, v1.4s
-+    and v16.16b, v16.16b, v22.16b
-+    ushr v1.4s, v1.4s, #10
-+    xtn2 v17.8h, v1.4s
-+    and v17.16b, v17.16b, v22.16b
-+    ushr v1.4s, v1.4s, #10
-+    xtn2 v18.8h, v1.4s
-+    and v18.16b, v18.16b, v22.16b
-+
-+    st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
-+
-+    // process v2 and v3
-+    xtn v23.4h, v2.4s
-+    ushr v2.4s, v2.4s, #10
-+    xtn v24.4h, v2.4s
-+    ushr v2.4s, v2.4s, #10
-+    xtn v25.4h, v2.4s
-+    
-+    xtn2 v23.8h, v3.4s
-+    and v23.16b, v23.16b, v22.16b
-+    ushr v3.4s, v3.4s, #10
-+    xtn2 v24.8h, v3.4s
-+    and v24.16b, v24.16b, v22.16b
-+    ushr v3.4s, v3.4s, #10
-+    xtn2 v25.8h, v3.4s
-+    and v25.16b, v25.16b, v22.16b
-+
-+    st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
-+
-+    // load the second half of the block -> 64 bytes into registers v4-v7
-+    ld1 { v4.4s,  v5.4s,  v6.4s,  v7.4s }, [x13], #64
-+    
-+    // process v4 and v5
-+    xtn v16.4h, v4.4s
-+    ushr v4.4s, v4.4s, #10
-+    xtn v17.4h, v4.4s
-+    ushr v4.4s, v4.4s, #10
-+    xtn v18.4h, v4.4s
-+   
-+    xtn2 v16.8h, v5.4s 
-+    and v16.16b, v16.16b, v22.16b
-+    ushr v5.4s, v5.4s, #10
-+    xtn2 v17.8h, v5.4s
-+    and v17.16b, v17.16b, v22.16b
-+    ushr v5.4s, v5.4s, #10
-+    xtn2 v18.8h, v5.4s
-+    and v18.16b, v18.16b, v22.16b
-+
-+    st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
-+
-+    // v6 and v7
-+    xtn v23.4h, v6.4s
-+    ushr v6.4s, v6.4s, #10
-+    xtn v24.4h, v6.4s
-+    ushr v6.4s, v6.4s, #10
-+    xtn v25.4h, v6.4s
-+   
-+    xtn2 v23.8h, v7.4s 
-+    and v23.16b, v23.16b, v22.16b
-+    ushr v7.4s, v7.4s, #10
-+    xtn2 v24.8h, v7.4s
-+    and v24.16b, v24.16b, v22.16b
-+    ushr v7.4s, v7.4s, #10
-+    xtn2 v25.8h, v7.4s
-+    and v25.16b, v25.16b, v22.16b
-+
-+    st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
-+ 
-+    add x13, x13, x5          // row src += slice_inc
-+    add w14, w14, #1
-+    b block_loop_y16
-+block_loop_y16_fin:
-+
-+    
-+
-+
-+    add x2, x2, #128          // src += stride1 (start of the next row)
-+    add x0, x0, w20, sxtw     // subtract the bytes we copied too much from dst
-+    add w12, w12, #1
-+    b row_loop_y16
-+row_loop_y16_fin:
-+
-+    // check whether we have incomplete blocks at the end of every row
-+    // in that case decrease row block count by one
-+    // change height back to it's original value (meaning increase it by 1)
-+    // and jump back to another iteration of row_loop_y16
-+
-+    cmp w23, #1
-+    beq row_loop_y16_fin2 // don't continue here if we already processed the last row
-+    add w6, w6, #1    // increase height to the original value
-+    sub w9, w9, w22   // block count - 1 or 0, depending on the remaining bytes count
-+    mov w23, #1
-+    b row_loop_y16
-+row_loop_y16_fin2:
-+
-+    sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference
-+
-+    // now we've got to handle the last block in the last row
-+    eor w12, w12, w12 // w12 = 0 = counter
-+integer_loop_y16:
-+    cmp w12, w10
-+    bge integer_loop_y16_fin
-+    ldr w14, [x13], #4
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+    lsr w14, w14, #10
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+    lsr w14, w14, #10
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+    add w12, w12, #1
-+    b integer_loop_y16
-+integer_loop_y16_fin:
-+
-+final_values_y16:
-+    // remaining point count = w11
-+    ldr w14, [x13], #4
-+    cmp w11, #0
-+    beq final_values_y16_fin
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+    cmp w11, #1
-+    beq final_values_y16_fin
-+    lsr w14, w14, #10
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+final_values_y16_fin:
-+
-+    ldp x23, x24, [sp, #32]
-+    ldp x21, x22, [sp, #16]
-+    ldp x19, x20, [sp], #48
-+    ret
-+endfunc
-+
 +//void ff_rpi_sand30_lines_to_planar_c16(
 +//  uint8_t * dst_u,            // [x0]
 +//  unsigned int dst_stride_u,  // [w1] == _w*2
@@ -58339,9 +62626,336 @@
 +//  unsigned int _w,
 +//  unsigned int h);
 +
++// void ff_rpi_sand30_lines_to_planar_y16(
++//   uint8_t * dest,            : x0
++//   unsigned int dst_stride,   : w1
++//   const uint8_t * src,       : x2
++//   unsigned int src_stride1,  : w3, always 128
++//   unsigned int src_stride2,  : w4
++//   unsigned int _x,           : w5
++//   unsigned int y,            : w6
++//   unsigned int _w,           : w7
++//   unsigned int h);           : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for writing.
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++                lsl             w4,  w4,  #7              // w4 = src_stride2 * 128
++                sub             w4,  w4,  #64             // less the 64 bytes the first ld1 already advanced
++                sub             w1,  w1,  w7, lsl #1      // w1 = dst_stride - 2 * _w, added to dest after each row
++                uxtw            x6,  w6
++                add             x8,  x2,  x6, lsl #7      // x8 = src + y * 128 = start of first source row
++                ldr             w6,  [sp, #0]             // w6 = h (stack argument), used as the row counter
++
++10:                                                       // per-row loop
++                mov             x2,  x8
++                mov             w5,  w7                   // w5 = pixels still to write in this row
++1:                                                        // per-128-byte-block loop (96 pixels per block)
++                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++                subs            w5,  w5,  #96             // flags from here are consumed by blt/bne below
++
++                // v0, v1 -> v16 (bits 0-9), v17 (bits 10-19), v18 (bits 20-29) of each word
++
++                shrn            v18.4h,  v0.4s,   #14
++                xtn             v16.4h,  v0.4s
++                shrn            v17.4h,  v0.4s,   #10
++
++                shrn2           v18.8h,  v1.4s,   #14
++                xtn2            v16.8h,  v1.4s
++                shrn2           v17.8h,  v1.4s,   #10
++
++                ushr            v18.8h,  v18.8h,  #6      // 14 + 6 = 20: leaves word bits 20-29
++                bic             v16.8h,  #0xfc,   lsl #8  // clear top 6 bits, keep 10-bit sample
++                bic             v17.8h,  #0xfc,   lsl #8
++
++                // v2, v3 -> v19, v20, v21 (same extraction)
++
++                shrn            v21.4h,  v2.4s,   #14
++                xtn             v19.4h,  v2.4s
++                shrn            v20.4h,  v2.4s,   #10
++
++                shrn2           v21.8h,  v3.4s,   #14
++                xtn2            v19.8h,  v3.4s
++                shrn2           v20.8h,  v3.4s,   #10
++
++                ushr            v21.8h,  v21.8h,  #6
++                bic             v19.8h,  #0xfc,   lsl #8
++                bic             v20.8h,  #0xfc,   lsl #8
++
++                // v4, v5 -> v22, v23, v24 (same extraction)
++
++                shrn            v24.4h,  v4.4s,   #14
++                xtn             v22.4h,  v4.4s
++                shrn            v23.4h,  v4.4s,   #10
++
++                shrn2           v24.8h,  v5.4s,   #14
++                xtn2            v22.8h,  v5.4s
++                shrn2           v23.8h,  v5.4s,   #10
++
++                ushr            v24.8h,  v24.8h,  #6
++                bic             v22.8h,  #0xfc,   lsl #8
++                bic             v23.8h,  #0xfc,   lsl #8
++
++                // v6, v7 -> v25, v26, v27 (same extraction)
++
++                shrn            v27.4h,  v6.4s,   #14
++                xtn             v25.4h,  v6.4s
++                shrn            v26.4h,  v6.4s,   #10
++
++                shrn2           v27.8h,  v7.4s,   #14
++                xtn2            v25.8h,  v7.4s
++                shrn2           v26.8h,  v7.4s,   #10
++
++                ushr            v27.8h,  v27.8h,  #6
++                bic             v25.8h,  #0xfc,   lsl #8
++                bic             v26.8h,  #0xfc,   lsl #8
++
++                blt             2f                        // fewer than 96 pixels left: partial write
++
++                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48   // each st3 interleaves 24 pixels
++                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
++                st3             {v22.8h, v23.8h, v24.8h}, [x0], #48
++                st3             {v25.8h, v26.8h, v27.8h}, [x0], #48
++
++                bne             1b                        // more pixels in this row
++
++11:                                                       // end of row
++                subs            w6,  w6,  #1
++                add             x0,  x0,  w1,  uxtw       // skip dest padding to the next row
++                add             x8,  x8,  #128            // next source row
++                bne             10b
++
++                ret
++
++// Partial final write: w5 = (pixels left) - 96; store 48/24/12/6/3/2/1 as needed
++2:
++                cmp             w5,  #48-96               // at least 48 pixels left?
++                blt             1f
++                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
++                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
++                beq             11b                       // exactly 48: row done
++                mov             v16.16b, v22.16b          // shift the unwritten 48 pixels down to v16-v21
++                mov             v17.16b, v23.16b
++                sub             w5,  w5,  #48
++                mov             v18.16b, v24.16b
++                mov             v19.16b, v25.16b
++                mov             v20.16b, v26.16b
++                mov             v21.16b, v27.16b
++1:
++                cmp             w5,  #24-96               // at least 24 pixels left?
++                blt             1f
++                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
++                beq             11b
++                mov             v16.16b, v19.16b
++                mov             v17.16b, v20.16b
++                sub             w5,  w5,  #24
++                mov             v18.16b, v21.16b
++1:
++                cmp             w5,  #12-96               // at least 12 pixels left?
++                blt             1f
++                st3             {v16.4h, v17.4h, v18.4h}, [x0], #24
++                beq             11b
++                mov             v16.2d[0], v16.2d[1]      // upper halves become the next candidates
++                sub             w5,  w5,  #12
++                mov             v17.2d[0], v17.2d[1]
++                mov             v18.2d[0], v18.2d[1]
++1:
++                cmp             w5,  #6-96                // at least 6 pixels left?
++                blt             1f
++                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
++                st3             {v16.h, v17.h, v18.h}[1], [x0], #6
++                beq             11b
++                mov             v16.2s[0], v16.2s[1]
++                sub             w5,  w5,  #6
++                mov             v17.2s[0], v17.2s[1]
++                mov             v18.2s[0], v18.2s[1]
++1:
++                cmp             w5,  #3-96                // at least 3 pixels left?
++                blt             1f
++                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
++                beq             11b
++                mov             v16.4h[0], v16.4h[1]      // at most 2 pixels remain, so v18 is not needed
++                sub             w5,  w5,  #3
++                mov             v17.4h[0], v17.4h[1]
++1:
++                cmp             w5,  #2-96                // 2 pixels left, else exactly 1
++                blt             1f
++                st2             {v16.h, v17.h}[0], [x0], #4
++                b               11b
++1:
++                st1             {v16.h}[0], [x0], #2
++                b               11b
++
++endfunc
++
++// void ff_rpi_sand30_lines_to_planar_y8(
++//   uint8_t * dest,            : x0
++//   unsigned int dst_stride,   : w1
++//   const uint8_t * src,       : x2
++//   unsigned int src_stride1,  : w3, always 128
++//   unsigned int src_stride2,  : w4
++//   unsigned int _x,           : w5
++//   unsigned int y,            : w6
++//   unsigned int _w,           : w7
++//   unsigned int h);           : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for writing.
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++                lsl             w4,  w4,  #7              // w4 = src_stride2 * 128
++                sub             w4,  w4,  #64             // less the 64 bytes the first ld1 already advanced
++                sub             w1,  w1,  w7              // w1 = dst_stride - _w, added to dest after each row
++                uxtw            x6,  w6
++                add             x8,  x2,  x6, lsl #7      // x8 = src + y * 128 = start of first source row
++                ldr             w6,  [sp, #0]             // w6 = h (stack argument), used as the row counter
++
++10:                                                       // per-row loop
++                mov             x2,  x8
++                mov             w5,  w7                   // w5 = pixels still to write in this row
++1:                                                        // per-128-byte-block loop (96 pixels per block)
++                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++                subs            w5,  w5,  #96             // flags from here are consumed by blt/bne below
++
++                // v0, v1 -> low halves of v16/v17/v18 (top 8 bits of each 10-bit sample)
++
++                shrn            v18.4h,  v0.4s,   #16
++                xtn             v16.4h,  v0.4s
++                shrn            v17.4h,  v0.4s,   #12
++
++                shrn2           v18.8h,  v1.4s,   #16
++                xtn2            v16.8h,  v1.4s
++                shrn2           v17.8h,  v1.4s,   #12
++
++                shrn            v18.8b,  v18.8h,  #6      // 16 + 6 = 22: word bits 22-29
++                shrn            v16.8b,  v16.8h,  #2      // word bits 2-9
++                xtn             v17.8b,  v17.8h           // word bits 12-19
++
++                // v2, v3 -> high halves of v16/v17/v18 (v19-v21 are scratch here)
++
++                shrn            v21.4h,  v2.4s,   #16
++                xtn             v19.4h,  v2.4s
++                shrn            v20.4h,  v2.4s,   #12
++
++                shrn2           v21.8h,  v3.4s,   #16
++                xtn2            v19.8h,  v3.4s
++                shrn2           v20.8h,  v3.4s,   #12
++
++                shrn2           v18.16b, v21.8h,  #6
++                shrn2           v16.16b, v19.8h,  #2
++                xtn2            v17.16b, v20.8h
++
++                // v4, v5 -> low halves of v19/v20/v21 (v22-v24 are 16-bit scratch only)
++
++                shrn            v24.4h,  v4.4s,   #16
++                xtn             v22.4h,  v4.4s
++                shrn            v23.4h,  v4.4s,   #12
++
++                shrn2           v24.8h,  v5.4s,   #16
++                xtn2            v22.8h,  v5.4s
++                shrn2           v23.8h,  v5.4s,   #12
++
++                shrn            v21.8b,  v24.8h,  #6
++                shrn            v19.8b,  v22.8h,  #2
++                xtn             v20.8b,  v23.8h
++
++                // v6, v7 -> high halves of v19/v20/v21 (v25-v27 are 16-bit scratch only)
++
++                shrn            v27.4h,  v6.4s,   #16
++                xtn             v25.4h,  v6.4s
++                shrn            v26.4h,  v6.4s,   #12
++
++                shrn2           v27.8h,  v7.4s,   #16
++                xtn2            v25.8h,  v7.4s
++                shrn2           v26.8h,  v7.4s,   #12
++
++                shrn2           v21.16b, v27.8h,  #6
++                shrn2           v19.16b, v25.8h,  #2
++                xtn2            v20.16b, v26.8h
++
++                blt             2f                        // fewer than 96 pixels left: partial write
++
++                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48   // each st3 interleaves 48 pixels
++                st3             {v19.16b, v20.16b, v21.16b}, [x0], #48
++
++                bne             1b                        // more pixels in this row
++
++11:                                                       // end of row
++                subs            w6,  w6,  #1
++                add             x0,  x0,  w1,  uxtw       // skip dest padding to the next row
++                add             x8,  x8,  #128            // next source row
++                bne             10b
++
++                ret
++
++// Partial final write: w5 = (pixels left) - 96; store 48/24/12/6/3/2/1 as needed
++2:
++                cmp             w5,  #48-96               // at least 48 pixels left?
++                blt             1f
++                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48
++                beq             11b                       // exactly 48: row done
++                mov             v16.16b, v19.16b          // second 48 packed pixels live in v19-v21,
++                mov             v17.16b, v20.16b          // not in the 16-bit scratch regs v22-v24
++                sub             w5,  w5,  #48
++                mov             v18.16b, v21.16b
++1:
++                cmp             w5,  #24-96               // at least 24 pixels left?
++                blt             1f
++                st3             {v16.8b, v17.8b, v18.8b}, [x0], #24
++                beq             11b
++                mov             v16.2d[0], v16.2d[1]      // upper halves become the next candidates
++                sub             w5,  w5,  #24
++                mov             v17.2d[0], v17.2d[1]
++                mov             v18.2d[0], v18.2d[1]
++1:
++                cmp             w5,  #12-96               // at least 12 pixels left?
++                blt             1f
++                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
++                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
++                st3             {v16.b, v17.b, v18.b}[2], [x0], #3
++                st3             {v16.b, v17.b, v18.b}[3], [x0], #3
++                beq             11b
++                mov             v16.2s[0], v16.2s[1]
++                sub             w5,  w5,  #12
++                mov             v17.2s[0], v17.2s[1]
++                mov             v18.2s[0], v18.2s[1]
++1:
++                cmp             w5,  #6-96                // at least 6 pixels left?
++                blt             1f
++                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
++                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
++                beq             11b
++                mov             v16.4h[0], v16.4h[1]
++                sub             w5,  w5,  #6
++                mov             v17.4h[0], v17.4h[1]
++                mov             v18.4h[0], v18.4h[1]
++1:
++                cmp             w5,  #3-96                // at least 3 pixels left?
++                blt             1f
++                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
++                beq             11b
++                mov             v16.8b[0], v16.8b[1]      // at most 2 pixels remain, so v18 is not needed
++                sub             w5,  w5,  #3
++                mov             v17.8b[0], v17.8b[1]
++1:
++                cmp             w5,  #2-96                // 2 pixels left, else exactly 1
++                blt             1f
++                st2             {v16.b, v17.b}[0], [x0], #2
++                b               11b
++1:
++                st1             {v16.b}[0], [x0], #1
++                b               11b
++
++endfunc
++
 --- /dev/null
 +++ b/libavutil/aarch64/rpi_sand_neon.h
-@@ -0,0 +1,55 @@
+@@ -0,0 +1,59 @@
 +/*
 +Copyright (c) 2021 Michael Eiler
 +
@@ -58393,6 +63007,10 @@
 +  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
 +  unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
 +
++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
++  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
 +#ifdef __cplusplus
 +}
 +#endif
@@ -58406,7 +63024,7 @@
 +             arm/rpi_sand_neon.o                                        \
 --- /dev/null
 +++ b/libavutil/arm/rpi_sand_neon.S
-@@ -0,0 +1,768 @@
+@@ -0,0 +1,925 @@
 +/*
 +Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
 +All rights reserved.
@@ -58769,7 +63387,6 @@
 +                ldr             r6,  [sp, #36]
 +                ldr             r7,  [sp, #32]  @ y
 +                mov             r12, #48
-+                vmov.u16        q15, #0x3ff
 +                sub             r3,  #1
 +                lsl             r3,  #7
 +                sub             r1,  r1,  r6,  lsl #1
@@ -58785,37 +63402,33 @@
 +                vldm            r2!, {q10-q13}
 +                add             lr,  #64
 +
-+                vshr.u32        q14, q10, #20    @ Cannot vshrn.u32 #20!
++                vshrn.u32       d4 , q10, #14    @ Cannot vshrn.u32 #20!
 +                ands            lr,  #127
 +                vshrn.u32       d2,  q10, #10
 +                vmovn.u32       d0,  q10
-+                vmovn.u32       d4,  q14
 +
-+                vshr.u32        q14, q11, #20
++                vshrn.u32       d5,  q11, #14
 +                it              eq
 +                addeq           r2,  r3
 +                vshrn.u32       d3,  q11, #10
 +                vmovn.u32       d1,  q11
-+                vmovn.u32       d5,  q14
 +
 +                subs            r5,  #48
-+                vand            q0,  q15
-+                vand            q1,  q15
-+                vand            q2,  q15
++                vshr.u16        q2,  #6
++                vbic.u16        q0,  #0xfc00
++                vbic.u16        q1,  #0xfc00
 +
-+                vshr.u32        q14, q12, #20
++                vshrn.u32       d20, q12, #14
 +                vshrn.u32       d18, q12, #10
 +                vmovn.u32       d16, q12
-+                vmovn.u32       d20, q14
 +
-+                vshr.u32        q14, q13, #20
++                vshrn.u32       d21, q13, #14
 +                vshrn.u32       d19, q13, #10
 +                vmovn.u32       d17, q13
-+                vmovn.u32       d21, q14
 +
-+                vand            q8,  q15
-+                vand            q9,  q15
-+                vand            q10, q15
++                vshr.u16        q10, #6
++                vbic.u16        q8,  #0xfc00
++                vbic.u16        q9 , #0xfc00
 +                blt             2f
 +
 +                vst3.16         {d0,  d2,  d4},  [r0], r12
@@ -58908,7 +63521,6 @@
 +                ldr             r7,  [sp, #48]
 +                ldr             r9,  [sp, #52]
 +                mov             r12, #48
-+                vmov.u16        q15, #0x3ff
 +                sub             r8,  #1
 +                lsl             r8,  #7
 +                add             r5,  r5,  r7,  lsl #7
@@ -58924,48 +63536,44 @@
 +                add             lr,  #64
 +
 +                @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
-+                vshr.u32        q14, q0,  #20
-+                vshrn.u32       d16, q0,  #10
++                vshrn.u32       d20, q0,  #14
 +                vmovn.u32       d18, q0
++                vshrn.u32       d0,  q0,  #10
 +                ands            lr,  #127
-+                vmovn.u32       d20, q14
 +
-+                vshr.u32        q14, q1,  #20
-+                vshrn.u32       d17, q1,  #10
++                vshrn.u32       d21, q1,  #14
 +                vmovn.u32       d19, q1
-+                vmovn.u32       d21, q14
++                vshrn.u32       d1,  q1,  #10
 +
-+                vshr.u32        q14, q2,  #20
 +                vshrn.u32       d22, q2,  #10
-+                vmovn.u32       d24, q2
-+                vmovn.u32       d26, q14
++                vmovn.u32       d2,  q2
++                vshrn.u32       d4,  q2,  #14
 +
-+                vshr.u32        q14, q3,  #20
-+                vshrn.u32       d23, q3,  #10
-+                vmovn.u32       d25, q3
 +                add             r10, r0,  #24
-+                vmovn.u32       d27, q14
++                vshrn.u32       d23, q3,  #10
++                vmovn.u32       d3,  q3
++                vshrn.u32       d5,  q3,  #14
 +
 +                it              eq
 +                addeq           r4,  r8
-+                vuzp.16         q8,  q11
-+                vuzp.16         q9,  q12
-+                vuzp.16         q10, q13
++                vuzp.16         q0,  q11
++                vuzp.16         q9,  q1
++                vuzp.16         q10, q2
 +
-+                @ q8   V0, V3,.. -> q0
++                @ q0   V0, V3,..
 +                @ q9   U0, U3...
 +                @ q10  U1, U4...
 +                @ q11  U2, U5,..
-+                @ q12  V1, V4,.. -> q1
-+                @ q13  V2, V5,.. -> q2
++                @ q1   V1, V4,
++                @ q2   V2, V5,..
 +
 +                subs            r6,  #24
-+                vand            q11, q15
-+                vand            q9,  q15
-+                vand            q10, q15
-+                vand            q0,  q8,  q15
-+                vand            q1,  q12, q15
-+                vand            q2,  q13, q15
++                vbic.u16        q11, #0xfc00
++                vbic.u16        q9,  #0xfc00
++                vshr.u16        q10, #6
++                vshr.u16        q2,  #6
++                vbic.u16        q0,  #0xfc00
++                vbic.u16        q1,  #0xfc00
 +
 +                blt             2f
 +
@@ -59174,10 +63782,177 @@
 +endfunc
 +
 +
++@ void ff_rpi_sand30_lines_to_planar_y8(
++@   uint8_t * dest,             // [r0]
++@   unsigned int dst_stride,    // [r1]
++@   const uint8_t * src,        // [r2]
++@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++@   unsigned int src_stride2,   // [sp, #0]  -> r3
++@   unsigned int _x,            // [sp, #4]  Ignored - 0
++@   unsigned int y,             // [sp, #8]  (r7 in prefix)
++@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++@   unsigned int h);            // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++                push            {r4-r8, lr}     @ +24
++                ldr             r3,  [sp, #24]  @ src_stride2
++                ldr             r6,  [sp, #36]  @ _w
++                ldr             r7,  [sp, #32]  @ y
++                mov             r12, #48        @ dest step for the interleaved r0/r4 stores
++                lsl             r3,  #7         @ src_stride2 * 128 = bytes between stripes
++                sub             r1,  r1,  r6    @ dst_stride - _w = end of row to start of next
++                add             r8,  r2,  r7,  lsl #7  @ src + y * 128
++                ldr             r7,  [sp, #40]  @ h
++
++10:
++                mov             r2,  r8
++                add             r4,  r0,  #24
++                mov             r5,  r6
++1:
++                vldm            r2,  {q8-q15}
++
++                subs            r5,  #96        @ 32 words * 3 samples per 128-byte load
++
++                vmovn.u32       d0,  q8
++                vshrn.u32       d2,  q8,  #12
++                vshrn.u32       d4,  q8,  #16    @ Cannot vshrn.u32 #20!
++
++                add             r2,  r3
++
++                vmovn.u32       d1,  q9
++                vshrn.u32       d3,  q9,  #12
++                vshrn.u32       d5,  q9,  #16
++
++                pld             [r2, #0]
++
++                vshrn.u16       d0,  q0,  #2     @ sample 0: word bits 2-9
++                vmovn.u16       d1,  q1          @ sample 1: word bits 12-19
++                vshrn.u16       d2,  q2,  #6     @ sample 2: word bits 22-29
++
++                vmovn.u32       d16, q10
++                vshrn.u32       d18, q10, #12
++                vshrn.u32       d20, q10, #16
++
++                vmovn.u32       d17, q11
++                vshrn.u32       d19, q11, #12
++                vshrn.u32       d21, q11, #16
++
++                pld             [r2, #64]
++
++                vshrn.u16       d4,  q8,  #2
++                vmovn.u16       d5,  q9
++                vshrn.u16       d6,  q10, #6
++
++                vmovn.u32       d16, q12
++                vshrn.u32       d18, q12, #12
++                vshrn.u32       d20, q12, #16
++
++                vmovn.u32       d17, q13
++                vshrn.u32       d19, q13, #12
++                vshrn.u32       d21, q13, #16
++
++                vshrn.u16       d16, q8,  #2
++                vmovn.u16       d17, q9
++                vshrn.u16       d18, q10, #6
++
++                vmovn.u32       d20, q14
++                vshrn.u32       d22, q14, #12
++                vshrn.u32       d24, q14, #16
++
++                vmovn.u32       d21, q15
++                vshrn.u32       d23, q15, #12
++                vshrn.u32       d25, q15, #16
++
++                vshrn.u16       d20, q10, #2
++                vmovn.u16       d21, q11
++                vshrn.u16       d22, q12, #6
++
++                blt             2f               @ flags from subs: fewer than 96 left
++
++                vst3.8          {d0,  d1,  d2},  [r0], r12
++                vst3.8          {d4,  d5,  d6},  [r4], r12
++                vst3.8          {d16, d17, d18}, [r0], r12
++                vst3.8          {d20, d21, d22}, [r4], r12
++
++                bne             1b               @ flags from subs: row not finished
++
++11:
++                subs            r7,  #1
++                add             r0,  r1
++                add             r8,  #128
++                bne             10b
++
++                pop             {r4-r8, pc}
++
++@ Partial final write
++2:
++                cmp             r5,  #48-96
++                blt             1f
++                vst3.8          {d0,  d1,  d2},  [r0], r12
++                vst3.8          {d4,  d5,  d6},  [r4], r12
++                beq             11b
++                vmov            q0,  q8
++                vmov            q2,  q10
++                sub             r5,  #48
++                vmov            d2,  d18
++                vmov            d6,  d22
++1:
++                cmp             r5,  #24-96
++                blt             1f
++                vst3.8          {d0,  d1,  d2},  [r0]!
++                beq             11b
++                vmov            q0,  q2
++                sub             r5,  #24
++                vmov            d2,  d6
++1:
++                cmp             r5,  #12-96
++                blt             1f
++                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
++                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
++                vst3.8          {d0[2], d1[2], d2[2]}, [r0]!
++                vst3.8          {d0[3], d1[3], d2[3]}, [r0]!
++                beq             11b
++                vmov            s0,  s1
++                sub             r5,  #12
++                vmov            s2,  s3
++                vmov            s4,  s5
++1:
++                cmp             r5,  #6-96
++                blt             1f
++                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
++                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
++                @ No extra add: the two post-indexed stores already moved r0 by 6
++                beq             11b
++                vshr.u32        d0,  #16
++                sub             r5,  #6
++                vshr.u32        d1,  #16
++                vshr.u32        d2,  #16
++1:
++                cmp             r5, #3-96
++                blt             1f
++                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
++                beq             11b
++                sub             r5, #3
++                vshr.u32        d0, #8
++                vshr.u32        d1, #8
++1:
++                cmp             r5, #2-96
++                blt             1f
++                vst2.8          {d0[0], d1[0]}, [r0]!
++                b               11b
++1:
++                vst1.8          {d0[0]}, [r0]!
++                b               11b
++
++endfunc
++
 +
 --- /dev/null
 +++ b/libavutil/arm/rpi_sand_neon.h
-@@ -0,0 +1,99 @@
+@@ -0,0 +1,110 @@
 +/*
 +Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 +All rights reserved.
@@ -59275,6 +64050,17 @@
 +  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
 +  unsigned int h);            // [sp, #16] -> r7
 +
++void ff_rpi_sand30_lines_to_planar_y8(
++  uint8_t * dest,             // [r0]
++  unsigned int dst_stride,    // [r1]
++  const uint8_t * src,        // [r2]
++  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++  unsigned int src_stride2,   // [sp, #0]  -> r3
++  unsigned int _x,            // [sp, #4]  Ignored - 0
++  unsigned int y,             // [sp, #8]  (r7 in prefix)
++  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++  unsigned int h);            // [sp, #16] -> r7
++
 +#endif // AVUTIL_ARM_SAND_NEON_H
 +
 --- a/libavutil/frame.c
@@ -59473,13 +64259,23 @@
          if (map->address[i])
              munmap(map->address[i], map->length[i]);
      }
-@@ -178,7 +241,15 @@ static int drm_transfer_get_formats(AVHW
-     if (!pix_fmts)
+@@ -172,16 +235,29 @@ static int drm_transfer_get_formats(AVHW
+                                     enum AVHWFrameTransferDirection dir,
+                                     enum AVPixelFormat **formats)
+ {
+-    enum AVPixelFormat *pix_fmts;
++    enum AVPixelFormat *p;
+ 
+-    pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
+-    if (!pix_fmts)
++    p = *formats = av_malloc_array(3, sizeof(*p));
++    if (!p)
          return AVERROR(ENOMEM);
  
 -    pix_fmts[0] = ctx->sw_format;
+-    pix_fmts[1] = AV_PIX_FMT_NONE;
 +    // **** Offer native sand too ????
-+    pix_fmts[0] =
++    *p++ =
 +#if CONFIG_SAND
 +        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
 +            AV_PIX_FMT_YUV420P :
@@ -59487,10 +64283,19 @@
 +            AV_PIX_FMT_YUV420P10LE :
 +#endif
 +            ctx->sw_format;
-     pix_fmts[1] = AV_PIX_FMT_NONE;
++
++#if CONFIG_SAND
++    if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
++        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
++        *p++ = AV_PIX_FMT_NV12;
++#endif
  
-     *formats = pix_fmts;
-@@ -197,18 +268,80 @@ static int drm_transfer_data_from(AVHWFr
+-    *formats = pix_fmts;
++    *p = AV_PIX_FMT_NONE;
+     return 0;
+ }
+ 
+@@ -197,18 +273,63 @@ static int drm_transfer_data_from(AVHWFr
      map = av_frame_alloc();
      if (!map)
          return AVERROR(ENOMEM);
@@ -59525,29 +64330,12 @@
 +        const unsigned int w = FFMIN(dst->width, map->width);
 +        const unsigned int h = FFMIN(dst->height, map->height);
 +
-+        if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) {
-+            av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
-+                                     map->data[0],
-+                                     128, stride2,
-+                                     0, 0, w, h);
-+            av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
-+                                     dst->data[2], dst->linesize[2],
-+                                     map->data[1],
-+                                     128, stride2,
-+                                     0, 0, w / 2, h / 2);
-+        }
-+        else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) {
-+            av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
-+                                     map->data[0],
-+                                     128, stride2,
-+                                     0, 0, w, h);
-+            av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
-+                                     dst->data[2], dst->linesize[2],
-+                                     map->data[1],
-+                                     128, stride2,
-+                                     0, 0, w / 2, h / 2);
-+        }
-+        else
++        map->crop_top = 0;
++        map->crop_bottom = 0;
++        map->crop_left = 0;
++        map->crop_right = 0;
++
++        if (av_rpi_sand_to_planar_frame(dst, map) != 0)
 +        {
 +            av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
 +            err = AVERROR(EINVAL);
@@ -59575,7 +64363,7 @@
  
      err = 0;
  fail:
-@@ -223,7 +356,10 @@ static int drm_transfer_data_to(AVHWFram
+@@ -223,7 +344,10 @@ static int drm_transfer_data_to(AVHWFram
      int err;
  
      if (src->width > hwfc->width || src->height > hwfc->height)
@@ -59874,7 +64662,7 @@
 +
 --- /dev/null
 +++ b/libavutil/rpi_sand_fns.c
-@@ -0,0 +1,356 @@
+@@ -0,0 +1,445 @@
 +/*
 +Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
 +All rights reserved.
@@ -60106,6 +64894,75 @@
 +    }
 +}
 +
++// Fetches a single patch - offscreen fixup not done here
++// w <= stride1
++// 10-bit samples are narrowed to 8 bits by simply dropping the bottom 2 bits
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h)
++{
++    const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
++    const unsigned int xskip0 = _x - (x0 >> 2) * 3; // Samples to skip in the first word (_x % 3)
++    const unsigned int x1 = ((_x + _w) / 3) * 4; // Byte offset of the word holding the end of the span
++    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; // Samples wanted from the last word ((_x + _w) % 3)
++    const unsigned int mask = stride1 - 1;
++    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++    if (_x == 0) {
++        ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
++        return;
++    }
++#endif
++
++    if (x0 == x1) {
++        // Span starts and ends inside the same 32-bit word.
++        // Partial single word xfer is not implemented: nothing is written
++        return;
++    }
++
++    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++    {
++        unsigned int x = x0;
++        const uint32_t * p = (const uint32_t *)p0;
++        uint8_t * d = dst;
++
++        // Leading partial word: emit only the samples at or after _x
++        if (xskip0 != 0) {
++            const uint32_t p3 = *p++;
++
++            if (xskip0 == 1)
++                *d++ = (p3 >> 12) & 0xff;
++            *d++ = (p3 >> 22) & 0xff;
++
++            if (((x += 4) & mask) == 0)
++                p += slice_inc;
++        }
++
++        // Whole words: three samples each, top 8 bits of every 10-bit field
++        while (x != x1) {
++            const uint32_t p3 = *p++;
++            *d++ = (p3 >> 2) & 0xff;
++            *d++ = (p3 >> 12) & 0xff;
++            *d++ = (p3 >> 22) & 0xff;
++
++            if (((x += 4) & mask) == 0)
++                p += slice_inc;
++        }
++
++        // Trailing partial word: emit the first xrem1 samples only
++        if (xrem1 != 0) {
++            const uint32_t p3 = *p;
++
++            *d++ = (p3 >> 2) & 0xff;
++            if (xrem1 == 2)
++                *d++ = (p3 >> 12) & 0xff;
++        }
++    }
++}
++
++
 +
 +// w/h in pixels
 +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
@@ -60187,6 +65044,16 @@
 +                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
 +                                             x/2, y/2,  w/2, h/2);
 +                    break;
++                case AV_PIX_FMT_NV12:
++                    av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++                                             src->data[0],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x, y, w, h);
++                    av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1],
++                                             src->data[1],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x/2, y/2, w, h/2);
++                    break;
 +                default:
 +                    return -1;
 +            }
@@ -60221,6 +65088,16 @@
 +                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
 +                                             x/2, y/2, w/2, h/2);
 +                    break;
++                case AV_PIX_FMT_NV12:
++                    av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
++                                             src->data[0],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x, y, w, h);
++                    av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1],
++                                             src->data[1],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x/2, y/2, w, h/2);
++                    break;
 +                default:
 +                    return -1;
 +            }
@@ -60233,7 +65110,7 @@
 +}
 --- /dev/null
 +++ b/libavutil/rpi_sand_fns.h
-@@ -0,0 +1,183 @@
+@@ -0,0 +1,188 @@
 +/*
 +Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
 +All rights reserved.
@@ -60321,6 +65198,11 @@
 +                             unsigned int _x, unsigned int y,
 +                             unsigned int _w, unsigned int h);
 +
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
 +
 +// w/h in pixels
 +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
@@ -60419,34 +65301,64 @@
 +
 --- /dev/null
 +++ b/pi-util/BUILD.txt
-@@ -0,0 +1,29 @@
+@@ -0,0 +1,59 @@
 +Building Pi FFmpeg
 +==================
 +
-+Configuration:
-+=============
++Currently only building on a Pi is supported.
++This builds ffmpeg the way I've tested it
 +
-+These instructions work for cross compiles from Ubuntu 16.04 & Ubuntu
-+18.04. I would expect most other linux environments to work but I haven't
-+tried them.
++Get all dependencies - the current package dependencies are good enough
 +
-+pi-util/conf_pi2.sh
++$ sudo apt-get build-dep ffmpeg
 +
-+contains suitable options to build the code for Pi2/3.  It expects to find
-+git clones of
++Configure using the pi-util/conf_native.sh script
++-------------------------------------------------
 +
-+https://github.com/raspberrypi/tools
-+https://github.com/raspberrypi/firmware
++This sets the normal release options and creates an output dir to build into.
++The directory name will depend on system and options but will be under out/
 +
-+in the parent of the FFmpeg directory.  I recommend using --depth 1 to avoid a
-+lot of history you don't want.
++There are a few choices here
++ --mmal  build including the legacy mmal-based decoders and zero-copy code
++         this requires appropriate libraries which currently will exist for
++         armv7 but not arm64
++ --noshared
++         Build a static image rather than a shared library one.  Static is
++         easier for testing as there is no need to worry about library
++         paths being confused and therefore running the wrong code,  Shared
++         is what is needed, in most cases, when building for use by other
++         programs.
 +
-+If you have a copy of qasm.py in ../local/bin then the .qasm sources will be
-+rebuilt.  Otherwise the prebuilt .c & .h files will be used.
-+Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild
++So for a static build
++---------------------
 +
-+pi-util/conf_p1.sh should configure for Pi1.  Beware that as of this time
-+H265 QPU acceleration is broken on Pi1 and so it is disabled.
++$ pi-util/conf_native.sh --noshared
++
++$ make -j8 -C out/<wherever the script said it was building to>
++
++You can now run ffmpeg directly from where it was built
++
++For a shared build
++------------------
++
++$ pi-util/conf_native.sh
++
++You will normally want an install target if shared. Note that the script has
++set this up to be generated in out/<builddir>/install, you don't have to worry
++about overwriting your system libs.
++
++$ make -j8 -C out/<builddir> install
++
++You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
++built or install the image on the system - you have to be careful to get rid
++of all other ffmpeg libs or confusion may result.  There is a little script
++that wipes all other versions - obviously use with care!
++
++$ sudo pi-util/clean_usr_libs.sh
++
++Then simply copying from the install to /usr works
++
++$ sudo cp -r out/<builddir>/install/* /usr
 +
 +
 --- /dev/null
@@ -60608,29 +65520,32 @@
 +
 --- /dev/null
 +++ b/pi-util/clean_usr_libs.sh
-@@ -0,0 +1,23 @@
+@@ -0,0 +1,26 @@
 +set -e
 +U=/usr/lib/arm-linux-gnueabihf
 +rm -f $U/libavcodec.*
 +rm -f $U/libavdevice.*
 +rm -f $U/libavfilter.*
 +rm -f $U/libavformat.*
-+rm -f $U/libavresample.*
 +rm -f $U/libavutil.*
++rm -f $U/libswresample.*
++rm -f $U/libswscale.*
 +U=/usr/lib/arm-linux-gnueabihf/neon/vfp
 +rm -f $U/libavcodec.*
 +rm -f $U/libavdevice.*
 +rm -f $U/libavfilter.*
 +rm -f $U/libavformat.*
-+rm -f $U/libavresample.*
 +rm -f $U/libavutil.*
++rm -f $U/libswresample.*
++rm -f $U/libswscale.*
 +U=/usr/lib/aarch64-linux-gnu
 +rm -f $U/libavcodec.*
 +rm -f $U/libavdevice.*
 +rm -f $U/libavfilter.*
 +rm -f $U/libavformat.*
-+rm -f $U/libavresample.*
 +rm -f $U/libavutil.*
++rm -f $U/libswresample.*
++rm -f $U/libswscale.*
 +
 --- /dev/null
 +++ b/pi-util/conf_arm64_native.sh
@@ -61177,61 +66092,90 @@
 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
 --- /dev/null
 +++ b/pi-util/conf_native.sh
-@@ -0,0 +1,82 @@
+@@ -0,0 +1,106 @@
 +echo "Configure for native build"
 +
 +FFSRC=`pwd`
 +MC=`dpkg --print-architecture`
++BUILDBASE=$FFSRC/out
 +
 +#RPI_KEEPS="-save-temps=obj"
 +RPI_KEEPS=""
 +
++NOSHARED=
++MMAL=
++
++while [ "$1" != "" ] ; do
++    case $1 in
++	--noshared)
++	    NOSHARED=1
++	    ;;
++	--mmal)
++	    MMAL=1
++	    ;;
++	*)
++	    echo "Usage $0: [--noshared] [--mmal]"
++	    exit 1
++	    ;;
++    esac
++    shift
++done
++
++
++MCOPTS=
++RPI_INCLUDES=
++RPI_LIBDIRS=
++RPI_DEFINES=
++RPI_EXTRALIBS=
++
 +if [ "$MC" == "arm64" ]; then
 +  echo "M/C aarch64"
 +  A=aarch64-linux-gnu
 +  B=arm64
-+  MCOPTS=
-+  RPI_INCLUDES=
-+  RPI_LIBDIRS=
-+  RPI_DEFINES=
-+  RPI_EXTRALIBS=
-+  RPIOPTS="--disable-mmal --enable-sand"
 +elif [ "$MC" == "armhf" ]; then
 +  echo "M/C armv7"
 +  A=arm-linux-gnueabihf
 +  B=armv7
 +  MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
-+  RPI_OPT_VC=/opt/vc
-+  RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+  RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
-+  RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4"
-+  RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
-+  RPIOPTS="--enable-mmal --enable-rpi"
++  RPI_DEFINES=-mfpu=neon-vfpv4
 +else
 +  echo Unexpected architecture $MC
 +  exit 1
 +fi
 +
++if [ $MMAL ]; then
++  RPI_OPT_VC=/opt/vc
++  RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++  RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
++  RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
++  RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
++  RPIOPTS="--enable-mmal --enable-rpi"
++else
++  RPIOPTS="--disable-mmal --enable-sand"
++fi
++
 +C=`lsb_release -sc`
 +V=`cat RELEASE`
 +
 +SHARED_LIBS="--enable-shared"
-+if [ "$1" == "--noshared" ]; then
++if [ $NOSHARED ]; then
 +  SHARED_LIBS="--disable-shared"
-+  OUT=out/$B-$C-$V-static-rel
++  OUT=$BUILDBASE/$B-$C-$V-static-rel
 +  echo Static libs
 +else
 +  echo Shared libs
-+  OUT=out/$B-$C-$V-shared-rel
++  OUT=$BUILDBASE/$B-$C-$V-shared-rel
 +fi
 +
-+USR_PREFIX=$FFSRC/$OUT/install
++USR_PREFIX=$OUT/install
 +LIB_PREFIX=$USR_PREFIX/lib/$A
 +INC_PREFIX=$USR_PREFIX/include/$A
 +
 +echo Destination directory: $OUT
-+mkdir -p $FFSRC/$OUT
-+cd $FFSRC/$OUT
++mkdir -p $OUT
++# Nothing under here need worry git - including this .gitignore!
++echo "**" > $BUILDBASE/.gitignore
++cd $OUT
 +
 +$FFSRC/configure \
 + --prefix=$USR_PREFIX\
@@ -61242,10 +66186,8 @@
 + --disable-thumb\
 + --enable-v4l2-request\
 + --enable-libdrm\
-+ --enable-epoxy\
-+ --enable-libudev\
-+ --enable-vout-drm\
 + --enable-vout-egl\
++ --enable-vout-drm\
 + $SHARED_LIBS\
 + $RPIOPTS\
 + --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
@@ -61254,111 +66196,6 @@
 + --extra-libs="$RPI_EXTRALIBS"\
 + --extra-version="rpi"
 +
-+# --enable-decoder=hevc_rpi\
-+# --enable-extra-warnings\
-+# --arch=armv71\
-+
-+# gcc option for getting asm listing
-+# -Wa,-ahls
---- /dev/null
-+++ b/pi-util/conf_pi1.sh
-@@ -0,0 +1,39 @@
-+echo "Configure for Pi1"
-+
-+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
-+RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc
-+
-+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
-+#RPI_KEEPS="-save-temps=obj"
-+RPI_KEEPS=""
-+
-+SHARED_LIBS="--enable-shared"
-+if [ "$1" == "--noshared" ]; then
-+  SHARED_LIBS="--disable-shared"
-+  echo Static libs
-+else
-+  echo Shared libs
-+fi
-+
-+./configure --enable-cross-compile\
-+ --cpu=arm1176jzf-s\
-+ --arch=arm\
-+ --disable-neon\
-+ --target-os=linux\
-+ --disable-stripping\
-+ --enable-mmal\
-+ $SHARED_LIBS\
-+ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_INCLUDES"\
-+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
-+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
-+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
-+
-+
-+# --enable-extra-warnings\
-+# --arch=armv71\
-+# --enable-shared\
-+
-+# gcc option for getting asm listing
-+# -Wa,-ahls
---- /dev/null
-+++ b/pi-util/conf_pi2.sh
-@@ -0,0 +1,57 @@
-+echo "Configure for Pi2/3"
-+
-+FFSRC=`pwd`
-+
-+RPI_TOOLROOT=$FFSRC/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
-+RPI_OPT_VC=$FFSRC/../firmware/hardfp/opt/vc
-+
-+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
-+RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4"
-+#RPI_KEEPS="-save-temps=obj"
-+RPI_KEEPS=""
-+
-+SHARED_LIBS="--enable-shared"
-+if [ "$1" == "--noshared" ]; then
-+  SHARED_LIBS="--disable-shared"
-+  OUT=out/x-armv7-static-rel
-+  echo Static libs
-+else
-+  echo Shared libs
-+  OUT=out/x-armv7-shared-rel
-+fi
-+
-+USR_PREFIX=$FFSRC/$OUT/install
-+LIB_PREFIX=$USR_PREFIX/lib/arm-linux-gnueabihf
-+INC_PREFIX=$USR_PREFIX/include/arm-linux-gnueabihf
-+
-+mkdir -p $FFSRC/$OUT
-+cd $FFSRC/$OUT
-+
-+$FFSRC/configure --enable-cross-compile\
-+ --prefix=$USR_PREFIX\
-+ --libdir=$LIB_PREFIX\
-+ --incdir=$INC_PREFIX\
-+ --arch=armv6t2\
-+ --cpu=cortex-a7\
-+ --target-os=linux\
-+ --disable-stripping\
-+ --disable-thumb\
-+ --enable-mmal\
-+ --enable-rpi\
-+ $SHARED_LIBS\
-+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
-+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
-+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
-+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
-+
-+# --enable-shared\
-+
-+# --enable-decoder=hevc_rpi\
-+# --enable-extra-warnings\
-+# --arch=armv71\
-+# --enable-shared\
 +
 +# gcc option for getting asm listing
 +# -Wa,-ahls
@@ -62042,3 +66879,630 @@
 +
 +    do_logparse(args.logfile)
 +
+--- a/tests/checkasm/Makefile
++++ b/tests/checkasm/Makefile
+@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP)
+ AVCODECOBJS-$(CONFIG_H264DSP)           += h264dsp.o
+ AVCODECOBJS-$(CONFIG_H264PRED)          += h264pred.o
+ AVCODECOBJS-$(CONFIG_H264QPEL)          += h264qpel.o
++AVCODECOBJS-$(CONFIG_IDCTDSP)           += idctdsp.o
+ AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
+ AVCODECOBJS-$(CONFIG_LLVIDENCDSP)       += llviddspenc.o
++AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
+ AVCODECOBJS-$(CONFIG_VP8DSP)            += vp8dsp.o
+ AVCODECOBJS-$(CONFIG_VIDEODSP)          += videodsp.o
+ 
+--- a/tests/checkasm/checkasm.c
++++ b/tests/checkasm/checkasm.c
+@@ -121,6 +121,9 @@ static const struct {
+     #if CONFIG_HUFFYUV_DECODER
+         { "huffyuvdsp", checkasm_check_huffyuvdsp },
+     #endif
++    #if CONFIG_IDCTDSP
++        { "idctdsp", checkasm_check_idctdsp },
++    #endif
+     #if CONFIG_JPEG2000_DECODER
+         { "jpeg2000dsp", checkasm_check_jpeg2000dsp },
+     #endif
+@@ -145,6 +148,9 @@ static const struct {
+     #if CONFIG_V210_ENCODER
+         { "v210enc", checkasm_check_v210enc },
+     #endif
++    #if CONFIG_VC1DSP
++        { "vc1dsp", checkasm_check_vc1dsp },
++    #endif
+     #if CONFIG_VP8DSP
+         { "vp8dsp", checkasm_check_vp8dsp },
+     #endif
+--- a/tests/checkasm/checkasm.h
++++ b/tests/checkasm/checkasm.h
+@@ -60,6 +60,7 @@ void checkasm_check_hevc_add_res(void);
+ void checkasm_check_hevc_idct(void);
+ void checkasm_check_hevc_sao(void);
+ void checkasm_check_huffyuvdsp(void);
++void checkasm_check_idctdsp(void);
+ void checkasm_check_jpeg2000dsp(void);
+ void checkasm_check_llviddsp(void);
+ void checkasm_check_llviddspenc(void);
+@@ -73,6 +74,7 @@ void checkasm_check_sw_scale(void);
+ void checkasm_check_utvideodsp(void);
+ void checkasm_check_v210dec(void);
+ void checkasm_check_v210enc(void);
++void checkasm_check_vc1dsp(void);
+ void checkasm_check_vf_eq(void);
+ void checkasm_check_vf_gblur(void);
+ void checkasm_check_vf_hflip(void);
+--- /dev/null
++++ b/tests/checkasm/idctdsp.c
+@@ -0,0 +1,98 @@
++/*
++ * Copyright (c) 2022 Ben Avison
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++
++#include "checkasm.h"
++
++#include "libavcodec/idctdsp.h"
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) },
++
++typedef struct {
++    const char *name;
++    size_t offset;
++} test;
++
++#define RANDOMIZE_BUFFER16(name, size)          \
++    do {                                        \
++        int i;                                  \
++        for (i = 0; i < size; ++i) {            \
++            uint16_t r = rnd() % 0x201 - 0x100; \
++            AV_WN16A(name##0 + i, r);           \
++            AV_WN16A(name##1 + i, r);           \
++        }                                       \
++    } while (0)
++
++#define RANDOMIZE_BUFFER8(name, size)         \
++    do {                                      \
++        int i;                                \
++        for (i = 0; i < size; ++i) {          \
++            uint8_t r = rnd();                \
++            name##0[i] = r;                   \
++            name##1[i] = r;                   \
++        }                                     \
++    } while (0)
++
++static void check_add_put_clamped(void)
++{
++    /* Source buffers are only as big as needed, since any over-read won't affect results */
++    LOCAL_ALIGNED_16(int16_t, src0, [64]);
++    LOCAL_ALIGNED_16(int16_t, src1, [64]);
++    /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */
++    LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]);
++    LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]);
++
++    AVCodecContext avctx = { 0 };
++    IDCTDSPContext h;
++
++    const test tests[] = {
++        IDCTDSP_TEST(add_pixels_clamped)
++        IDCTDSP_TEST(put_pixels_clamped)
++        IDCTDSP_TEST(put_signed_pixels_clamped)
++    };
++
++    ff_idctdsp_init(&h, &avctx);
++
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(const int16_t *, uint8_t *, ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset);
++        if (check_func(func, "idctdsp.%s", tests[t].name)) {
++            declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t);
++            RANDOMIZE_BUFFER16(src, 64);
++            RANDOMIZE_BUFFER8(dst, 10 * 24);
++            call_ref(src0, dst0 + 24 + 8, 24);
++            call_new(src1, dst1 + 24 + 8, 24);
++            if (memcmp(dst0, dst1, 10 * 24))
++                fail();
++            bench_new(src1, dst1 + 24 + 8, 24);
++        }
++    }
++}
++
++void checkasm_check_idctdsp(void)
++{
++    check_add_put_clamped();
++    report("idctdsp");
++}
+--- /dev/null
++++ b/tests/checkasm/vc1dsp.c
+@@ -0,0 +1,452 @@
++/*
++ * Copyright (c) 2022 Ben Avison
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++
++#include "checkasm.h"
++
++#include "libavcodec/vc1dsp.h"
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
++#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
++
++typedef struct {
++    const char *name;
++    size_t offset;
++    int width;
++    int height;
++} test;
++
++typedef struct matrix {
++    size_t width;
++    size_t height;
++    float d[];
++} matrix;
++
++static const matrix T8 = { 8, 8, {
++        12,  12,  12,  12,  12,  12,  12,  12,
++        16,  15,   9,   4,  -4,  -9, -15, -16,
++        16,   6,  -6, -16, -16,  -6,   6,  16,
++        15,  -4, -16,  -9,   9,  16,   4, -15,
++        12, -12, -12,  12,  12, -12, -12,  12,
++         9, -16,   4,  15, -15,  -4,  16,  -9,
++         6, -16,  16,  -6,  -6,  16, -16,   6,
++         4,  -9,  15, -16,  16, -15,   9,  -4
++} };
++
++static const matrix T4 = { 4, 4, {
++        17,  17,  17,  17,
++        22,  10, -10, -22,
++        17, -17, -17,  17,
++        10, -22,  22, -10
++} };
++
++static const matrix T8t = { 8, 8, {
++        12,  16,  16,  15,  12,   9,   6,   4,
++        12,  15,   6,  -4, -12, -16, -16,  -9,
++        12,   9,  -6, -16, -12,   4,  16,  15,
++        12,   4, -16,  -9,  12,  15,  -6, -16,
++        12,  -4, -16,   9,  12, -15,  -6,  16,
++        12,  -9,  -6,  16, -12,  -4,  16, -15,
++        12, -15,   6,   4, -12,  16, -16,   9,
++        12, -16,  16, -15,  12,  -9,   6,  -4
++} };
++
++static const matrix T4t = { 4, 4, {
++        17,  22,  17,  10,
++        17,  10, -17, -22,
++        17, -10, -17,  22,
++        17, -22,  17, -10
++} };
++
++static matrix *new_matrix(size_t width, size_t height)
++{
++    matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
++    if (out == NULL) {
++        fprintf(stderr, "Memory allocation failure\n");
++        exit(EXIT_FAILURE);
++    }
++    out->width = width;
++    out->height = height;
++    return out;
++}
++
++static matrix *multiply(const matrix *a, const matrix *b)
++{
++    matrix *out;
++    if (a->width != b->height) {
++        fprintf(stderr, "Incompatible multiplication\n");
++        exit(EXIT_FAILURE);
++    }
++    out = new_matrix(b->width, a->height);
++    for (int j = 0; j < out->height; ++j)
++        for (int i = 0; i < out->width; ++i) {
++            float sum = 0;
++            for (int k = 0; k < a->width; ++k)
++                sum += a->d[j * a->width + k] * b->d[k * b->width + i];
++            out->d[j * out->width + i] = sum;
++        }
++    return out;
++}
++
++static void normalise(matrix *a)
++{
++    for (int j = 0; j < a->height; ++j)
++        for (int i = 0; i < a->width; ++i) {
++            float *p = a->d + j * a->width + i;
++            *p *= 64;
++            if (a->height == 4)
++                *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
++            else
++                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
++            if (a->width == 4)
++                *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
++            else
++                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
++        }
++}
++
++static void divide_and_round_nearest(matrix *a, float by)
++{
++    for (int j = 0; j < a->height; ++j)
++        for (int i = 0; i < a->width; ++i) {
++            float *p = a->d + j * a->width + i;
++            *p = rintf(*p / by);
++        }
++}
++
++static void tweak(matrix *a)
++{
++    for (int j = 4; j < a->height; ++j)
++        for (int i = 0; i < a->width; ++i) {
++            float *p = a->d + j * a->width + i;
++            *p += 1;
++        }
++}
++
++/* The VC-1 spec places restrictions on the values permitted at three
++ * different stages:
++ * - D: the input coefficients in frequency domain
++ * - E: the intermediate coefficients, inverse-transformed only horizontally
++ * - R: the fully inverse-transformed coefficients
++ *
++ * To fully cater for the ranges specified requires various intermediate
++ * values to be held to 17-bit precision; yet these conditions do not appear
++ * to be utilised in real-world streams. At least some assembly
++ * implementations have chosen to restrict these values to 16-bit precision,
++ * to accelerate the decoding of real-world streams at the cost of strict
++ * adherence to the spec. To avoid our test marking these as failures,
++ * reduce our random inputs.
++ */
++#define ATTENUATION 4
++
++static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
++{
++    matrix *raw, *tmp, *D, *E, *R;
++    raw = new_matrix(width, height);
++    for (int i = 0; i < width * height; ++i)
++        raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
++    tmp = multiply(height == 8 ? &T8 : &T4, raw);
++    D = multiply(tmp, width == 8 ? &T8t : &T4t);
++    normalise(D);
++    divide_and_round_nearest(D, 1);
++    for (int i = 0; i < width * height; ++i) {
++        if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
++            /* Rare, so simply try again */
++            av_free(raw);
++            av_free(tmp);
++            av_free(D);
++            return generate_inverse_quantized_transform_coefficients(width, height);
++        }
++    }
++    E = multiply(D, width == 8 ? &T8 : &T4);
++    divide_and_round_nearest(E, 8);
++    for (int i = 0; i < width * height; ++i)
++        if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
++            /* Rare, so simply try again */
++            av_free(raw);
++            av_free(tmp);
++            av_free(D);
++            av_free(E);
++            return generate_inverse_quantized_transform_coefficients(width, height);
++        }
++    R = multiply(height == 8 ? &T8t : &T4t, E);
++    tweak(R);
++    divide_and_round_nearest(R, 128);
++    for (int i = 0; i < width * height; ++i)
++        if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
++            /* Rare, so simply try again */
++            av_free(raw);
++            av_free(tmp);
++            av_free(D);
++            av_free(E);
++            av_free(R);
++            return generate_inverse_quantized_transform_coefficients(width, height);
++        }
++    av_free(raw);
++    av_free(tmp);
++    av_free(E);
++    av_free(R);
++    return D;
++}
++
++#define RANDOMIZE_BUFFER16(name, size)        \
++    do {                                      \
++        int i;                                \
++        for (i = 0; i < size; ++i) {          \
++            uint16_t r = rnd();               \
++            AV_WN16A(name##0 + i, r);         \
++            AV_WN16A(name##1 + i, r);         \
++        }                                     \
++    } while (0)
++
++#define RANDOMIZE_BUFFER8(name, size)         \
++    do {                                      \
++        int i;                                \
++        for (i = 0; i < size; ++i) {          \
++            uint8_t r = rnd();                \
++            name##0[i] = r;                   \
++            name##1[i] = r;                   \
++        }                                     \
++    } while (0)
++
++#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
++    do {                                            \
++        uint8_t *p##0 = name##0, *p##1 = name##1;   \
++        int i = (size);                             \
++        while (i-- > 0) {                           \
++            int x = 0x80 | (rnd() & 0x7F);          \
++            x >>= rnd() % 9;                        \
++            if (rnd() & 1)                          \
++                x = -x;                             \
++            *p##1++ = *p##0++ = 0x80 + x;           \
++        }                                           \
++    } while (0)
++
++static void check_inv_trans_inplace(void)
++{
++    /* Inverse transform input coefficients are stored in a 16-bit buffer
++     * with row stride of 8 coefficients irrespective of transform size.
++     * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
++     * are stored in column-major order, and the outputs are written back
++     * to the input buffer, so we oversize it slightly to catch overruns. */
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
++
++    VC1DSPContext h;
++
++    ff_vc1dsp_init(&h);
++
++    if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
++        matrix *coeffs;
++        declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
++        RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
++        coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
++        for (int j = 0; j < 8; ++j)
++            for (int i = 0; i < 8; ++i) {
++                int idx = 8 + i * 8 + j;
++                inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
++            }
++        call_ref(inv_trans_in0 + 8);
++        call_new(inv_trans_in1 + 8);
++        if (memcmp(inv_trans_in0,  inv_trans_in1,  10 * 8 * sizeof (int16_t)))
++            fail();
++        bench_new(inv_trans_in1 + 8);
++        av_free(coeffs);
++    }
++}
++
++static void check_inv_trans_adding(void)
++{
++    /* Inverse transform input coefficients are stored in a 16-bit buffer
++     * with row stride of 8 coefficients irrespective of transform size. */
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
++
++    /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
++     * added with saturation to an array of unsigned 8-bit values. Oversize
++     * this by 8 samples left and right and one row above and below. */
++    LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
++    LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
++
++    VC1DSPContext h;
++
++    const test tests[] = {
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
++    };
++
++    ff_vc1dsp_init(&h);
++
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
++        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++            matrix *coeffs;
++            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
++            RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
++            RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
++            coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
++            for (int j = 0; j < tests[t].height; ++j)
++                for (int i = 0; i < tests[t].width; ++i) {
++                    int idx = j * 8 + i;
++                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
++                }
++            call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
++            call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
++            if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
++                fail();
++            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); /* no +8: this buffer has no guard row */
++            av_free(coeffs);
++        }
++    }
++}
++
++static void check_loop_filter(void)
++{
++    /* Deblocking filter buffers are big enough to hold a 16x16 block,
++     * plus 16 columns left and 4 rows above to hold filter inputs
++     * (depending on whether v or h neighbouring block edge, oversized
++     * horizontally to maintain 16-byte alignment) plus 16 columns and
++     * 4 rows below to catch write overflows */
++    LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
++    LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
++
++    VC1DSPContext h;
++
++    const test tests[] = {
++        VC1DSP_TEST(vc1_v_loop_filter4)
++        VC1DSP_TEST(vc1_h_loop_filter4)
++        VC1DSP_TEST(vc1_v_loop_filter8)
++        VC1DSP_TEST(vc1_h_loop_filter8)
++        VC1DSP_TEST(vc1_v_loop_filter16)
++        VC1DSP_TEST(vc1_h_loop_filter16)
++    };
++
++    ff_vc1dsp_init(&h);
++
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
++        declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
++        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++            for (int count = 1000; count > 0; --count) {
++                int pq = rnd() % 31 + 1;
++                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
++                call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
++                call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
++                if (memcmp(filter_buf0, filter_buf1, 24 * 48))
++                    fail();
++            }
++        }
++        for (int j = 0; j < 24; ++j)
++            for (int i = 0; i < 48; ++i)
++                filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
++        if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
++            bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
++        if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
++            bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
++    }
++}
++
++#define TEST_UNESCAPE                                                                               \
++    do {                                                                                            \
++        for (int count = 100; count > 0; --count) {                                                 \
++            escaped_offset = rnd() & 7;                                                             \
++            unescaped_offset = rnd() & 7;                                                           \
++            escaped_len = (1u << ((rnd() % 8) + 3)) - (rnd() & 7);                                  \
++            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \
++            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
++            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
++            if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE))                  \
++                fail();                                                                             \
++        }                                                                                           \
++    } while (0)
++
++static void check_unescape(void)
++{
++    /* This appears to be a typical length of buffer in use */
++#define LOG2_UNESCAPE_BUF_SIZE 17
++#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
++    LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
++    LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
++    LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
++    LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
++
++    VC1DSPContext h;
++
++    ff_vc1dsp_init(&h);
++
++    if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
++        int len0, len1, escaped_offset, unescaped_offset, escaped_len;
++        declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
++
++        /* Test data which consists of escape sequences packed as tightly as possible */
++        for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
++            escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
++        TEST_UNESCAPE;
++
++        /* Test random data */
++        RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
++        TEST_UNESCAPE;
++
++        /* Test data with escape sequences at random intervals */
++        for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
++            int gap, gap_msb;
++            escaped1[x+0] = escaped0[x+0] = 0;
++            escaped1[x+1] = escaped0[x+1] = 0;
++            escaped1[x+2] = escaped0[x+2] = 3;
++            escaped1[x+3] = escaped0[x+3] = rnd() & 3;
++            gap_msb = 2u << (rnd() % 8);
++            gap = (rnd() &~ -gap_msb) | gap_msb;
++            x += gap;
++        }
++        TEST_UNESCAPE;
++
++        /* Test data which is known to contain no escape sequences */
++        memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
++        memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
++        TEST_UNESCAPE;
++
++        /* Benchmark the no-escape-sequences case */
++        bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
++    }
++}
++
++void checkasm_check_vc1dsp(void)
++{
++    check_inv_trans_inplace();
++    check_inv_trans_adding();
++    report("inv_trans");
++
++    check_loop_filter();
++    report("loop_filter");
++
++    check_unescape();
++    report("unescape_buffer");
++}
+--- a/tests/fate/checkasm.mak
++++ b/tests/fate/checkasm.mak
+@@ -16,6 +16,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
+                 fate-checkasm-hevc_add_res                              \
+                 fate-checkasm-hevc_idct                                 \
+                 fate-checkasm-hevc_sao                                  \
++                fate-checkasm-idctdsp                                   \
+                 fate-checkasm-jpeg2000dsp                               \
+                 fate-checkasm-llviddsp                                  \
+                 fate-checkasm-llviddspenc                               \
+@@ -27,6 +28,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
+                 fate-checkasm-sw_scale                                  \
+                 fate-checkasm-v210dec                                   \
+                 fate-checkasm-v210enc                                   \
++                fate-checkasm-vc1dsp                                    \
+                 fate-checkasm-vf_blend                                  \
+                 fate-checkasm-vf_colorspace                             \
+                 fate-checkasm-vf_eq                                     \
diff --git a/.github/scripts/Linux/arm/ffmpeg-arm-patches/fix_flags.diff b/.github/scripts/Linux/arm/ffmpeg-arm-patches/fix_flags.diff
index 08f62ad5f..6175d5be6 100644
--- a/.github/scripts/Linux/arm/ffmpeg-arm-patches/fix_flags.diff
+++ b/.github/scripts/Linux/arm/ffmpeg-arm-patches/fix_flags.diff
@@ -1,6 +1,6 @@
 --- a/configure
 +++ b/configure
-@@ -6467,11 +6467,9 @@ enabled mbedtls           && { check_pkg
+@@ -6471,11 +6471,9 @@ enabled mbedtls           && { check_pkg
                                 die "ERROR: mbedTLS not found"; }
  enabled mediacodec        && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
  ( enabled rpi ||