From aaac42ee3a62831f2f00190e9578f0840b0001bd Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 14 Mar 2024 11:26:34 +0000 Subject: [PATCH 001/391] Propagate offset constant into Neon averaging helper functions Averaging helper functions for high bitdepth compound convolutions already have a specialized implementation on the bitdepth, but the offset was still computed in the calling convolution functions. Move the computation inside the averaging function, allowing for some terms to become known at compile time. Change-Id: I8ba8cb1bb0c68ea9321d56e4da9027a21243efda --- .../arm/highbd_compound_convolve_neon.c | 43 +++++++------------ .../arm/highbd_compound_convolve_neon.h | 33 ++++++++++---- .../arm/highbd_compound_convolve_sve2.c | 11 ++--- 3 files changed, 44 insertions(+), 43 deletions(-) diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c index 05773393d7..c93a1d4e28 100644 --- a/av1/common/arm/highbd_compound_convolve_neon.c +++ b/av1/common/arm/highbd_compound_convolve_neon.c @@ -486,9 +486,6 @@ void av1_highbd_dist_wtd_convolve_x_neon( const int im_stride = MAX_SB_SIZE; const int horiz_offset = filter_params_x->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int offset_avg = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); const int offset_convolve = (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)); @@ -511,10 +508,10 @@ void av1_highbd_dist_wtd_convolve_x_neon( } if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, - w, h, conv_params, offset_avg, bd); + w, h, conv_params); } else { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, offset_avg, bd); + conv_params); } } else { if (x_filter_taps <= 6 && w != 4) { @@ -538,10 +535,10 @@ void av1_highbd_dist_wtd_convolve_x_neon( } if (conv_params->use_dist_wtd_comp_avg) { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, - h, conv_params, offset_avg, bd); + h, conv_params, bd); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, offset_avg, bd); + conv_params, bd); } } else { if (x_filter_taps <= 6 && w != 4) { @@ -891,9 +888,6 @@ void av1_highbd_dist_wtd_convolve_y_neon( const int im_stride = MAX_SB_SIZE; const int vert_offset = filter_params_y->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int round_offset_avg = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); const int round_offset_conv = (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)); @@ -916,11 +910,10 @@ void av1_highbd_dist_wtd_convolve_y_neon( } if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, - w, h, conv_params, round_offset_avg, - bd); + w, h, conv_params); } else { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, round_offset_avg, bd); + conv_params); } } else { if (y_filter_taps <= 6) { @@ -946,10 +939,10 @@ void av1_highbd_dist_wtd_convolve_y_neon( } if (conv_params->use_dist_wtd_comp_avg) { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, - h, conv_params, 
round_offset_avg, bd); + h, conv_params, bd); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, round_offset_avg, bd); + conv_params, bd); } } else { if (y_filter_taps <= 6) { @@ -1028,18 +1021,18 @@ void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src, if (conv_params->use_dist_wtd_comp_avg) { if (bd == 12) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, - w, h, conv_params, round_offset, bd); + w, h, conv_params); } else { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, - h, conv_params, round_offset, bd); + h, conv_params, bd); } } else { if (bd == 12) { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, round_offset, bd); + conv_params); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, round_offset, bd); + conv_params, bd); } } } @@ -1692,9 +1685,6 @@ void av1_highbd_dist_wtd_convolve_2d_neon( (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1)); const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset_conv_y = (1 << y_offset_bits); - const int round_offset_avg = - ((1 << (y_offset_bits - conv_params->round_1)) + - (1 << (y_offset_bits - conv_params->round_1 - 1))); const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; @@ -1755,19 +1745,18 @@ void av1_highbd_dist_wtd_convolve_2d_neon( if (conv_params->use_dist_wtd_comp_avg) { if (bd == 12) { highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, - w, h, conv_params, round_offset_avg, - bd); + w, h, conv_params); } else { highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, - h, conv_params, round_offset_avg, bd); + h, conv_params, bd); } } else { if (bd == 12) { highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, - conv_params, round_offset_avg, bd); + conv_params); } else { highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, - conv_params, round_offset_avg, bd); + conv_params, bd); } } } diff --git a/av1/common/arm/highbd_compound_convolve_neon.h b/av1/common/arm/highbd_compound_convolve_neon.h index efe70440fa..c9344f3adf 100644 --- a/av1/common/arm/highbd_compound_convolve_neon.h +++ b/av1/common/arm/highbd_compound_convolve_neon.h @@ -24,12 +24,15 @@ static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, - ConvolveParams *conv_params, - const int offset, const int bd) { + ConvolveParams *conv_params) { + const int offset_bits = 12 + 2 * FILTER_BITS - ROUND0_BITS - 2; + const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + CONV_BUF_TYPE *ref_ptr = conv_params->dst; const int ref_stride = conv_params->dst_stride; - const uint16x4_t offset_vec = vdup_n_u16(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const uint16x4_t offset_vec = vdup_n_u16((uint16_t)offset); + const uint16x8_t max = vdupq_n_u16((1 << 12) - 1); if (w == 4) { do { @@ -86,10 +89,14 @@ static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, ConvolveParams *conv_params, - const int offset, const int bd) { + const int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + CONV_BUF_TYPE *ref_ptr = 
conv_params->dst; const int ref_stride = conv_params->dst_stride; - const uint16x4_t offset_vec = vdup_n_u16(offset); + const uint16x4_t offset_vec = vdup_n_u16((uint16_t)offset); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); if (w == 4) { @@ -145,11 +152,15 @@ static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, static INLINE void highbd_12_dist_wtd_comp_avg_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, - int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { + int w, int h, ConvolveParams *conv_params) { + const int offset_bits = 12 + 2 * FILTER_BITS - ROUND0_BITS - 2; + const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + CONV_BUF_TYPE *ref_ptr = conv_params->dst; const int ref_stride = conv_params->dst_stride; const uint32x4_t offset_vec = vdupq_n_u32(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const uint16x8_t max = vdupq_n_u16((1 << 12) - 1); uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); @@ -212,7 +223,11 @@ static INLINE void highbd_12_dist_wtd_comp_avg_neon( static INLINE void highbd_dist_wtd_comp_avg_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, - int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { + int w, int h, ConvolveParams *conv_params, const int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + CONV_BUF_TYPE *ref_ptr = conv_params->dst; const int ref_stride = conv_params->dst_stride; const uint32x4_t offset_vec = vdupq_n_u32(offset); diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c index f7eda226ef..b36e01f2fc 100644 --- a/av1/common/arm/highbd_compound_convolve_sve2.c +++ b/av1/common/arm/highbd_compound_convolve_sve2.c @@ -223,9 +223,6 @@ void av1_highbd_dist_wtd_convolve_x_sve2( const int im_stride = MAX_SB_SIZE; const int horiz_offset = filter_params_x->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int offset_avg = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); const int offset_convolve = (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)); @@ -249,21 +246,21 @@ void av1_highbd_dist_wtd_convolve_x_sve2( if (conv_params->use_dist_wtd_comp_avg) { if (bd == 12) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, - w, h, conv_params, offset_avg, bd); + w, h, conv_params); } else { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, - h, conv_params, offset_avg, bd); + h, conv_params, bd); } } else { if (bd == 12) { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, offset_avg, bd); + conv_params); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, offset_avg, bd); + conv_params, bd); } } } else { -- GitLab From 416c1a9ff3198f9f4833ba6809ca65eba649273f Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 7 Mar 2024 12:14:27 +0000 Subject: [PATCH 002/391] Specialize 8-tap HBD dist_wtd_convolve_x_sve2 on bitdepth The rounding value at the end of the convolution 
depends on the bitdepth (8/10, or 12). Add 2 specialized versions of the function, so that we know the rounding value at compile time and therefore use only one instruction instead of two to perform the final rounding and narrowing step. This gives up to 10% uplift over the non-specialized version. Change-Id: Iba3ef98eb82e44d0f67860721a957fb73589f71a --- .../arm/highbd_compound_convolve_sve2.c | 131 +++++++++++++----- 1 file changed, 100 insertions(+), 31 deletions(-) diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c index b36e01f2fc..8977538c4e 100644 --- a/av1/common/arm/highbd_compound_convolve_sve2.c +++ b/av1/common/arm/highbd_compound_convolve_sve2.c @@ -31,8 +31,9 @@ DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2, }; -static INLINE uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter, - int64x2_t offset, int32x4_t shift) { +static INLINE uint16x8_t highbd_12_convolve8_8_x(int16x8_t s0[8], + int16x8_t filter, + int64x2_t offset) { int64x2_t sum[8]; sum[0] = aom_sdotq_s16(offset, s0[0], filter); sum[1] = aom_sdotq_s16(offset, s0[1], filter); @@ -51,22 +52,85 @@ static INLINE uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter, int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); - sum0123 = vshlq_s32(sum0123, shift); - sum4567 = vshlq_s32(sum4567, shift); + return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS + 2), + vqrshrun_n_s32(sum4567, ROUND0_BITS + 2)); +} - return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); +static INLINE void highbd_12_dist_wtd_convolve_x_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr) { + const int64x1_t offset_vec = + vcreate_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); + const int64x2_t offset_lo = vcombine_s64(offset_vec, vdup_n_s64(0)); + + const int16x8_t filter = vld1q_s16(x_filter_ptr); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset_lo); + uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset_lo); + uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset_lo); + uint16x8_t d3 = highbd_12_convolve8_8_x(s3, filter, offset_lo); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); } -static INLINE void highbd_dist_wtd_convolve_x_sve2( +static INLINE uint16x8_t highbd_convolve8_8_x(int16x8_t s0[8], int16x8_t filter, + int64x2_t offset) { + int64x2_t sum[8]; + sum[0] = aom_sdotq_s16(offset, s0[0], filter); + sum[1] = aom_sdotq_s16(offset, s0[1], filter); + sum[2] = aom_sdotq_s16(offset, s0[2], filter); + sum[3] = aom_sdotq_s16(offset, s0[3], filter); + sum[4] = aom_sdotq_s16(offset, s0[4], filter); + sum[5] 
= aom_sdotq_s16(offset, s0[5], filter); + sum[6] = aom_sdotq_s16(offset, s0[6], filter); + sum[7] = aom_sdotq_s16(offset, s0[7], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + sum[4] = vpaddq_s64(sum[4], sum[5]); + sum[6] = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + + return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS), + vqrshrun_n_s32(sum4567, ROUND0_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_x_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, - int width, int height, const int16_t *x_filter_ptr, - ConvolveParams *conv_params, const int offset) { - const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); - const int64x2_t offset_vec = vdupq_n_s64(offset); + int width, int height, const int16_t *x_filter_ptr, const int bd) { + const int64x1_t offset_vec = + vcreate_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); + const int64x2_t offset_lo = vcombine_s64(offset_vec, vdup_n_s64(0)); - const int64x2_t offset_lo = - vcombine_s64(vget_low_s64(offset_vec), vdup_n_s64(0)); const int16x8_t filter = vld1q_s16(x_filter_ptr); + do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; @@ -83,10 +147,10 @@ static INLINE void highbd_dist_wtd_convolve_x_sve2( load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); - uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, shift); - uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, shift); - uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, shift); - uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, shift); + uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset_lo); + uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset_lo); + uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset_lo); + uint16x8_t d3 = highbd_convolve8_8_x(s3, filter, offset_lo); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -142,8 +206,10 @@ DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { static INLINE void highbd_dist_wtd_convolve_x_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, - ConvolveParams *conv_params, const int offset) { - // This shim allows to do only one rounding shift instead of two. 
+ ConvolveParams *conv_params, const int bd) { + const int offset = (1 << (conv_params->round_0 - 1)) + + (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)); + const int64x2_t offset_s64 = vdupq_n_s64(offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); @@ -223,9 +289,6 @@ void av1_highbd_dist_wtd_convolve_x_sve2( const int im_stride = MAX_SB_SIZE; const int horiz_offset = filter_params_x->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); - const int offset_convolve = (1 << (conv_params->round_0 - 1)) + - (1 << (bd + FILTER_BITS)) + - (1 << (bd + FILTER_BITS - 1)); const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); @@ -236,13 +299,16 @@ void av1_highbd_dist_wtd_convolve_x_sve2( if (x_filter_taps <= 4) { highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, im_stride, w, h, x_filter_ptr, - conv_params, offset_convolve); + conv_params, bd); } else { - highbd_dist_wtd_convolve_x_sve2(src, src_stride, im_block, im_stride, w, - h, x_filter_ptr, conv_params, - offset_convolve); + if (bd == 12) { + highbd_12_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, + im_stride, w, h, x_filter_ptr); + } else { + highbd_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, + im_stride, w, h, x_filter_ptr, bd); + } } - if (conv_params->use_dist_wtd_comp_avg) { if (bd == 12) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, @@ -257,7 +323,6 @@ void av1_highbd_dist_wtd_convolve_x_sve2( if (bd == 12) { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); - } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); @@ -267,11 +332,15 @@ void av1_highbd_dist_wtd_convolve_x_sve2( if (x_filter_taps <= 4) { highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, - conv_params, offset_convolve); + conv_params, bd); } else { - highbd_dist_wtd_convolve_x_sve2(src, src_stride, dst16, dst16_stride, w, - h, x_filter_ptr, conv_params, - offset_convolve); + if (bd == 12) { + highbd_12_dist_wtd_convolve_x_8tap_sve2( + src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); + } else { + highbd_dist_wtd_convolve_x_8tap_sve2( + src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, bd); + } } } } -- GitLab From 8cb23f865eb58e2772c098d8044b4edfdfbdd03d Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 8 Mar 2024 16:13:57 +0000 Subject: [PATCH 003/391] Specialize 4-tap HBD dist_wtd_convolve_x_sve2 on bitdepth The rounding value at the end of the convolution depends on the bitdepth (8/10, or 12). Add 2 specialized versions of the function, so that we know the rounding value at compile time and therefore use only one instruction instead of two to perform the final rounding and narrowing step. This gives up to 20% uplift over the non-specialized version. 
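To make the saving concrete, here is a minimal standalone sketch of the two
rounding/narrowing tails (illustrative only, not code from this patch). The
generic path folds the +0.5 rounding term into the accumulator offset and
then needs a run-time shift plus a saturating narrow, while a compile-time
shift amount lets a single rounding-shift-narrow instruction do both:

    #include <arm_neon.h>

    #define ROUND0_BITS 3  // mirrors av1/common/convolve.h

    // Generic tail: the shift amount (conv_params->round_0) is only known
    // at run time, so the rounding bias has been pre-added into the
    // accumulator offset and two instructions are needed per vector.
    static inline uint16x4_t narrow_generic(int32x4_t sum,
                                            int32x4_t neg_shift) {
      sum = vshlq_s32(sum, neg_shift);  // SSHL by a negative amount
      return vqmovun_s32(sum);          // SQXTUN: saturating narrow to u16
    }

    // Specialized tail: for bd == 12 the shift is the compile-time constant
    // ROUND0_BITS + 2, so SQRSHRUN rounds, shifts and narrows in one
    // instruction.
    static inline uint16x4_t narrow_12bit(int32x4_t sum) {
      return vqrshrun_n_s32(sum, ROUND0_BITS + 2);
    }

This is also why the specialized kernels build their offsets from only the
(1 << (bd + FILTER_BITS)) terms: the (1 << (round_0 - 1)) rounding bias now
comes from the shift instruction itself.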
Change-Id: Id83a813ddefbb61704adf4b59d93f6910160f22d --- .../arm/highbd_compound_convolve_sve2.c | 212 +++++++++++++----- 1 file changed, 152 insertions(+), 60 deletions(-) diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c index 8977538c4e..dc983c5f8b 100644 --- a/av1/common/arm/highbd_compound_convolve_sve2.c +++ b/av1/common/arm/highbd_compound_convolve_sve2.c @@ -164,9 +164,15 @@ static INLINE void highbd_dist_wtd_convolve_x_8tap_sve2( } while (height != 0); } -static INLINE uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter, - int64x2_t offset, int32x4_t shift, - uint16x8x2_t permute_tbl) { +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; +// clang-format on + +static INLINE uint16x4_t highbd_12_convolve4_4_x(int16x8_t s0, int16x8_t filter, + int64x2_t offset, + uint16x8x2_t permute_tbl) { int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); @@ -174,44 +180,124 @@ static INLINE uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter, int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); - sum0123 = vshlq_s32(sum0123, shift); - return vqmovun_s32(sum0123); + return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2); } -static INLINE uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter, - int64x2_t offset, int32x4_t shift, - uint16x8_t tbl) { +static INLINE uint16x8_t highbd_12_convolve4_8_x(int16x8_t s0[4], + int16x8_t filter, + int64x2_t offset, + uint16x8_t tbl) { int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); - sum0415 = vshlq_s32(sum0415, shift); - int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); - sum2637 = vshlq_s32(sum2637, shift); - uint16x8_t res = vcombine_u16(vqmovun_s32(sum0415), vqmovun_s32(sum2637)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, ROUND0_BITS + 2), + vqrshrun_n_s32(sum2637, ROUND0_BITS + 2)); return aom_tbl_u16(res, tbl); } -// clang-format off -DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { - 0, 2, 4, 6, 1, 3, 5, 7, -}; -// clang-format on +static INLINE void highbd_12_dist_wtd_convolve_x_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr) { + const int64x2_t offset = + vdupq_n_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); + + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + if (width == 4) { + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); + uint16x4_t d3 = highbd_12_convolve4_4_x(s3, filter, offset, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst 
+= 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); + uint16x8_t d3 = highbd_12_convolve4_8_x(s3, filter, offset, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4_x(int16x8_t s0, int16x8_t filter, + int64x2_t offset, + uint16x8x2_t permute_tbl) { + int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); + + int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + return vqrshrun_n_s32(sum0123, ROUND0_BITS); +} + +static INLINE uint16x8_t highbd_convolve4_8_x(int16x8_t s0[4], int16x8_t filter, + int64x2_t offset, + uint16x8_t tbl) { + int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + + int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, ROUND0_BITS), + vqrshrun_n_s32(sum2637, ROUND0_BITS)); + return aom_tbl_u16(res, tbl); +} static INLINE void highbd_dist_wtd_convolve_x_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, - int width, int height, const int16_t *x_filter_ptr, - ConvolveParams *conv_params, const int bd) { - const int offset = (1 << (conv_params->round_0 - 1)) + - (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)); - - const int64x2_t offset_s64 = vdupq_n_s64(offset); - const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + int width, int height, const int16_t *x_filter_ptr, const int bd) { + const int64x2_t offset = + vdupq_n_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); @@ -225,10 +311,10 @@ static INLINE void highbd_dist_wtd_convolve_x_4tap_sve2( int16x8_t s0, s1, s2, s3; load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); - uint16x4_t d0 = convolve4_4_x(s0, filter, offset_s64, shift, permute_tbl); - uint16x4_t d1 = convolve4_4_x(s1, filter, offset_s64, shift, permute_tbl); - uint16x4_t d2 = convolve4_4_x(s2, filter, offset_s64, shift, permute_tbl); - uint16x4_t d3 = convolve4_4_x(s3, filter, offset_s64, shift, permute_tbl); + uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = 
highbd_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); + uint16x4_t d3 = highbd_convolve4_4_x(s3, filter, offset, permute_tbl); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); @@ -251,10 +337,10 @@ static INLINE void highbd_dist_wtd_convolve_x_4tap_sve2( load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); - uint16x8_t d0 = convolve4_8_x(s0, filter, offset_s64, shift, idx); - uint16x8_t d1 = convolve4_8_x(s1, filter, offset_s64, shift, idx); - uint16x8_t d2 = convolve4_8_x(s2, filter, offset_s64, shift, idx); - uint16x8_t d3 = convolve4_8_x(s3, filter, offset_s64, shift, idx); + uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx); + uint16x8_t d3 = highbd_convolve4_8_x(s3, filter, offset, idx); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -295,48 +381,54 @@ void av1_highbd_dist_wtd_convolve_x_sve2( src -= horiz_offset; - if (conv_params->do_average) { - if (x_filter_taps <= 4) { - highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, - im_stride, w, h, x_filter_ptr, - conv_params, bd); - } else { - if (bd == 12) { - highbd_12_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, + if (bd == 12) { + if (conv_params->do_average) { + if (x_filter_taps <= 4) { + highbd_12_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, im_stride, w, h, x_filter_ptr); } else { - highbd_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, - im_stride, w, h, x_filter_ptr, bd); + highbd_12_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, + im_stride, w, h, x_filter_ptr); } - } - if (conv_params->use_dist_wtd_comp_avg) { - if (bd == 12) { + + if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } else { - highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, - h, conv_params, bd); - } - - } else { - if (bd == 12) { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); + } + } else { + if (x_filter_taps <= 4) { + highbd_12_dist_wtd_convolve_x_4tap_sve2( + src + 2, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); } else { - highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, bd); + highbd_12_dist_wtd_convolve_x_8tap_sve2( + src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); } } } else { - if (x_filter_taps <= 4) { - highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, dst16, - dst16_stride, w, h, x_filter_ptr, - conv_params, bd); + if (conv_params->do_average) { + if (x_filter_taps <= 4) { + highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, + im_stride, w, h, x_filter_ptr, bd); + } else { + highbd_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, + im_stride, w, h, x_filter_ptr, bd); + } + + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, bd); + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, bd); + } } else { - if (bd == 12) { - highbd_12_dist_wtd_convolve_x_8tap_sve2( - src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); + if (x_filter_taps <= 4) { + highbd_dist_wtd_convolve_x_4tap_sve2( + src + 2, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, 
bd); } else { highbd_dist_wtd_convolve_x_8tap_sve2( src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, bd); -- GitLab From 3bdf0fc289783bf14caa9ad724f9fd3ad9b46435 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 23 Feb 2024 16:27:34 +0000 Subject: [PATCH 004/391] Add 4-tap specialization of aom_highbd_convolve8_horiz_sve This function is run mostly with 2-tap and 4-tap filters, so add a specialised path that allows to use half as many instructions as the generic 8-tap path for these filters. Change-Id: I2804cdfcfffc993b33adcfe1a02e40c724dbce08 --- aom_dsp/arm/aom_filter.h | 33 +++++++ aom_dsp/arm/highbd_convolve8_sve.c | 153 ++++++++++++++++++++++++++--- 2 files changed, 170 insertions(+), 16 deletions(-) create mode 100644 aom_dsp/arm/aom_filter.h diff --git a/aom_dsp/arm/aom_filter.h b/aom_dsp/arm/aom_filter.h new file mode 100644 index 0000000000..9972d064fc --- /dev/null +++ b/aom_dsp/arm/aom_filter.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_ARM_AOM_FILTER_H_ +#define AOM_AOM_DSP_ARM_AOM_FILTER_H_ + +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE int get_filter_taps_convolve8(const int16_t *filter) { + if (filter[0] | filter[7]) { + return 8; + } + if (filter[1] | filter[6]) { + return 6; + } + if (filter[2] | filter[5]) { + return 4; + } + return 2; +} + +#endif // AOM_AOM_DSP_ARM_AOM_FILTER_H_ diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c index 46131b9736..189d11b14d 100644 --- a/aom_dsp/arm/highbd_convolve8_sve.c +++ b/aom_dsp/arm/highbd_convolve8_sve.c @@ -17,6 +17,7 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter, @@ -63,22 +64,10 @@ static INLINE uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter, return vminq_u16(res, max); } -void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int width, int height, int bd) { - assert(x_step_q4 == 16); - assert(width >= 4 && height >= 4); - (void)filter_y; - (void)x_step_q4; - (void)y_step_q4; - - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - - src -= SUBPEL_TAPS / 2 - 1; - +static INLINE void highbd_convolve8_horiz_8tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, + int bd) { const int16x8_t filter = vld1q_s16(filter_x); if (width == 4) { @@ -140,6 +129,138 @@ void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride, } } +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[16]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, +}; + +DECLARE_ALIGNED(16, 
static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; +// clang-format on + +static INLINE uint16x4_t highbd_convolve4_4_h(int16x8_t s, int16x8_t filter, + uint16x8x2_t permute_tbl, + uint16x4_t max) { + int16x8_t permuted_samples0 = aom_tbl_s16(s, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s, permute_tbl.val[1]); + + int64x2_t sum0 = + aom_svdot_lane_s16(vdupq_n_s64(0), permuted_samples0, filter, 0); + int64x2_t sum1 = + aom_svdot_lane_s16(vdupq_n_s64(0), permuted_samples1, filter, 0); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1)); + uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_h(int16x8_t s[4], int16x8_t filter, + uint16x8_t idx, uint16x8_t max) { + int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + + res = aom_tbl_u16(res, idx); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_horiz_4tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, + int bd) { + const int16x8_t filter = vcombine_s16(vld1_s16(filter_x + 2), vdup_n_s16(0)); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = highbd_convolve4_4_h(s0, filter, permute_tbl, max); + uint16x4_t d1 = highbd_convolve4_4_h(s1, filter, permute_tbl, max); + uint16x4_t d2 = highbd_convolve4_4_h(s2, filter, permute_tbl, max); + uint16x4_t d3 = highbd_convolve4_4_h(s3, filter, permute_tbl, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_convolve4_8_h(s0, filter, idx, max); + uint16x8_t d1 = highbd_convolve4_8_h(s1, filter, idx, max); + uint16x8_t d2 = highbd_convolve4_8_h(s2, filter, idx, max); + uint16x8_t d3 = highbd_convolve4_8_h(s3, filter, idx, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, 
+ const int16_t *filter_y, int y_step_q4, + int width, int height, int bd) { + assert(x_step_q4 == 16); + assert(width >= 4 && height >= 4); + (void)filter_y; + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= SUBPEL_TAPS / 2 - 1; + + if (get_filter_taps_convolve8(filter_x) <= 4) { + highbd_convolve8_horiz_4tap_sve(src + 2, src_stride, dst, dst_stride, + filter_x, width, height, bd); + } else { + highbd_convolve8_horiz_8tap_sve(src, src_stride, dst, dst_stride, filter_x, + width, height, bd); + } +} + DECLARE_ALIGNED(16, static const uint8_t, kDotProdTranConcatTbl[32]) = { 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31 -- GitLab From 8398313499003afa5129e0e367ccc36fc33cab7f Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Mon, 4 Mar 2024 11:41:27 +0000 Subject: [PATCH 005/391] Optimize transpose functions in aom_highbd_convolve8_vert_sve Use ZIP instructions instead of TBL to transpose and concatenate elements in the SVE implementation of aom_highbd_convolve8_vert. This removed the need to load a table and gives up to 10% uplift. Change-Id: I92ad082512f263393cb5def409f8dccbb5278016 --- aom_dsp/arm/highbd_convolve8_sve.c | 80 ++++++++++++------------------ 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c index 189d11b14d..9830b7e5d8 100644 --- a/aom_dsp/arm/highbd_convolve8_sve.c +++ b/aom_dsp/arm/highbd_convolve8_sve.c @@ -261,11 +261,6 @@ void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride, } } -DECLARE_ALIGNED(16, static const uint8_t, kDotProdTranConcatTbl[32]) = { - 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, - 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31 -}; - DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { // Shift left and insert new last column in transposed 4x4 block. 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25, @@ -277,8 +272,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, int16x4_t s2, int16x4_t s3, - int16x8_t res[2], - uint8x16_t permute_tbl[2]) { + int16x8_t res[2]) { // Transpose 16-bit elements and concatenate result rows as follows: // s0: 00, 01, 02, 03 // s1: 10, 11, 12, 13 @@ -287,22 +281,24 @@ static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, // // res[0]: 00 10 20 30 01 11 21 31 // res[1]: 02 12 22 32 03 13 23 33 - // - // The 'permute_tbl' is always 'kDotProdTranConcatTbl' above. Passing it - // as an argument is preferable to loading it directly from memory as this - // inline helper is called many times from the same parent function. 
- int8x16x2_t samples = { vreinterpretq_s8_s16(vcombine_s16(s0, s1)), - vreinterpretq_s8_s16(vcombine_s16(s2, s3)) }; + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); + + int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); + int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); - res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples, permute_tbl[0])); - res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples, permute_tbl[1])); + int32x4x2_t s0123 = vzipq_s32(s01, s23); + + res[0] = vreinterpretq_s16_s32(s0123.val[0]); + res[1] = vreinterpretq_s16_s32(s0123.val[1]); } static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, int16x8_t s2, int16x8_t s3, - int16x8_t res[4], - uint8x16_t permute_tbl[2]) { + int16x8_t res[4]) { // Transpose 16-bit elements and concatenate result rows as follows: // s0: 00, 01, 02, 03, 04, 05, 06, 07 // s1: 10, 11, 12, 13, 14, 15, 16, 17 @@ -313,26 +309,19 @@ static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, // res_lo[1]: 02 12 22 32 03 13 23 33 // res_hi[0]: 04 14 24 34 05 15 25 35 // res_hi[1]: 06 16 26 36 07 17 27 37 - // - // The 'permute_tbl' is always 'kDotProdTranConcatTbl' above. Passing it - // as an argument is preferable to loading it directly from memory as this - // inline helper is called many times from the same parent function. - - int8x16x2_t samples_lo = { - vreinterpretq_s8_s16(vcombine_s16(vget_low_s16(s0), vget_low_s16(s1))), - vreinterpretq_s8_s16(vcombine_s16(vget_low_s16(s2), vget_low_s16(s3))) - }; - res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_lo, permute_tbl[0])); - res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_lo, permute_tbl[1])); + int16x8x2_t tr01_16 = vzipq_s16(s0, s1); + int16x8x2_t tr23_16 = vzipq_s16(s2, s3); - int8x16x2_t samples_hi = { - vreinterpretq_s8_s16(vcombine_s16(vget_high_s16(s0), vget_high_s16(s1))), - vreinterpretq_s8_s16(vcombine_s16(vget_high_s16(s2), vget_high_s16(s3))) - }; + int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]), + vreinterpretq_s32_s16(tr23_16.val[0])); + int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]), + vreinterpretq_s32_s16(tr23_16.val[1])); - res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_hi, permute_tbl[0])); - res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_hi, permute_tbl[1])); + res[0] = vreinterpretq_s16_s32(tr01_32.val[0]); + res[1] = vreinterpretq_s16_s32(tr01_32.val[1]); + res[2] = vreinterpretq_s16_s32(tr23_32.val[0]); + res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); } static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], @@ -427,9 +416,6 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, const int16x8_t y_filter = vld1q_s16(filter_y); - uint8x16_t tran_concat_tbl[2]; - tran_concat_tbl[0] = vld1q_u8(kDotProdTranConcatTbl); - tran_concat_tbl[1] = vld1q_u8(kDotProdTranConcatTbl + 16); uint8x16_t merge_block_tbl[3]; merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl); merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16); @@ -446,10 +432,10 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, // This operation combines a conventional transpose and the sample permute // required before computing the dot product. 
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; - transpose_concat_4x4(s0, s1, s2, s3, s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, s3456, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); do { int16x4_t s7, s8, s9, s10; @@ -458,7 +444,7 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, int16x8_t s4567[2], s5678[2], s6789[2], s78910[2]; // Transpose and shuffle the 4 lines that were loaded. - transpose_concat_4x4(s7, s8, s9, s10, s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, s78910); // Merge new data into block from previous iteration. aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567); @@ -501,10 +487,10 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; - transpose_concat_8x4(s0, s1, s2, s3, s0123, tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, s1234, tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, s2345, tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, s3456, tran_concat_tbl); + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); do { int16x8_t s7, s8, s9, s10; @@ -513,7 +499,7 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, int16x8_t s4567[4], s5678[4], s6789[4], s78910[4]; // Transpose and shuffle the 4 lines that were loaded. - transpose_concat_8x4(s7, s8, s9, s10, s78910, tran_concat_tbl); + transpose_concat_8x4(s7, s8, s9, s10, s78910); // Merge new data into block from previous iteration. aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567); -- GitLab From cd0a8b7cc087bd02227d4af7001972ef244e3988 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 23 Feb 2024 17:57:08 +0000 Subject: [PATCH 006/391] Add 4-tap specialization of aom_highbd_convolve8_vert_sve This function is run mostly with 2-tap and 4-tap filters, so add a specialised path that allows to use half as many instructions as the generic 8-tap path for these filters. 
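As a reference point, here is a scalar model of the per-pixel work
(illustrative only, not the patch's SIMD code; FILTER_BITS is the codec's
7-bit filter precision). Each 64-bit dot-product lane in the SVE2 kernels
holds four 16-bit samples, so an 8-tap filter needs two chained
aom_svdot_lane_s16 calls per accumulator where a 4-tap filter needs one,
roughly halving the instruction count:

    #include <stddef.h>
    #include <stdint.h>

    #define FILTER_BITS 7  // mirrors aom_dsp/aom_filter.h

    // Scalar model of one vertical output pixel: a 'taps'-point FIR down a
    // column, rounded by FILTER_BITS and clamped to [0, (1 << bd) - 1],
    // matching the vqrshrun_n_s32 + vmin sequence in the vector kernels.
    static inline uint16_t convolve_vert_pixel(const uint16_t *col,
                                               ptrdiff_t stride,
                                               const int16_t *filter,
                                               int taps, int bd) {
      int32_t sum = 0;
      for (int k = 0; k < taps; ++k) {
        sum += (int32_t)col[k * stride] * filter[k];
      }
      sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
      if (sum < 0) sum = 0;
      const int32_t max = (1 << bd) - 1;
      return (uint16_t)(sum > max ? max : sum);
    }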
Change-Id: Ib273a1b8906541946edc9c11420319a1a66fe390 --- aom_dsp/arm/highbd_convolve8_sve.c | 166 ++++++++++++++++++++++++++--- 1 file changed, 150 insertions(+), 16 deletions(-) diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c index 9830b7e5d8..e57c41a0b0 100644 --- a/aom_dsp/arm/highbd_convolve8_sve.c +++ b/aom_dsp/arm/highbd_convolve8_sve.c @@ -398,22 +398,10 @@ static INLINE uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4], return vminq_u16(res, max); } -void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int width, int height, int bd) { - assert(y_step_q4 == 16); - assert(w >= 4 && h >= 4); - (void)filter_x; - (void)y_step_q4; - (void)x_step_q4; - - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - - src -= (SUBPEL_TAPS / 2 - 1) * src_stride; - +static INLINE void highbd_convolve8_vert_8tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height, + int bd) { const int16x8_t y_filter = vld1q_s16(filter_y); uint8x16_t merge_block_tbl[3]; @@ -545,3 +533,149 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, } while (width != 0); } } + +static INLINE uint16x4_t highbd_convolve4_4_v(int16x8_t s[2], int16x8_t filter, + uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_v(int16x8_t s[4], int16x8_t filter, + uint16x8_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0); + int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0); + + int32x4_t s0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t s4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(s0123, FILTER_BITS), + vqrshrun_n_s32(s4567, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_vert_4tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height, + int bd) { + const int16x8_t y_filter = + vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); + + uint8x16_t merge_block_tbl[3]; + merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl); + merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16); + merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. 
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + uint16x4_t d0 = highbd_convolve4_4_v(s0123, y_filter, max); + uint16x4_t d1 = highbd_convolve4_4_v(s1234, y_filter, max); + uint16x4_t d2 = highbd_convolve4_4_v(s2345, y_filter, max); + uint16x4_t d3 = highbd_convolve4_4_v(s3456, y_filter, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample + // permute required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + uint16x8_t d0 = highbd_convolve4_8_v(s0123, y_filter, max); + uint16x8_t d1 = highbd_convolve4_8_v(s1234, y_filter, max); + uint16x8_t d2 = highbd_convolve4_8_v(s2345, y_filter, max); + uint16x8_t d3 = highbd_convolve4_8_v(s3456, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int width, int height, int bd) { + assert(y_step_q4 == 16); + assert(w >= 4 && h >= 4); + (void)filter_x; + (void)y_step_q4; + (void)x_step_q4; + + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride; + + if (get_filter_taps_convolve8(filter_y) <= 4) { + highbd_convolve8_vert_4tap_sve(src + 2 * src_stride, src_stride, dst, + dst_stride, filter_y, width, height, bd); + } else { + highbd_convolve8_vert_8tap_sve(src, src_stride, dst, dst_stride, filter_y, + width, height, bd); + } +} -- GitLab From 69975b155791efa10f7b1dd01e28d9c57360a26d Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 15 Mar 2024 15:37:39 +0000 Subject: [PATCH 007/391] Fix test sizes in av1_quantize_lp unit tests av1_quantize_lp is only ever used with block sizes 4x4, 8x8 and 16x16, which are the sizes that are already used by the SSE2 tests. Change the AVX2 and Neon tests to reflect that as well. 
Change-Id: Idb3e9f2f36138741838f9e2b6f51c3f4c4f0882f --- test/quantize_func_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc index 328d5b10df..cbcaba1c5e 100644 --- a/test/quantize_func_test.cc +++ b/test/quantize_func_test.cc @@ -482,9 +482,9 @@ const QuantizeParam<LPQuantizeFunc> kLPQParamArrayAvx2[] = { make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2, static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2, - static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8), + static_cast<TX_SIZE>(TX_8X8), TYPE_FP, AOM_BITS_8), make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2, - static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8) + static_cast<TX_SIZE>(TX_4X4), TYPE_FP, AOM_BITS_8) }; INSTANTIATE_TEST_SUITE_P(AVX2, LowPrecisionQuantizeTest, @@ -704,9 +704,9 @@ const QuantizeParam<LPQuantizeFunc> kLPQParamArrayNEON[] = { make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon, static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon, - static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8), + static_cast<TX_SIZE>(TX_8X8), TYPE_FP, AOM_BITS_8), make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon, - static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8) + static_cast<TX_SIZE>(TX_4X4), TYPE_FP, AOM_BITS_8) }; INSTANTIATE_TEST_SUITE_P(NEON, LowPrecisionQuantizeTest, -- GitLab From efb0d985ad2edcfa04e6c92b2a1e2f45952cf6f3 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 19 Mar 2024 10:11:17 -0700 Subject: [PATCH 008/391] Add assert to set_active_map Add assert that mi_row and mi_col should both be above 0. Bug: aomedia:3549 Change-Id: I80fa4115a786cbf35f7d6af0f92d3839c94767c7 --- av1/encoder/encoder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 88862de69d..411eb7000b 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -155,8 +155,8 @@ int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, const int mi_cols = mi_params->mi_cols; cpi->active_map.update = 0; cpi->rc.percent_blocks_inactive = 0; - assert(mi_rows % 2 == 0); - assert(mi_cols % 2 == 0); + assert(mi_rows % 2 == 0 && mi_rows > 0); + assert(mi_cols % 2 == 0 && mi_cols > 0); if (new_map_16x16) { int num_samples = 0; int num_blocks_inactive = 0; -- GitLab From 37108eba4b2336d88cd0c4c5f5170f4ff8213a28 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 19 Mar 2024 15:14:33 -0700 Subject: [PATCH 009/391] Fix spelling in comment percentge - > percentage Change-Id: I8eff543e36d7cd725b085e036e009372755c3459 --- av1/encoder/ratectrl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h index 0dd8e32b78..5121a909f4 100644 --- a/av1/encoder/ratectrl.h +++ b/av1/encoder/ratectrl.h @@ -249,7 +249,7 @@ typedef struct { // signals if number of blocks with motion is high int percent_blocks_with_motion; - // signals percentge of 16x16 blocks that are inactive, via active_maps + // signals percentage of 16x16 blocks that are inactive, via active_maps int percent_blocks_inactive; // Maximum value of source sad across all blocks of frame. 
-- GitLab From 70c64d70437ac49fef461c1ff8b1c7a090acc4d0 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Sat, 16 Mar 2024 23:45:34 -0700 Subject: [PATCH 010/391] rtc: Incorporate active_maps into the cyclic refresh Incorporate the active_maps into the cyclic_refresh to better target active areas, and reduce some compuations. 1) set the active_map first, followed by the cyclic_refresh. 2) set the percent_refresh based on active_region 3) enable/disable active_maps and cyclic_refresh based on perc_active_blocks 4) only enter cyclic_refresh_update_segment() for blocks labelled as active This gives some speedup, ~2-3% observed in offline test, is not bitexact, but makes the refresh more targeted to active areas. Change-Id: I79b7d13dd5370e57f0efbf6c51a4a50d2705b232 --- av1/encoder/aq_cyclicrefresh.c | 44 ++++++++++++++++++++++++++------- av1/encoder/encodeframe_utils.c | 2 ++ av1/encoder/encoder.c | 6 +---- av1/encoder/encoder_utils.c | 15 ++++------- 4 files changed, 43 insertions(+), 24 deletions(-) diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c index d2fc0f7bb7..11b6ea629b 100644 --- a/av1/encoder/aq_cyclicrefresh.c +++ b/av1/encoder/aq_cyclicrefresh.c @@ -15,6 +15,7 @@ #include "av1/common/pred_common.h" #include "av1/common/seg_common.h" #include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/encoder_utils.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" @@ -295,6 +296,7 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { const CommonModeInfoParams *const mi_params = &cm->mi_params; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->enc_seg.map; + unsigned char *const active_map_4x4 = cpi->active_map.map; int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; uint64_t sb_sad = 0; @@ -302,7 +304,12 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { uint64_t thresh_sad = INT64_MAX; const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols; const int mi_stride = mi_cols; - memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols); + // Don't set seg_map to 0 if active_maps is enabled. Active_maps will set + // seg_map to either 7 or 0 (AM_SEGMENT_ID_INACTIVE/ACTIVE), and cyclic + // refresh set below (segment 1 or 2) will only be set for ACTIVE blocks. + if (!cpi->active_map.enabled) { + memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols); + } sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; sbs_in_frame = sb_cols * sb_rows; @@ -357,7 +364,10 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { // for possible boost/refresh (segment 1). The segment id may get // reset to 0 later if block gets coded anything other than low motion. // If the block_sad (sb_sad) is very low label it for refresh anyway. - if (cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) { + // If active_maps is enabled, only allow for setting on ACTIVE blocks. + if ((cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) && + (!cpi->active_map.enabled || + active_map_4x4[bl_index2] == AM_SEGMENT_ID_ACTIVE)) { sum_map += 4; } else if (cr->map[bl_index2] < 0) { cr->map[bl_index2]++; @@ -380,7 +390,8 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { cr->sb_index = i; if (cr->target_num_seg_blocks == 0) { // Disable segmentation, seg_map is already set to 0 above. 
- av1_disable_segmentation(&cm->seg); + // Don't disable if active_map is being used. + if (!cpi->active_map.enabled) av1_disable_segmentation(&cm->seg); } } @@ -448,6 +459,15 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { else cr->percent_refresh = 10 + cr->percent_refresh_adjustment; + if (cpi->active_map.enabled) { + // Scale down the percent_refresh to target the active blocks only. + cr->percent_refresh = + cr->percent_refresh * (100 - cpi->rc.percent_blocks_inactive) / 100; + if (cr->percent_refresh == 0) { + cr->apply_cyclic_refresh = 0; + } + } + cr->max_qdelta_perc = 60; cr->time_for_refresh = 0; cr->use_block_sad_scene_det = @@ -541,10 +561,14 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { if (resolution_change) av1_cyclic_refresh_reset_resize(cpi); if (!cr->apply_cyclic_refresh) { - // Set segmentation map to 0 and disable. - unsigned char *const seg_map = cpi->enc_seg.map; - memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); - av1_disable_segmentation(&cm->seg); + // Don't disable and set seg_map to 0 if active_maps is enabled, unless + // whole frame is set as inactive (since we only apply cyclic_refresh to + // active blocks). + if (!cpi->active_map.enabled || cpi->rc.percent_blocks_inactive == 100) { + unsigned char *const seg_map = cpi->enc_seg.map; + memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_disable_segmentation(&cm->seg); + } if (frame_is_intra_only(cm) || scene_change_detected || cpi->ppi->rtc_ref.bias_recovery_frame) { cr->sb_index = 0; @@ -572,9 +596,11 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { cr->thresh_rate_sb = INT64_MAX; } // Set up segmentation. - // Clear down the segment map. av1_enable_segmentation(&cm->seg); - av1_clearall_segfeatures(seg); + if (!cpi->active_map.enabled) { + // Clear down the segment map, only if active_maps is not enabled. + av1_clearall_segfeatures(seg); + } // Note: setting temporal_update has no effect, as the seg-map coding method // (temporal or spatial) is determined in diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c index 947434c7e7..a8e4a88396 100644 --- a/av1/encoder/encodeframe_utils.c +++ b/av1/encoder/encodeframe_utils.c @@ -15,6 +15,7 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encoder_utils.h" #include "av1/encoder/rdopt.h" void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, @@ -306,6 +307,7 @@ void av1_update_state(const AV1_COMP *const cpi, ThreadData *td, // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. 
if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + mi_addr->segment_id != AM_SEGMENT_ID_INACTIVE && !cpi->rc.rtc_external_ratectrl) { av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize, ctx->rd_stats.rate, ctx->rd_stats.dist, diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 411eb7000b..ed5f92b528 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -2653,12 +2653,8 @@ static int encode_without_recode(AV1_COMP *cpi) { av1_setup_frame(cpi); } } - - if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) { - suppress_active_map(cpi); - av1_cyclic_refresh_setup(cpi); - } av1_apply_active_map(cpi); + if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_setup(cpi); if (cm->seg.enabled) { if (!cm->seg.update_data && cm->prev_frame) { segfeatures_copy(&cm->seg, &cm->prev_frame->seg); diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c index 5bf9f20e53..0736e57655 100644 --- a/av1/encoder/encoder_utils.c +++ b/av1/encoder/encoder_utils.c @@ -421,11 +421,13 @@ void av1_apply_active_map(AV1_COMP *cpi) { struct segmentation *const seg = &cpi->common.seg; unsigned char *const seg_map = cpi->enc_seg.map; const unsigned char *const active_map = cpi->active_map.map; - int i; assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); - if (frame_is_intra_only(&cpi->common)) { + // Disable the active_maps on intra_only frames or if the + // input map for the current frame has no inactive blocks. + if (frame_is_intra_only(&cpi->common) || + cpi->rc.percent_blocks_inactive == 0) { cpi->active_map.enabled = 0; cpi->active_map.update = 1; } @@ -434,14 +436,7 @@ void av1_apply_active_map(AV1_COMP *cpi) { if (cpi->active_map.enabled) { const int num_mis = cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; - for (i = 0; i < num_mis; ++i) { - // In active region: only unset segmentation map if cyclic refresh is - // not set. - if (active_map[i] == AM_SEGMENT_ID_INACTIVE || - (seg_map[i] != CR_SEGMENT_ID_BOOST1 && - seg_map[i] != CR_SEGMENT_ID_BOOST2)) - seg_map[i] = active_map[i]; - } + memcpy(seg_map, active_map, sizeof(active_map[0]) * num_mis); av1_enable_segmentation(seg); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); -- GitLab From a8b3d10754b43c1714212d703c3c3d2eb64b464e Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 8 Mar 2024 16:14:25 +0000 Subject: [PATCH 011/391] Add SVE2 impl of HBD dist_wtd_convolve_y for 8-tap filters Add SVE2 implementation of av1_highbd_dist_wtd_convolve_y for 8-tap filters, as well as the corresponding tests. The helper functions used to shuffle the input data are shared with the regular vertical convolutions, so move them to a separate header file. 
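For reference, the data reuse in these vertical kernels comes from
tbl-based merges of transposed 4-row blocks. A scalar model of what one
merge computes (an editorial sketch for illustration only, not code from
this patch; the helper name is made up):

  #include <stdint.h>

  // Model of the kDotProdMergeBlockTbl lookups: within each group of four
  // lanes, shift the previous transposed block left by k rows and append
  // the k newest rows from the incoming block. E.g. merging rows 3-6 with
  // rows 7-10 at k == 1 yields rows 4-7.
  static void merge_block_model(const int16_t prev[8], const int16_t next[8],
                                int k, int16_t out[8]) {
    for (int half = 0; half < 2; ++half) {
      for (int i = 0; i < 4; ++i) {
        const int src = i + k;
        out[half * 4 + i] = src < 4 ? prev[half * 4 + src]
                                    : next[half * 4 + (src - 4)];
      }
    }
  }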
Change-Id: Iba06f88802cb302209c506ffae4f68b690dbb3fa --- .../arm/highbd_compound_convolve_sve2.c | 249 ++++++++++++++++++ av1/common/arm/highbd_convolve_sve2.c | 80 +----- av1/common/arm/highbd_convolve_sve2.h | 97 +++++++ av1/common/av1_rtcd_defs.pl | 2 +- test/av1_convolve_test.cc | 6 + 5 files changed, 354 insertions(+), 80 deletions(-) create mode 100644 av1/common/arm/highbd_convolve_sve2.h diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c index dc983c5f8b..f500110f16 100644 --- a/av1/common/arm/highbd_compound_convolve_sve2.c +++ b/av1/common/arm/highbd_compound_convolve_sve2.c @@ -25,6 +25,7 @@ #include "av1/common/filter.h" #include "av1/common/arm/highbd_compound_convolve_neon.h" #include "av1/common/arm/highbd_convolve_neon.h" +#include "av1/common/arm/highbd_convolve_sve2.h" DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, @@ -436,3 +437,251 @@ void av1_highbd_dist_wtd_convolve_x_sve2( } } } + +static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + int64x2_t offset, + int32x4_t shift) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + return vqmovun_s32(sum0123); +} + +static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + int64x2_t offset, + int32x4_t shift) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + sum4567 = vshlq_s32(sum4567, shift); + + return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); +} + +static INLINE void highbd_dist_wtd_convolve_y_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int64x2_t offset_s64 = vdupq_n_s64(offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
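+  // For example, with 128-bit SVE vectors svcnth() == 8, so correction0 is
+  // { 0, 0, 0, 8, 0, 0, 0, 8 }: the table indices that must select the
+  // newest rows are pushed past the first source vector, making tbl2 read
+  // them from the second source vector instead.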
+ uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + int16_t *s = (int16_t *)src; + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = + highbd_convolve8_4_y(s0123, s4567, y_filter, offset_s64, shift); + uint16x4_t d1 = + highbd_convolve8_4_y(s1234, s5678, y_filter, offset_s64, shift); + uint16x4_t d2 = + highbd_convolve8_4_y(s2345, s6789, y_filter, offset_s64, shift); + uint16x4_t d3 = + highbd_convolve8_4_y(s3456, s789A, y_filter, offset_s64, shift); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = + highbd_convolve8_8_y(s0123, s4567, y_filter, offset_s64, shift); + uint16x8_t d1 = + highbd_convolve8_8_y(s1234, s5678, y_filter, offset_s64, shift); + uint16x8_t d2 = + highbd_convolve8_8_y(s2345, s6789, y_filter, offset_s64, shift); + uint16x8_t d3 = + highbd_convolve8_8_y(s3456, s789A, y_filter, offset_s64, shift); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +void av1_highbd_dist_wtd_convolve_y_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (y_filter_taps != 8) { + av1_highbd_dist_wtd_convolve_y_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, + conv_params, bd); + return; + } + + int dst16_stride = conv_params->dst_stride; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_y->taps / 2 - 1; + assert(FILTER_BITS == COMPOUND_ROUND1_BITS); + const int round_offset_conv = (1 << (conv_params->round_0 - 1)) + + (1 << (bd + FILTER_BITS)) + + (1 << (bd + FILTER_BITS - 1)); + + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + src -= vert_offset * src_stride; + + if (conv_params->do_average) { + highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, im_stride, + w, h, y_filter_ptr, conv_params, + round_offset_conv); + if (conv_params->use_dist_wtd_comp_avg) { + if (bd == 12) { + highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params); + } else { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, bd); + } + } else { + if (bd == 12) { + highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params); + + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, bd); + } + } + } else { + highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, dst16_stride, + w, h, y_filter_ptr, conv_params, + round_offset_conv); + } +} diff --git a/av1/common/arm/highbd_convolve_sve2.c b/av1/common/arm/highbd_convolve_sve2.c index 1cb1086b5a..82eb12fcea 100644 --- a/av1/common/arm/highbd_convolve_sve2.c +++ b/av1/common/arm/highbd_convolve_sve2.c @@ -22,6 +22,7 @@ #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" +#include "av1/common/arm/highbd_convolve_sve2.h" DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { 0, 
1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, @@ -398,85 +399,6 @@ void av1_highbd_convolve_x_sr_sve2(const uint16_t *src, int src_stride, x_filter_ptr, conv_params, bd); } -// clang-format off -DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { - // Shift left and insert new last column in transposed 4x4 block. - 1, 2, 3, 0, 5, 6, 7, 4, - // Shift left and insert two new columns in transposed 4x4 block. - 2, 3, 0, 1, 6, 7, 4, 5, - // Shift left and insert three new columns in transposed 4x4 block. - 3, 0, 1, 2, 7, 4, 5, 6, -}; -// clang-format on - -static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, - int16x4_t s2, int16x4_t s3, - int16x8_t res[2]) { - // Transpose 16-bit elements and concatenate result rows as follows: - // s0: 00, 01, 02, 03 - // s1: 10, 11, 12, 13 - // s2: 20, 21, 22, 23 - // s3: 30, 31, 32, 33 - // - // res[0]: 00 10 20 30 01 11 21 31 - // res[1]: 02 12 22 32 03 13 23 33 - - int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); - int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); - int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); - int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); - - int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); - int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); - - int32x4x2_t s0123 = vzipq_s32(s01, s23); - - res[0] = vreinterpretq_s16_s32(s0123.val[0]); - res[1] = vreinterpretq_s16_s32(s0123.val[1]); -} - -static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, - int16x8_t s2, int16x8_t s3, - int16x8_t res[4]) { - // Transpose 16-bit elements and concatenate result rows as follows: - // s0: 00, 01, 02, 03, 04, 05, 06, 07 - // s1: 10, 11, 12, 13, 14, 15, 16, 17 - // s2: 20, 21, 22, 23, 24, 25, 26, 27 - // s3: 30, 31, 32, 33, 34, 35, 36, 37 - // - // res[0]: 00 10 20 30 01 11 21 31 - // res[1]: 02 12 22 32 03 13 23 33 - // res[2]: 04 14 24 34 05 15 25 35 - // res[3]: 06 16 26 36 07 17 27 37 - - int16x8x2_t tr01_16 = vzipq_s16(s0, s1); - int16x8x2_t tr23_16 = vzipq_s16(s2, s3); - - int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]), - vreinterpretq_s32_s16(tr23_16.val[0])); - int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]), - vreinterpretq_s32_s16(tr23_16.val[1])); - - res[0] = vreinterpretq_s16_s32(tr01_32.val[0]); - res[1] = vreinterpretq_s16_s32(tr01_32.val[1]); - res[2] = vreinterpretq_s16_s32(tr23_32.val[0]); - res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); -} - -static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], - uint16x8_t tbl, int16x8_t res[4]) { - res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); - res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); - res[2] = aom_tbl2_s16(t0[2], t1[2], tbl); - res[3] = aom_tbl2_s16(t0[3], t1[3], tbl); -} - -static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], - uint16x8_t tbl, int16x8_t res[2]) { - res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); - res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); -} - static INLINE uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7, diff --git a/av1/common/arm/highbd_convolve_sve2.h b/av1/common/arm/highbd_convolve_sve2.h new file mode 100644 index 0000000000..05e23deef4 --- /dev/null +++ b/av1/common/arm/highbd_convolve_sve2.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_SVE2_H_ +#define AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_SVE2_H_ + +#include <arm_neon.h> + +#include "aom_dsp/arm/aom_neon_sve2_bridge.h" + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 0, 5, 6, 7, 4, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 0, 1, 6, 7, 4, 5, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 0, 1, 2, 7, 4, 5, 6, +}; +// clang-format on + +static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, + int16x4_t s2, int16x4_t s3, + int16x8_t res[2]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03 + // s1: 10, 11, 12, 13 + // s2: 20, 21, 22, 23 + // s3: 30, 31, 32, 33 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); + + int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); + int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); + + int32x4x2_t s0123 = vzipq_s32(s01, s23); + + res[0] = vreinterpretq_s16_s32(s0123.val[0]); + res[1] = vreinterpretq_s16_s32(s0123.val[1]); +} + +static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t s3, + int16x8_t res[4]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03, 04, 05, 06, 07 + // s1: 10, 11, 12, 13, 14, 15, 16, 17 + // s2: 20, 21, 22, 23, 24, 25, 26, 27 + // s3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + // res[2]: 04 14 24 34 05 15 25 35 + // res[3]: 06 16 26 36 07 17 27 37 + + int16x8x2_t tr01_16 = vzipq_s16(s0, s1); + int16x8x2_t tr23_16 = vzipq_s16(s2, s3); + int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]), + vreinterpretq_s32_s16(tr23_16.val[0])); + int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]), + vreinterpretq_s32_s16(tr23_16.val[1])); + + res[0] = vreinterpretq_s16_s32(tr01_32.val[0]); + res[1] = vreinterpretq_s16_s32(tr01_32.val[1]); + res[2] = vreinterpretq_s16_s32(tr23_32.val[0]); + res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); +} + +static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], + uint16x8_t tbl, int16x8_t res[4]) { + res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); + res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); + res[2] = aom_tbl2_s16(t0[2], t1[2], tbl); + res[3] = aom_tbl2_s16(t0[3], t1[3], tbl); +} + +static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], + uint16x8_t tbl, int16x8_t res[2]) { + res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); + res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); +} + +#endif // AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_SVE2_H_ diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 79107c6fb0..9113e44577 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -608,7 +608,7 @@ 
if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon/; specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2 neon sve2/; - specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 neon/; + specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 neon sve2/; specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2 neon/; specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2 neon sve2/; specialize qw/av1_highbd_convolve_2d_sr_intrabc neon/; diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 12997dbfed..40a71667c8 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -2044,6 +2044,12 @@ INSTANTIATE_TEST_SUITE_P( BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P( + SVE2, AV1ConvolveYHighbdCompoundTest, + BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_sve2)); +#endif + #endif // CONFIG_AV1_HIGHBITDEPTH ////////////////////////////////////////////////////// -- GitLab From 5da47ec716e9c320afab896b6f41bfd94da7914c Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Mon, 11 Mar 2024 14:10:18 +0000 Subject: [PATCH 012/391] Specialize 8-tap HBD dist_wtd_convolve_y_sve2 on bitdepth The rounding value at the end of the convolution depends on the bitdepth (8/10, or 12). Add 2 specialized versions of the function, so that we know the rounding value at compile time and therefore use only one instruction instead of two to perform the final rounding and narrowing step. This gives up to 10% uplift over the non-specialized version. Change-Id: Id30c3ba63567ff92ff0a79aaf94c091e0d807f02 --- .../arm/highbd_compound_convolve_sve2.c | 261 ++++++++++++++---- 1 file changed, 212 insertions(+), 49 deletions(-) diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c index f500110f16..d9ea83d551 100644 --- a/av1/common/arm/highbd_compound_convolve_sve2.c +++ b/av1/common/arm/highbd_compound_convolve_sve2.c @@ -438,11 +438,186 @@ void av1_highbd_dist_wtd_convolve_x_sve2( } } +static INLINE uint16x4_t highbd_12_convolve8_4_y(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2); +} + +static INLINE uint16x8_t highbd_12_convolve8_8_y(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = 
vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS + 2), + vqrshrun_n_s32(sum4567, ROUND0_BITS + 2)); +} + +static INLINE void highbd_12_dist_wtd_convolve_y_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr) { + const int64x2_t offset = + vdupq_n_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + int16_t *s = (int16_t *)src; + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = highbd_12_convolve8_4_y(s0123, s4567, y_filter, offset); + uint16x4_t d1 = highbd_12_convolve8_4_y(s1234, s5678, y_filter, offset); + uint16x4_t d2 = highbd_12_convolve8_4_y(s2345, s6789, y_filter, offset); + uint16x4_t d3 = highbd_12_convolve8_4_y(s3456, s789A, y_filter, offset); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. 
+ int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = highbd_12_convolve8_8_y(s0123, s4567, y_filter, offset); + uint16x8_t d1 = highbd_12_convolve8_8_y(s1234, s5678, y_filter, offset); + uint16x8_t d2 = highbd_12_convolve8_8_y(s2345, s6789, y_filter, offset); + uint16x8_t d3 = highbd_12_convolve8_8_y(s3456, s789A, y_filter, offset); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, - int64x2_t offset, - int32x4_t shift) { + int64x2_t offset) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); @@ -450,16 +625,14 @@ static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); - sum0123 = vshlq_s32(sum0123, shift); - return vqmovun_s32(sum0123); + return vqrshrun_n_s32(sum0123, ROUND0_BITS); } static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, - int64x2_t offset, - int32x4_t shift) { + int64x2_t offset) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); @@ -473,21 +646,18 @@ static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); - sum0123 = vshlq_s32(sum0123, shift); - int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); - sum4567 = vshlq_s32(sum4567, shift); - return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); + return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS), + vqrshrun_n_s32(sum4567, ROUND0_BITS)); } static INLINE void highbd_dist_wtd_convolve_y_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, - int width, int height, const int16_t *y_filter_ptr, - ConvolveParams *conv_params, int offset) { + int width, int height, const int16_t *y_filter_ptr, const int bd) { + const int64x2_t offset = + 
vdupq_n_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - const int64x2_t offset_s64 = vdupq_n_s64(offset); - const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); // Scale indices by size of the true vector length to avoid reading from an @@ -530,14 +700,10 @@ static INLINE void highbd_dist_wtd_convolve_y_8tap_sve2( aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); - uint16x4_t d0 = - highbd_convolve8_4_y(s0123, s4567, y_filter, offset_s64, shift); - uint16x4_t d1 = - highbd_convolve8_4_y(s1234, s5678, y_filter, offset_s64, shift); - uint16x4_t d2 = - highbd_convolve8_4_y(s2345, s6789, y_filter, offset_s64, shift); - uint16x4_t d3 = - highbd_convolve8_4_y(s3456, s789A, y_filter, offset_s64, shift); + uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, offset); + uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, offset); + uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, offset); + uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, offset); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); @@ -587,14 +753,10 @@ static INLINE void highbd_dist_wtd_convolve_y_8tap_sve2( aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); - uint16x8_t d0 = - highbd_convolve8_8_y(s0123, s4567, y_filter, offset_s64, shift); - uint16x8_t d1 = - highbd_convolve8_8_y(s1234, s5678, y_filter, offset_s64, shift); - uint16x8_t d2 = - highbd_convolve8_8_y(s2345, s6789, y_filter, offset_s64, shift); - uint16x8_t d3 = - highbd_convolve8_8_y(s3456, s789A, y_filter, offset_s64, shift); + uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, offset); + uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, offset); + uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, offset); + uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, offset); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -648,40 +810,41 @@ void av1_highbd_dist_wtd_convolve_y_sve2( const int im_stride = MAX_SB_SIZE; const int vert_offset = filter_params_y->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); - const int round_offset_conv = (1 << (conv_params->round_0 - 1)) + - (1 << (bd + FILTER_BITS)) + - (1 << (bd + FILTER_BITS - 1)); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); src -= vert_offset * src_stride; - if (conv_params->do_average) { - highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, im_stride, - w, h, y_filter_ptr, conv_params, - round_offset_conv); - if (conv_params->use_dist_wtd_comp_avg) { - if (bd == 12) { + if (bd == 12) { + if (conv_params->do_average) { + highbd_12_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, + im_stride, w, h, y_filter_ptr); + if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } else { - highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, - h, conv_params, bd); - } - } else { - if (bd == 12) { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); - + } + } else { + highbd_12_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, + dst16_stride, w, h, y_filter_ptr); + } + } else { + if (conv_params->do_average) { + 
highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, im_stride, + w, h, y_filter_ptr, bd); + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, bd); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } + } else { + highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, dst16_stride, + w, h, y_filter_ptr, bd); } - } else { - highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, dst16_stride, - w, h, y_filter_ptr, conv_params, - round_offset_conv); } } -- GitLab From 80123cb35215ab8a775d75ac4817090803a92d02 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Tue, 19 Mar 2024 11:22:47 +0000 Subject: [PATCH 013/391] Add 4-tap filter path for Neon HBD vert compound convolution Add 4-tap filter specialization path for av1_highbd_dist_wtd_convolve_y_neon, for 12 and 8/10 bitdepth. This gives up to 30% uplift over using the 6-tap path. Change-Id: Ic7fc8bc12c184a94d41c799b1d54e1c2befffcea --- .../arm/highbd_compound_convolve_neon.c | 230 +++++++++++++++++- 1 file changed, 226 insertions(+), 4 deletions(-) diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c index c93a1d4e28..9247ded6bf 100644 --- a/av1/common/arm/highbd_compound_convolve_neon.c +++ b/av1/common/arm/highbd_compound_convolve_neon.c @@ -711,6 +711,212 @@ static INLINE void highbd_dist_wtd_convolve_y_6tap_neon( } } +static INLINE uint16x4_t highbd_12_convolve4_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); + sum = vmlal_lane_s16(sum, s1, filter, 1); + sum = vmlal_lane_s16(sum, s2, filter, 2); + sum = vmlal_lane_s16(sum, s3, filter, 3); + + return vqshrun_n_s32(sum, ROUND0_BITS + 2); +} + +static INLINE uint16x8_t highbd_12_convolve4_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), + vqshrun_n_s32(sum1, ROUND0_BITS + 2)); +} + +static INLINE void highbd_12_dist_wtd_convolve_y_4tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = + highbd_12_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec); + uint16x4_t d1 = + highbd_12_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec); + uint16x4_t d2 = + highbd_12_convolve4_4(s2, s3, s4, s5, 
y_filter, offset_vec); + uint16x4_t d3 = + highbd_12_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = + highbd_12_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec); + uint16x8_t d1 = + highbd_12_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec); + uint16x8_t d2 = + highbd_12_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d3 = + highbd_12_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); + sum = vmlal_lane_s16(sum, s1, filter, 1); + sum = vmlal_lane_s16(sum, s2, filter, 2); + sum = vmlal_lane_s16(sum, s3, filter, 3); + + return vqshrun_n_s32(sum, ROUND0_BITS); +} + +static INLINE uint16x8_t highbd_convolve4_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS), + vqshrun_n_s32(sum1, ROUND0_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_y_4tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec); + uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec); + uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, 
&s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = + highbd_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec); + uint16x8_t d2 = + highbd_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int offset) { @@ -899,7 +1105,11 @@ void av1_highbd_dist_wtd_convolve_y_neon( if (bd == 12) { if (conv_params->do_average) { - if (y_filter_taps <= 6) { + if (y_filter_taps <= 4) { + highbd_12_dist_wtd_convolve_y_4tap_neon( + src + 2 * src_stride, src_stride, im_block, im_stride, w, h, + y_filter_ptr, round_offset_conv); + } else if (y_filter_taps == 6) { highbd_12_dist_wtd_convolve_y_6tap_neon( src + src_stride, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); @@ -916,7 +1126,11 @@ void av1_highbd_dist_wtd_convolve_y_neon( conv_params); } } else { - if (y_filter_taps <= 6) { + if (y_filter_taps <= 4) { + highbd_12_dist_wtd_convolve_y_4tap_neon( + src + 2 * src_stride, src_stride, dst16, dst16_stride, w, h, + y_filter_ptr, round_offset_conv); + } else if (y_filter_taps == 6) { highbd_12_dist_wtd_convolve_y_6tap_neon( src + src_stride, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv); @@ -928,7 +1142,11 @@ void av1_highbd_dist_wtd_convolve_y_neon( } } else { if (conv_params->do_average) { - if (y_filter_taps <= 6) { + if (y_filter_taps <= 4) { + highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride, + im_block, im_stride, w, h, + y_filter_ptr, round_offset_conv); + } else if (y_filter_taps == 6) { highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); @@ -945,7 +1163,11 @@ void av1_highbd_dist_wtd_convolve_y_neon( conv_params, bd); } } else { - if (y_filter_taps <= 6) { + if (y_filter_taps <= 4) { + highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride, + dst16, dst16_stride, w, h, + y_filter_ptr, round_offset_conv); + } else if (y_filter_taps == 6) { highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv); -- GitLab From eefd5585a0c4c204fcf7d30065f8c2ca35c38a82 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Mon, 18 Mar 2024 15:03:01 +0000 Subject: [PATCH 014/391] Add SVE2 impl of HBD dist_wtd_convolve_2d for 8-tap filters Add SVE2 implementation of av1_highbd_dist_wtd_convolve_2d for 8-tap filters, as well as the corresponding tests. This gives up to 25% uplift over the Neon implementation. 
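As background for the offsets chosen below, a scalar model of the compound
2D rounding pipeline (an editorial sketch for illustration only, not code
from this patch; assumes libaom's FILTER_BITS == 7 and
COMPOUND_ROUND1_BITS == 7, and omits the final saturation to uint16):

  #include <stdint.h>

  #define FILTER_BITS 7           // libaom interpolation filter precision.
  #define COMPOUND_ROUND1_BITS 7  // Second-pass rounding for compound.

  // Computes one intermediate compound value (what lands in the
  // conv_params->dst buffer) from an 8x8 window of source pixels. The
  // horizontal pass pre-adds half an ULP so a plain shift rounds correctly;
  // both offsets keep intermediates non-negative so they fit in unsigned
  // 16-bit storage between the passes.
  static int32_t compound_2d_model(const uint16_t s[8][8], const int16_t fx[8],
                                   const int16_t fy[8], int bd, int round_0) {
    int32_t im[8];
    const int32_t offset_x =
        (1 << (bd + FILTER_BITS - 1)) + (1 << (round_0 - 1));
    for (int r = 0; r < 8; ++r) {
      int32_t sum = offset_x;
      for (int k = 0; k < 8; ++k) sum += fx[k] * s[r][k];
      im[r] = sum >> round_0;  // Non-rounding shift; offset_x already rounds.
    }
    const int y_offset_bits = bd + 2 * FILTER_BITS - round_0;
    int32_t sum = 1 << y_offset_bits;
    for (int k = 0; k < 8; ++k) sum += fy[k] * im[k];
    // Rounded shift, equivalent to ROUND_POWER_OF_TWO(sum, ROUND1).
    return (sum + (1 << (COMPOUND_ROUND1_BITS - 1))) >> COMPOUND_ROUND1_BITS;
  }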
Change-Id: Ie7d8619e0a924b2da2fc04af16a707caaf8759c7 --- aom_dsp/arm/mem_neon.h | 10 + .../arm/highbd_compound_convolve_sve2.c | 356 ++++++++++++++++++ av1/common/av1_rtcd_defs.pl | 2 +- test/av1_convolve_test.cc | 6 + 4 files changed, 373 insertions(+), 1 deletion(-) diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index a4e9eb2e72..d749f1def1 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h @@ -556,6 +556,16 @@ static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride, vst1q_u16(s, s1); } +static INLINE void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); +} + static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride, const uint16x8_t s0, const uint16x8_t s1, const uint16x8_t s2, const uint16x8_t s3) { diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c index d9ea83d551..dd55e346af 100644 --- a/av1/common/arm/highbd_compound_convolve_sve2.c +++ b/av1/common/arm/highbd_compound_convolve_sve2.c @@ -848,3 +848,359 @@ void av1_highbd_dist_wtd_convolve_y_sve2( } } } + +static INLINE uint16x8_t highbd_convolve8_8_2d_h(int16x8_t s0[8], + int16x8_t filter, + int64x2_t offset, + int32x4_t shift) { + int64x2_t sum[8]; + + sum[0] = aom_sdotq_s16(offset, s0[0], filter); + sum[1] = aom_sdotq_s16(offset, s0[1], filter); + sum[2] = aom_sdotq_s16(offset, s0[2], filter); + sum[3] = aom_sdotq_s16(offset, s0[3], filter); + sum[4] = aom_sdotq_s16(offset, s0[4], filter); + sum[5] = aom_sdotq_s16(offset, s0[5], filter); + sum[6] = aom_sdotq_s16(offset, s0[6], filter); + sum[7] = aom_sdotq_s16(offset, s0[7], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + sum[4] = vpaddq_s64(sum[4], sum[5]); + sum[6] = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + + sum0123 = vshlq_s32(sum0123, shift); + sum4567 = vshlq_s32(sum4567, shift); + + return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); +} + +static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, + ConvolveParams *conv_params, const int offset) { + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + + const int64x2_t offset_lo = vcombine_s64(vcreate_s64(offset), vdup_n_s64(0)); + const int16x8_t filter = vld1q_s16(x_filter_ptr); + + // We are only doing 8-tap vertical convolutions, therefore we know the + // intermediate height will be h + 7, so we can do the loop across the whole + // block 4 rows at a time and then process the last 3 rows separately. This + // will remain true when 4-tap specialisation is added. 
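+  // Since h is always a multiple of 4 for these block sizes, the main loop
+  // below exits with exactly three rows remaining.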
+ + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8_2d_h(s0, filter, offset_lo, shift); + uint16x8_t d1 = highbd_convolve8_8_2d_h(s1, filter, offset_lo, shift); + uint16x8_t d2 = highbd_convolve8_8_2d_h(s2, filter, offset_lo, shift); + uint16x8_t d3 = highbd_convolve8_8_2d_h(s3, filter, offset_lo, shift); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. + const int16_t *s = (const int16_t *)src; + do { + int16x8_t s0[8], s1[8], s2[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], + &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], + &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], + &s2[5], &s2[6], &s2[7]); + + uint16x8_t d0 = highbd_convolve8_8_2d_h(s0, filter, offset_lo, shift); + uint16x8_t d1 = highbd_convolve8_8_2d_h(s1, filter, offset_lo, shift); + uint16x8_t d2 = highbd_convolve8_8_2d_h(s2, filter, offset_lo, shift); + + store_u16_8x3(dst, dst_stride, d0, d1, d2); + s += 8; + dst += 8; + width -= 8; + } while (width != 0); +} + +static INLINE uint16x4_t highbd_convolve8_4_2d_v(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + return vqrshrun_n_s32(sum0123, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve8_8_2d_v(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + return vcombine_u16(vqrshrun_n_s32(sum0123, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum4567, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, int offset) { + const int16x8_t 
y_filter = vld1q_s16(y_filter_ptr); + const int64x2_t offset_s64 = vdupq_n_s64(offset); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + int16_t *s = (int16_t *)src; + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = + highbd_convolve8_4_2d_v(s0123, s4567, y_filter, offset_s64); + uint16x4_t d1 = + highbd_convolve8_4_2d_v(s1234, s5678, y_filter, offset_s64); + uint16x4_t d2 = + highbd_convolve8_4_2d_v(s2345, s6789, y_filter, offset_s64); + uint16x4_t d3 = + highbd_convolve8_4_2d_v(s3456, s789A, y_filter, offset_s64); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = + highbd_convolve8_8_2d_v(s0123, s4567, y_filter, offset_s64); + uint16x8_t d1 = + highbd_convolve8_8_2d_v(s1234, s5678, y_filter, offset_s64); + uint16x8_t d2 = + highbd_convolve8_8_2d_v(s2345, s6789, y_filter, offset_s64); + uint16x8_t d3 = + highbd_convolve8_8_2d_v(s3456, s789A, y_filter, offset_s64); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +void av1_highbd_dist_wtd_convolve_2d_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, + im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (x_filter_taps != 8 || y_filter_taps != 8) { + av1_highbd_dist_wtd_convolve_2d_neon( + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); + return; + } + + const int im_h = h + y_filter_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = y_filter_taps / 2 - 1; + const int horiz_offset = x_filter_taps / 2 - 1; + // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a + // faster non-rounding non-saturating left shift. 
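+  // Pre-adding (1 << (round_0 - 1)) makes the plain shift by round_0 in the
+  // horizontal pass equivalent to a rounded shift, so no extra rounding
+  // instruction is needed there.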
+ const int round_offset_conv_x = + (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1)); + const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset_conv_y = (1 << y_offset_bits); + + const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, round_offset_conv_x); + + if (conv_params->do_average) { + highbd_dist_wtd_convolve_2d_vert_8tap_sve2(im_block, im_stride, im_block2, + im_stride, w, h, y_filter_ptr, + round_offset_conv_y); + if (conv_params->use_dist_wtd_comp_avg) { + if (bd == 12) { + highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, + w, h, conv_params); + + } else { + highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, + h, conv_params, bd); + } + } else { + if (bd == 12) { + highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, + conv_params); + + } else { + highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, + conv_params, bd); + } + } + } else { + highbd_dist_wtd_convolve_2d_vert_8tap_sve2(im_block, im_stride, dst16, + dst16_stride, w, h, y_filter_ptr, + round_offset_conv_y); + } +} diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 9113e44577..c0831330d1 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -606,7 +606,7 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/; if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { - specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon/; + specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon sve2/; specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2 neon sve2/; specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 neon sve2/; specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2 neon/; diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 40a71667c8..b2392276cc 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -2464,6 +2464,12 @@ INSTANTIATE_TEST_SUITE_P( BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P( + SVE2, AV1Convolve2DHighbdCompoundTest, + BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_sve2)); +#endif + #endif // CONFIG_AV1_HIGHBITDEPTH } // namespace -- GitLab From f5b249fcb44d608906acc86927e3eef193b3be8f Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 22 Mar 2024 16:22:37 -0700 Subject: [PATCH 015/391] x86/convolve*: make some functions static clears some -Wmissing-prototypes warnings Bug: aomedia:3416 Change-Id: I983c9a05f1f2b301786f856064e9a90e47159c07 --- av1/common/x86/convolve_2d_avx2.c | 18 ++++++++---------- av1/common/x86/convolve_2d_sse2.c | 17 ++++++++--------- av1/common/x86/convolve_sse2.c | 26 +++++++++++++------------- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c index 1b39a0a8d5..d4c1169cc3 100644 --- a/av1/common/x86/convolve_2d_avx2.c +++ b/av1/common/x86/convolve_2d_avx2.c @@ 
-21,13 +21,11 @@ #include "av1/common/convolve.h" -void av1_convolve_2d_sr_general_avx2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_qn, - const int subpel_y_qn, - ConvolveParams *conv_params) { +static void convolve_2d_sr_general_avx2( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { if (filter_params_x->taps > 8) { const int bd = 8; int im_stride = 8, i; @@ -150,9 +148,9 @@ void av1_convolve_2d_sr_avx2( const bool use_general = (tap_x == 12 || tap_y == 12); if (use_general) { - av1_convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, - subpel_x_q4, subpel_y_q4, conv_params); + convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_q4, + subpel_y_q4, conv_params); } else { av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c index 1b85f37294..68971eacc1 100644 --- a/av1/common/x86/convolve_2d_sse2.c +++ b/av1/common/x86/convolve_2d_sse2.c @@ -19,12 +19,11 @@ #include "aom_dsp/x86/convolve_common_intrin.h" #include "av1/common/convolve.h" -void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int subpel_y_qn, - ConvolveParams *conv_params) { +static void convolve_2d_sr_12tap_sse2( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; DECLARE_ALIGNED(16, int16_t, @@ -231,9 +230,9 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } else { - av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, - subpel_x_qn, subpel_y_qn, conv_params); + convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); } } else { const int bd = 8; diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c index 012e75c1ae..6383567a48 100644 --- a/av1/common/x86/convolve_sse2.c +++ b/av1/common/x86/convolve_sse2.c @@ -75,10 +75,10 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s, return convolve(ss, coeffs); } -void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_y, - int subpel_y_qn) { +static void convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + int subpel_y_qn) { const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *src_ptr = src - fo_vert * src_stride; const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); @@ -185,8 +185,8 @@ void 
av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); } else { - av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, - filter_params_y, subpel_y_qn); + convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); } } else { const int fo_vert = filter_params_y->taps / 2 - 1; @@ -337,11 +337,11 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, } } -void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - int subpel_x_qn, - ConvolveParams *conv_params) { +static void convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + int subpel_x_qn, + ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_0; @@ -402,8 +402,8 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } else { - av1_convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, - filter_params_x, subpel_x_qn, conv_params); + convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); } } else { const int fo_horiz = filter_params_x->taps / 2 - 1; -- GitLab From 87a1e9eaa6514ec8a227a263724a879826cc1cbb Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 26 Mar 2024 21:44:29 -0700 Subject: [PATCH 016/391] Fix a clang-tidy misc-include-cleaner warning no header providing "TX_4X4" is directly included Change-Id: I507b2c937eef405a7cab73cd672a074e1e29e0ca --- test/quantize_func_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc index cbcaba1c5e..61f26ea57f 100644 --- a/test/quantize_func_test.cc +++ b/test/quantize_func_test.cc @@ -19,6 +19,7 @@ #include "config/av1_rtcd.h" #include "aom/aom_codec.h" +#include "aom_dsp/txfm_common.h" #include "aom_ports/aom_timer.h" #include "av1/encoder/encoder.h" #include "av1/common/scan.h" -- GitLab From 9e9534e9c2d82931f20b1b54217f459d13bbb15e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 26 Mar 2024 21:39:08 -0700 Subject: [PATCH 017/391] Fix clang-tidy misc-include-cleaner warnings no header providing "frame_is_intra_only" is directly included no header providing "memcpy" is directly included Change-Id: I906055f8f04a26ca86e54b9d3df12c4b815de292 --- av1/encoder/encoder_utils.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c index 0736e57655..1f81a530c9 100644 --- a/av1/encoder/encoder_utils.c +++ b/av1/encoder/encoder_utils.c @@ -9,8 +9,11 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
 */
 
+#include <string.h>
+
 #include "aom/aomcx.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encoder.h"
-- 
GitLab


From 8c0cdddac6987bf9fe600d380040bc768e8d6f91 Mon Sep 17 00:00:00 2001
From: Salome Thirot <salome.thirot@arm.com>
Date: Mon, 18 Mar 2024 16:46:04 +0000
Subject: [PATCH 018/391] Add SVE2 impl of HBD dist_wtd_convolve_2d for 4-tap
 filters

Add SVE2 implementation of av1_highbd_dist_wtd_convolve_2d for 4-tap
filters. Only the horizontal pass makes use of SVE; the vertical pass
stays in Neon. This gives up to 30% uplift over the Neon
implementation.

Change-Id: Ie9b8a2dd2ca7195184029b948a57f7468e71c11c
---
 aom_dsp/arm/mem_neon.h                        |  10 +
 .../arm/highbd_compound_convolve_sve2.c       | 305 +++++++++++++++++-
 2 files changed, 298 insertions(+), 17 deletions(-)

diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index d749f1def1..32a462a186 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -537,6 +537,16 @@ static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
   vst1q_u16(s, s7);
 }
 
+static INLINE void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x4_t s0, const uint16x4_t s1,
+                                 const uint16x4_t s2) {
+  vst1_u16(s, s0);
+  s += dst_stride;
+  vst1_u16(s, s1);
+  s += dst_stride;
+  vst1_u16(s, s2);
+}
+
 static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x4_t s0, const uint16x4_t s1,
                                  const uint16x4_t s2, const uint16x4_t s3) {
diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c
index dd55e346af..baffc0edb5 100644
--- a/av1/common/arm/highbd_compound_convolve_sve2.c
+++ b/av1/common/arm/highbd_compound_convolve_sve2.c
@@ -887,10 +887,9 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
   const int64x2_t offset_lo = vcombine_s64(vcreate_s64(offset), vdup_n_s64(0));
   const int16x8_t filter = vld1q_s16(x_filter_ptr);
 
-  // We are only doing 8-tap vertical convolutions, therefore we know the
-  // intermediate height will be h + 7, so we can do the loop across the whole
-  // block 4 rows at a time and then process the last 3 rows separately. This
-  // will remain true when 4-tap specialisation is added.
+  // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know
+  // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at
+  // a time and then process the last 3 rows separately.
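+  // (im_h = h + filter_taps - 1; block heights are multiples of 4 and the
+  // vertical filter has 4 or 8 taps, hence the remainder of 3.)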
do { const int16_t *s = (const int16_t *)src; @@ -946,6 +945,153 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2( } while (width != 0); } +static INLINE uint16x4_t highbd_convolve4_4_2d_h(int16x8_t s0, int16x8_t filter, + int64x2_t offset, + int32x4_t shift, + uint16x8x2_t permute_tbl) { + int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); + + int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + return vqmovun_s32(sum0123); +} + +static INLINE uint16x8_t highbd_convolve4_8_2d_h(int16x8_t s0[4], + int16x8_t filter, + int64x2_t offset, + int32x4_t shift, + uint16x8_t tbl) { + int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + + int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + sum0415 = vshlq_s32(sum0415, shift); + + int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + sum2637 = vshlq_s32(sum2637, shift); + + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0415), vqmovun_s32(sum2637)); + return aom_tbl_u16(res, tbl); +} + +static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, + ConvolveParams *conv_params, const int offset) { + const int64x2_t offset_vec = vdupq_n_s64(offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know + // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at + // a time and then process the last 3 rows separately. + + if (width == 4) { + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + highbd_convolve4_4_2d_h(s0, filter, offset_vec, shift, permute_tbl); + uint16x4_t d1 = + highbd_convolve4_4_2d_h(s1, filter, offset_vec, shift, permute_tbl); + uint16x4_t d2 = + highbd_convolve4_4_2d_h(s2, filter, offset_vec, shift, permute_tbl); + uint16x4_t d3 = + highbd_convolve4_4_2d_h(s3, filter, offset_vec, shift, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. 
+ int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + + uint16x4_t d0 = + highbd_convolve4_4_2d_h(s0, filter, offset_vec, shift, permute_tbl); + uint16x4_t d1 = + highbd_convolve4_4_2d_h(s1, filter, offset_vec, shift, permute_tbl); + uint16x4_t d2 = + highbd_convolve4_4_2d_h(s2, filter, offset_vec, shift, permute_tbl); + + store_u16_4x3(dst, dst_stride, d0, d1, d2); + + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = + highbd_convolve4_8_2d_h(s0, filter, offset_vec, shift, idx); + uint16x8_t d1 = + highbd_convolve4_8_2d_h(s1, filter, offset_vec, shift, idx); + uint16x8_t d2 = + highbd_convolve4_8_2d_h(s2, filter, offset_vec, shift, idx); + uint16x8_t d3 = + highbd_convolve4_8_2d_h(s3, filter, offset_vec, shift, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. + const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0[4], s1[4], s2[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + + uint16x8_t d0 = + highbd_convolve4_8_2d_h(s0, filter, offset_vec, shift, idx); + uint16x8_t d1 = + highbd_convolve4_8_2d_h(s1, filter, offset_vec, shift, idx); + uint16x8_t d2 = + highbd_convolve4_8_2d_h(s2, filter, offset_vec, shift, idx); + + store_u16_8x3(dst, dst_stride, d0, d1, d2); + + s += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + static INLINE uint16x4_t highbd_convolve8_4_2d_v(int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, @@ -1130,6 +1276,111 @@ static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_sve2( } } +static INLINE uint16x4_t highbd_convolve4_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); + sum = vmlal_lane_s16(sum, s1, filter, 1); + sum = vmlal_lane_s16(sum, s2, filter, 2); + sum = vmlal_lane_s16(sum, s3, filter, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve4_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE 
void highbd_dist_wtd_convolve_2d_vert_4tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = + highbd_convolve4_4_2d_v(s0, s1, s2, s3, y_filter, offset_vec); + uint16x4_t d1 = + highbd_convolve4_4_2d_v(s1, s2, s3, s4, y_filter, offset_vec); + uint16x4_t d2 = + highbd_convolve4_4_2d_v(s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d3 = + highbd_convolve4_4_2d_v(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = + highbd_convolve4_8_2d_v(s0, s1, s2, s3, y_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve4_8_2d_v(s1, s2, s3, s4, y_filter, offset_vec); + uint16x8_t d2 = + highbd_convolve4_8_2d_v(s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve4_8_2d_v(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + void av1_highbd_dist_wtd_convolve_2d_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -1143,20 +1394,22 @@ void av1_highbd_dist_wtd_convolve_2d_sve2( CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps; const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; - if (x_filter_taps != 8 || y_filter_taps != 8) { + if (x_filter_taps == 6 || y_filter_taps == 6) { av1_highbd_dist_wtd_convolve_2d_neon( src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); return; } - const int im_h = h + y_filter_taps - 1; + const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; - const int vert_offset = y_filter_taps / 2 - 1; - const int horiz_offset = x_filter_taps / 2 - 1; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = clamped_x_taps / 2 - 1; // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a // faster non-rounding non-saturating left shift. 
  const int round_offset_conv_x =
@@ -1171,14 +1424,26 @@ void av1_highbd_dist_wtd_convolve_2d_sve2(
   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
-  highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(src_ptr, src_stride, im_block,
-                                              im_stride, w, im_h, x_filter_ptr,
-                                              conv_params, round_offset_conv_x);
+  if (x_filter_taps <= 4) {
+    highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(
+        src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+        conv_params, round_offset_conv_x);
+  } else {
+    highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
+        src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+        conv_params, round_offset_conv_x);
+  }
 
   if (conv_params->do_average) {
-    highbd_dist_wtd_convolve_2d_vert_8tap_sve2(im_block, im_stride, im_block2,
-                                               im_stride, w, h, y_filter_ptr,
-                                               round_offset_conv_y);
+    if (y_filter_taps <= 4) {
+      highbd_dist_wtd_convolve_2d_vert_4tap_neon(im_block, im_stride, im_block2,
+                                                 im_stride, w, h, y_filter_ptr,
+                                                 round_offset_conv_y);
+    } else {
+      highbd_dist_wtd_convolve_2d_vert_8tap_sve2(im_block, im_stride, im_block2,
+                                                 im_stride, w, h, y_filter_ptr,
+                                                 round_offset_conv_y);
+    }
     if (conv_params->use_dist_wtd_comp_avg) {
       if (bd == 12) {
         highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
@@ -1199,8 +1464,14 @@ void av1_highbd_dist_wtd_convolve_2d_sve2(
       }
     }
   } else {
-    highbd_dist_wtd_convolve_2d_vert_8tap_sve2(im_block, im_stride, dst16,
-                                               dst16_stride, w, h, y_filter_ptr,
-                                               round_offset_conv_y);
+    if (y_filter_taps <= 4) {
+      highbd_dist_wtd_convolve_2d_vert_4tap_neon(
+          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+          round_offset_conv_y);
+    } else {
+      highbd_dist_wtd_convolve_2d_vert_8tap_sve2(
+          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+          round_offset_conv_y);
+    }
   }
 }
-- 
GitLab


From 636add45516729ae9d7b86c5f1b78cc9326c0c5f Mon Sep 17 00:00:00 2001
From: Salome Thirot <salome.thirot@arm.com>
Date: Mon, 18 Mar 2024 17:38:33 +0000
Subject: [PATCH 019/391] Specialise 8-tap Neon HBD 2D dist_wtd_convolve on
 bitdepth

Add a 12-bit specialised path for the 8-tap horizontal pass of
av1_highbd_dist_wtd_convolve_2d_sve2, giving up to 10% uplift for the
whole 2D convolution over the non-specialised version.
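For illustration of the constant folding this enables (FILTER_BITS is 7,
and for 12-bit input round_0 is ROUND0_BITS + 2): once the bitdepth is
fixed, the offset becomes the compile-time immediate
1 << (12 + 7 - 2) == 1 << 17, and the runtime shift vector
vdupq_n_s32(-conv_params->round_0) disappears from the horizontal
helpers, as the hunks below show.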
Change-Id: Id0ac318885c25bcd8d12c32c931d68b2ca595203 --- .../arm/highbd_compound_convolve_sve2.c | 160 ++++++++++-------- 1 file changed, 93 insertions(+), 67 deletions(-) diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c index baffc0edb5..8d618fd345 100644 --- a/av1/common/arm/highbd_compound_convolve_sve2.c +++ b/av1/common/arm/highbd_compound_convolve_sve2.c @@ -849,42 +849,74 @@ void av1_highbd_dist_wtd_convolve_y_sve2( } } -static INLINE uint16x8_t highbd_convolve8_8_2d_h(int16x8_t s0[8], - int16x8_t filter, - int64x2_t offset, - int32x4_t shift) { - int64x2_t sum[8]; +static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr) { + const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 2)); + const int16x8_t filter = vld1q_s16(x_filter_ptr); - sum[0] = aom_sdotq_s16(offset, s0[0], filter); - sum[1] = aom_sdotq_s16(offset, s0[1], filter); - sum[2] = aom_sdotq_s16(offset, s0[2], filter); - sum[3] = aom_sdotq_s16(offset, s0[3], filter); - sum[4] = aom_sdotq_s16(offset, s0[4], filter); - sum[5] = aom_sdotq_s16(offset, s0[5], filter); - sum[6] = aom_sdotq_s16(offset, s0[6], filter); - sum[7] = aom_sdotq_s16(offset, s0[7], filter); + // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know + // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at + // a time and then process the last 3 rows separately. - sum[0] = vpaddq_s64(sum[0], sum[1]); - sum[2] = vpaddq_s64(sum[2], sum[3]); - sum[4] = vpaddq_s64(sum[4], sum[5]); - sum[6] = vpaddq_s64(sum[6], sum[7]); + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; - int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); - int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); - sum0123 = vshlq_s32(sum0123, shift); - sum4567 = vshlq_s32(sum4567, shift); + uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset); + uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset); + uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset); + uint16x8_t d3 = highbd_12_convolve8_8_x(s3, filter, offset); - return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. 
+ const int16_t *s = (const int16_t *)src; + do { + int16x8_t s0[8], s1[8], s2[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], + &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], + &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], + &s2[5], &s2[6], &s2[7]); + + uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset); + uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset); + uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset); + + store_u16_8x3(dst, dst_stride, d0, d1, d2); + s += 8; + dst += 8; + width -= 8; + } while (width != 0); } static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, - int width, int height, const int16_t *x_filter_ptr, - ConvolveParams *conv_params, const int offset) { - const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); - - const int64x2_t offset_lo = vcombine_s64(vcreate_s64(offset), vdup_n_s64(0)); + int width, int height, const int16_t *x_filter_ptr, const int bd) { + const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 2)); const int16x8_t filter = vld1q_s16(x_filter_ptr); // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know @@ -907,10 +939,10 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2( load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); - uint16x8_t d0 = highbd_convolve8_8_2d_h(s0, filter, offset_lo, shift); - uint16x8_t d1 = highbd_convolve8_8_2d_h(s1, filter, offset_lo, shift); - uint16x8_t d2 = highbd_convolve8_8_2d_h(s2, filter, offset_lo, shift); - uint16x8_t d3 = highbd_convolve8_8_2d_h(s3, filter, offset_lo, shift); + uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset); + uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset); + uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset); + uint16x8_t d3 = highbd_convolve8_8_x(s3, filter, offset); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -934,9 +966,9 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2( load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); - uint16x8_t d0 = highbd_convolve8_8_2d_h(s0, filter, offset_lo, shift); - uint16x8_t d1 = highbd_convolve8_8_2d_h(s1, filter, offset_lo, shift); - uint16x8_t d2 = highbd_convolve8_8_2d_h(s2, filter, offset_lo, shift); + uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset); + uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset); + uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset); store_u16_8x3(dst, dst_stride, d0, d1, d2); s += 8; @@ -984,8 +1016,9 @@ static INLINE uint16x8_t highbd_convolve4_8_2d_h(int16x8_t s0[4], static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, - ConvolveParams *conv_params, const int offset) { - const int64x2_t offset_vec = vdupq_n_s64(offset); + ConvolveParams *conv_params, const int bd) { + const int64x2_t offset = vdupq_n_s64((1 << (bd + FILTER_BITS - 1)) + + (1 << (conv_params->round_0 - 1))); const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); @@ -1004,13 +1037,13 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( 
load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = - highbd_convolve4_4_2d_h(s0, filter, offset_vec, shift, permute_tbl); + highbd_convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl); uint16x4_t d1 = - highbd_convolve4_4_2d_h(s1, filter, offset_vec, shift, permute_tbl); + highbd_convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl); uint16x4_t d2 = - highbd_convolve4_4_2d_h(s2, filter, offset_vec, shift, permute_tbl); + highbd_convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl); uint16x4_t d3 = - highbd_convolve4_4_2d_h(s3, filter, offset_vec, shift, permute_tbl); + highbd_convolve4_4_2d_h(s3, filter, offset, shift, permute_tbl); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); @@ -1024,11 +1057,11 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( load_s16_8x3(s, src_stride, &s0, &s1, &s2); uint16x4_t d0 = - highbd_convolve4_4_2d_h(s0, filter, offset_vec, shift, permute_tbl); + highbd_convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl); uint16x4_t d1 = - highbd_convolve4_4_2d_h(s1, filter, offset_vec, shift, permute_tbl); + highbd_convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl); uint16x4_t d2 = - highbd_convolve4_4_2d_h(s2, filter, offset_vec, shift, permute_tbl); + highbd_convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl); store_u16_4x3(dst, dst_stride, d0, d1, d2); @@ -1047,14 +1080,10 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); - uint16x8_t d0 = - highbd_convolve4_8_2d_h(s0, filter, offset_vec, shift, idx); - uint16x8_t d1 = - highbd_convolve4_8_2d_h(s1, filter, offset_vec, shift, idx); - uint16x8_t d2 = - highbd_convolve4_8_2d_h(s2, filter, offset_vec, shift, idx); - uint16x8_t d3 = - highbd_convolve4_8_2d_h(s3, filter, offset_vec, shift, idx); + uint16x8_t d0 = highbd_convolve4_8_2d_h(s0, filter, offset, shift, idx); + uint16x8_t d1 = highbd_convolve4_8_2d_h(s1, filter, offset, shift, idx); + uint16x8_t d2 = highbd_convolve4_8_2d_h(s2, filter, offset, shift, idx); + uint16x8_t d3 = highbd_convolve4_8_2d_h(s3, filter, offset, shift, idx); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -1076,12 +1105,9 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); - uint16x8_t d0 = - highbd_convolve4_8_2d_h(s0, filter, offset_vec, shift, idx); - uint16x8_t d1 = - highbd_convolve4_8_2d_h(s1, filter, offset_vec, shift, idx); - uint16x8_t d2 = - highbd_convolve4_8_2d_h(s2, filter, offset_vec, shift, idx); + uint16x8_t d0 = highbd_convolve4_8_2d_h(s0, filter, offset, shift, idx); + uint16x8_t d1 = highbd_convolve4_8_2d_h(s1, filter, offset, shift, idx); + uint16x8_t d2 = highbd_convolve4_8_2d_h(s2, filter, offset, shift, idx); store_u16_8x3(dst, dst_stride, d0, d1, d2); @@ -1410,10 +1436,6 @@ void av1_highbd_dist_wtd_convolve_2d_sve2( const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = clamped_x_taps / 2 - 1; - // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a - // faster non-rounding non-saturating left shift. 
-  const int round_offset_conv_x =
-      (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
   const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int round_offset_conv_y = (1 << y_offset_bits);
 
@@ -1425,13 +1447,17 @@ void av1_highbd_dist_wtd_convolve_2d_sve2(
   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
   if (x_filter_taps <= 4) {
-    highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(
-        src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
-        conv_params, round_offset_conv_x);
+    highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(src_ptr, src_stride, im_block,
+                                                im_stride, w, im_h,
+                                                x_filter_ptr, conv_params, bd);
   } else {
+    if (bd == 12) {
+      highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2(
+          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr);
+    } else {
+      highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
+          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, bd);
+    }
   }
 
   if (conv_params->do_average) {
-- 
GitLab


From b218391386aded8e82bdb44f34ad316a16639925 Mon Sep 17 00:00:00 2001
From: Salome Thirot <salome.thirot@arm.com>
Date: Tue, 19 Mar 2024 16:39:54 +0000
Subject: [PATCH 020/391] Specialise 4-tap Neon HBD 2D dist_wtd_convolve on
 bitdepth

Add a 12-bit specialised path for the 4-tap horizontal pass of
av1_highbd_dist_wtd_convolve_2d_sve2, giving up to 10% uplift for the
whole 2D convolution over the non-specialised version.

Change-Id: I997cfa3945d3920630311d8409ff32ede4f050e0
---
 .../arm/highbd_compound_convolve_sve2.c       | 172 ++++++++++++------
 1 file changed, 112 insertions(+), 60 deletions(-)

diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c
index 8d618fd345..1d6c9b4faf 100644
--- a/av1/common/arm/highbd_compound_convolve_sve2.c
+++ b/av1/common/arm/highbd_compound_convolve_sve2.c
@@ -977,49 +977,105 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
   } while (width != 0);
 }
 
-static INLINE uint16x4_t highbd_convolve4_4_2d_h(int16x8_t s0, int16x8_t filter,
-                                                 int64x2_t offset,
-                                                 int32x4_t shift,
-                                                 uint16x8x2_t permute_tbl) {
-  int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
-  int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]);
+static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    int width, int height, const int16_t *x_filter_ptr) {
+  const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 1));
+  const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+  const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0));
 
-  int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0);
-  int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0);
+  // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know
+  // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at
+  // a time and then process the last 3 rows separately.
- int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); - sum0123 = vshlq_s32(sum0123, shift); + if (width == 4) { + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); - return vqmovun_s32(sum0123); -} + const int16_t *s = (const int16_t *)(src); -static INLINE uint16x8_t highbd_convolve4_8_2d_h(int16x8_t s0[4], - int16x8_t filter, - int64x2_t offset, - int32x4_t shift, - uint16x8_t tbl) { - int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); - int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); - int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); - int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); - int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); - sum0415 = vshlq_s32(sum0415, shift); + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); + uint16x4_t d3 = highbd_12_convolve4_4_x(s3, filter, offset, permute_tbl); - int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); - sum2637 = vshlq_s32(sum2637, shift); + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); - uint16x8_t res = vcombine_u16(vqmovun_s32(sum0415), vqmovun_s32(sum2637)); - return aom_tbl_u16(res, tbl); + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); + + store_u16_4x3(dst, dst_stride, d0, d1, d2); + + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); + uint16x8_t d3 = highbd_12_convolve4_8_x(s3, filter, offset, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. 
+ const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0[4], s1[4], s2[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + + uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); + + store_u16_8x3(dst, dst_stride, d0, d1, d2); + + s += 8; + dst += 8; + width -= 8; + } while (width != 0); + } } static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, - int width, int height, const int16_t *x_filter_ptr, - ConvolveParams *conv_params, const int bd) { - const int64x2_t offset = vdupq_n_s64((1 << (bd + FILTER_BITS - 1)) + - (1 << (conv_params->round_0 - 1))); - const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + int width, int height, const int16_t *x_filter_ptr, const int bd) { + const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 1)); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); @@ -1036,14 +1092,10 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( int16x8_t s0, s1, s2, s3; load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); - uint16x4_t d0 = - highbd_convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl); - uint16x4_t d1 = - highbd_convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl); - uint16x4_t d2 = - highbd_convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl); - uint16x4_t d3 = - highbd_convolve4_4_2d_h(s3, filter, offset, shift, permute_tbl); + uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); + uint16x4_t d3 = highbd_convolve4_4_x(s3, filter, offset, permute_tbl); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); @@ -1056,15 +1108,11 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); - uint16x4_t d0 = - highbd_convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl); - uint16x4_t d1 = - highbd_convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl); - uint16x4_t d2 = - highbd_convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl); + uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); store_u16_4x3(dst, dst_stride, d0, d1, d2); - } else { uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); @@ -1080,10 +1128,10 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); - uint16x8_t d0 = highbd_convolve4_8_2d_h(s0, filter, offset, shift, idx); - uint16x8_t d1 = highbd_convolve4_8_2d_h(s1, filter, offset, shift, idx); - uint16x8_t d2 = highbd_convolve4_8_2d_h(s2, filter, offset, shift, idx); - uint16x8_t d3 = highbd_convolve4_8_2d_h(s3, filter, offset, shift, idx); + uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_convolve4_8_x(s2, 
filter, offset, idx);
+      uint16x8_t d3 = highbd_convolve4_8_x(s3, filter, offset, idx);
 
       store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -1105,9 +1153,9 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(
       load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
       load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
 
-      uint16x8_t d0 = highbd_convolve4_8_2d_h(s0, filter, offset, shift, idx);
-      uint16x8_t d1 = highbd_convolve4_8_2d_h(s1, filter, offset, shift, idx);
-      uint16x8_t d2 = highbd_convolve4_8_2d_h(s2, filter, offset, shift, idx);
+      uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx);
+      uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx);
+      uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx);
 
       store_u16_8x3(dst, dst_stride, d0, d1, d2);
 
@@ -1446,14 +1494,18 @@ void av1_highbd_dist_wtd_convolve_2d_sve2(
   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
-  if (x_filter_taps <= 4) {
-    highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(src_ptr, src_stride, im_block,
-                                                im_stride, w, im_h,
-                                                x_filter_ptr, conv_params, bd);
-  } else {
-    if (bd == 12) {
+  if (bd == 12) {
+    if (x_filter_taps <= 4) {
+      highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2(
+          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr);
+    } else {
       highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2(
           src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr);
+    }
+  } else {
+    if (x_filter_taps <= 4) {
+      highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(
+          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, bd);
     } else {
       highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
           src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, bd);
-- 
GitLab


From 23d4875b813d4d1e7fb1e1a94a129a9c606a481c Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Wed, 20 Mar 2024 14:52:44 -0700
Subject: [PATCH 021/391] rtc: Speedup for active_maps with screen

-Disable loopfilter and cdef at the frame level, based on the
 percent of inactive blocks.
-Remove the source_variance calculation for some blocks labelled
 as seg_skip (inactive via active_maps); mainly this is for blocks
 on the boundary of active/inactive.
-Force cdef to always skip for blocks labelled as inactive
 (seg_skip=1).
-Allow skip_over4x4 to be enabled.

This change is not bitexact, but has a small quality difference.
Speedup ~3% in offline test.

Change-Id: Ia0ae3788f54c436dced5cbba3aa98fcda0287ff1
---
 av1/encoder/aq_cyclicrefresh.c | 10 +++++++++-
 av1/encoder/encoder.c          |  5 ++++-
 av1/encoder/partition_search.c | 30 ++++++++++++++++++------------
 av1/encoder/speed_features.c   |  2 ++
 av1/encoder/speed_features.h   |  4 ++++
 5 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index 11b6ea629b..73357eb075 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -179,6 +179,10 @@ void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x,
         memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
       }
     }
+  } else if (prev_segment_id == AM_SEGMENT_ID_INACTIVE) {
+    // TODO(marpan): Look into why this condition is needed
+    // (when skip_over4x4 = 1) to prevent decoder failure.
+    mbmi->segment_id = 0;
   }
   if (!dry_run) {
     if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1)
@@ -434,7 +438,7 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
   // function av1_cyclic_reset_segment_skip().
Skipping over // 4x4 will therefore have small bdrate loss (~0.2%), so // we use it only for speed > 9 for now. - cr->skip_over4x4 = (cpi->oxcf.speed > 9 && !cpi->active_map.enabled) ? 1 : 0; + cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0; // should we enable cyclic refresh on this frame. cr->apply_cyclic_refresh = 1; @@ -668,6 +672,10 @@ void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) { int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int qindex = cpi->common.quant_params.base_qindex; + if (cpi->active_map.enabled && + cpi->rc.percent_blocks_inactive > + cpi->sf.rt_sf.thresh_active_maps_skip_lf_cdef) + return 1; if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 && cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh && cpi->rc.frame_source_sad < 1000 && diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index ed5f92b528..1ddbfda08b 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -2394,7 +2394,10 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { const int use_loopfilter = is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc; - const int use_cdef = is_cdef_used(cm); + const int use_cdef = + is_cdef_used(cm) && (!cpi->active_map.enabled || + cpi->rc.percent_blocks_inactive <= + cpi->sf.rt_sf.thresh_active_maps_skip_lf_cdef); const int use_superres = av1_superres_scaled(cm); const int use_restoration = is_restoration_used(cm); diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c index cef3397570..61d49a23f2 100644 --- a/av1/encoder/partition_search.c +++ b/av1/encoder/partition_search.c @@ -2255,6 +2255,8 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode; TxfmSearchInfo *txfm_info = &x->txfm_search_info; int i; + const int seg_skip = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); // This is only needed for real time/allintra row-mt enabled multi-threaded // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF. @@ -2277,15 +2279,17 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; - x->force_zeromv_skip_for_blk = - get_force_zeromv_skip_flag_for_blk(cpi, x, bsize); + if (!seg_skip) { + x->force_zeromv_skip_for_blk = + get_force_zeromv_skip_flag_for_blk(cpi, x, bsize); - // Source variance may be already compute at superblock level, so no need - // to recompute, unless bsize < sb_size or source_variance is not yet set. - if (!x->force_zeromv_skip_for_blk && - (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size)) - x->source_variance = av1_get_perpixel_variance_facade( - cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + // Source variance may be already compute at superblock level, so no need + // to recompute, unless bsize < sb_size or source_variance is not yet set. + if (!x->force_zeromv_skip_for_blk && + (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size)) + x->source_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + } // Save rdmult before it might be changed, so it can be restored later. 
  const int orig_rdmult = x->rdmult;
@@ -2306,7 +2310,7 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, nonrd_pick_inter_mode_sb_time);
 #endif
-  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+  if (seg_skip) {
     x->force_zeromv_skip_for_blk = 1;
     // TODO(marpan): Consider adding a function for nonrd:
     // av1_nonrd_pick_inter_mode_sb_seg_skip(), instead of setting
@@ -2320,10 +2324,12 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
   if (cpi->sf.rt_sf.skip_cdef_sb) {
     // cdef_strength is initialized to 1 which means skip_cdef, and is updated
     // here. Check to see is skipping cdef is allowed.
+    // Always allow cdef_skip for seg_skip = 1.
     const int allow_cdef_skipping =
-        cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
-        !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
-          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]);
+        seg_skip ||
+        (cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
+         !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+           x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]));
 
     // Find the corresponding 64x64 block. It'll be the 128x128 block if that's
     // the block size.
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index f7242f8f95..256b6fc9eb 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1577,6 +1577,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
       sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80;
       sf->rt_sf.part_early_exit_zeromv = 1;
       sf->rt_sf.nonrd_aggressive_skip = 1;
+      sf->rt_sf.thresh_active_maps_skip_lf_cdef = 90;
     }
     if (speed >= 11) {
       sf->rt_sf.skip_lf_screen = 2;
@@ -2275,6 +2276,7 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
   rt_sf->part_early_exit_zeromv = 0;
   rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED;
   rt_sf->skip_lf_screen = 0;
+  rt_sf->thresh_active_maps_skip_lf_cdef = 100;
   rt_sf->sad_based_adp_altref_lag = 0;
   rt_sf->partition_direct_merging = 0;
   rt_sf->var_part_based_on_qidx = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index ef93e1d7d5..d59cb38a71 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1774,6 +1774,10 @@ typedef struct REAL_TIME_SPEED_FEATURES {
   // where rc->high_source_sad = 0 (no slide-changes).
   int skip_lf_screen;
 
+  // Threshold on the active/inactive region percent to disable
+  // the loopfilter and cdef. Setting to 100 disables this feature.
+  int thresh_active_maps_skip_lf_cdef;
+
   // For nonrd: early exit out of variance partition that sets the
   // block size to superblock size, and sets mode to zeromv-last skip.
   // 0: disabled
-- 
GitLab


From d4043476019a88e108d125f4168fd0533887af56 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Thu, 28 Mar 2024 15:18:58 -0700
Subject: [PATCH 022/391] rtc: Avoid reset of segment for inactive blocks

For active_map in rtc mode.
For skipped blocks that are labeled as inactive: remove the
reset to 0 for the segment_id. This allows them to be skipped
in loopfilter at the block level (since SEG_LVL_ALT_LF_Y_H/V is
set in av1_apply_active_map).

The fix needed to support this is to make sure the segment_skip
block uses GLOBALMV and LAST_FRAME, as required by the bitstream.
In the current code we were allowing for NEAREST if motion vector
is (0, 0), which is incorrect.
The current patch explicitly puts this constraint (globalmv-last) early in nonrd_pickmode. This fix has small speedup, ~1%, with negligible quality change. Change-Id: Ic5e9dbd7b9bec4d57edf90cf73f44719689e1dab --- av1/encoder/aq_cyclicrefresh.c | 4 ---- av1/encoder/nonrd_pickmode.c | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c index 73357eb075..1aa8dde323 100644 --- a/av1/encoder/aq_cyclicrefresh.c +++ b/av1/encoder/aq_cyclicrefresh.c @@ -179,10 +179,6 @@ void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x, memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis); } } - } else if (prev_segment_id == AM_SEGMENT_ID_INACTIVE) { - // TODO(marpan): Look into why this condition is needed - // (when skip_over4x4 = 1) to prevent decoder failure. - mbmi->segment_id = 0; } if (!dry_run) { if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 41e2e212ae..57c74f66d5 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -2357,6 +2357,10 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( *ref_frame2 = NONE_FRAME; } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) && + (*this_mode != GLOBALMV || *ref_frame != LAST_FRAME)) + return true; + if (x->sb_me_block && *ref_frame == LAST_FRAME) { // We want to make sure to test the superblock MV: // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they -- GitLab From d07e7fddcf6e6860f9c941a28af4a81f78a8b3d2 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 22 Mar 2024 15:42:09 -0700 Subject: [PATCH 023/391] highbd_variance_avx2.c: fix #undef this was missed in: f2658a3cfe clear -Wextra-semi/-Wextra-semi-stmt warnings SSE2_Height -> SSE2_HEIGHT Change-Id: I728628bcb995cc90c9f7a2c1382b7f047653fb2c --- aom_dsp/x86/highbd_variance_avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c index c39c238604..adbb736cfc 100644 --- a/aom_dsp/x86/highbd_variance_avx2.c +++ b/aom_dsp/x86/highbd_variance_avx2.c @@ -749,7 +749,7 @@ VAR_FN(8, 32, 8, 8) SSE2_HEIGHT(8) SSE2_HEIGHT(16) -#undef SSE2_Height +#undef SSE2_HEIGHT #define HIGHBD_SUBPIX_VAR(W, H) \ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \ -- GitLab From 72bbef26ed929fc3a5ea6dae3a4069135a6aad99 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 22 Mar 2024 15:14:27 -0700 Subject: [PATCH 024/391] add aom_highbd_10_mse16x16_avx2 highbd_10_variance_avx2 has been available since: f2b7da03c3 Add avx2 variants of highbd 8x8 and 16x16 var modules but the corresponding mse function wasn't created. Only 16x16 is added in this change as 8x8 is slower than sse2. There's no sse2 for 8x16 or 16x8, but it should be explored before adding avx2. The 8 and 12 bit variants are also avoided to keep the library size down. 
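(For context: the shared variance kernel returns both the SSE and the
sum of differences through out-parameters; MSE is simply that SSE with
the sum discarded, whereas a variance function would go on to compute
sse - sum * sum / 256 for a 16x16 block.)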
Change-Id: If701627d31059ea413aba63a422fa89e0ea33e74 --- aom_dsp/aom_dsp_rtcd_defs.pl | 5 +++++ aom_dsp/x86/highbd_variance_avx2.c | 11 +++++++++++ test/variance_test.cc | 5 +++++ 3 files changed, 21 insertions(+) diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 7d8cfb9487..7e746e9cb9 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -1355,6 +1355,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize "aom_highbd_${bd}_mse16x8", qw/neon neon_dotprod/; specialize "aom_highbd_${bd}_mse8x16", qw/neon neon_dotprod/; specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon neon_dotprod/; + } elsif ($bd eq 10) { + specialize "aom_highbd_${bd}_mse16x16", qw/avx2 sse2 neon sve/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; } else { specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/; specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c index adbb736cfc..21e9e8b282 100644 --- a/aom_dsp/x86/highbd_variance_avx2.c +++ b/aom_dsp/x86/highbd_variance_avx2.c @@ -741,6 +741,17 @@ VAR_FN(8, 32, 8, 8) #undef VAR_FN +unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + highbd_calc16x16var_avx2, 16); + return *sse; +} + #define SSE2_HEIGHT(H) \ uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ diff --git a/test/variance_test.cc b/test/variance_test.cc index 11859034d9..4afc7ce62f 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -2828,6 +2828,11 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 3, &aom_highbd_10_mse8x8_sse2, 10), MseParams(4, 4, &aom_highbd_8_mse16x16_sse2, 8), MseParams(3, 3, &aom_highbd_8_mse8x8_sse2, 8))); +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AvxHBDMseTest, + ::testing::Values(MseParams(4, 4, &aom_highbd_10_mse16x16_avx2, 10))); +#endif // HAVE_AVX2 const VarianceParams kArrayHBDVariance_sse2[] = { VarianceParams(7, 7, &aom_highbd_12_variance128x128_sse2, 12), -- GitLab From c5fec7f5e6f49e6882569e4726ee798f05acf456 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 22 Mar 2024 16:36:24 -0700 Subject: [PATCH 025/391] variance_test: relocate SSE2 AvxHBDMseTest instantiation move this under HAVE_SSE2 within the CONFIG_AV1_HIGHBITDEPTH block Change-Id: Icf2e2712fd3a219b2a4f2d04131c21fc10d7e728 --- test/variance_test.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 4afc7ce62f..261c080028 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -2794,6 +2794,15 @@ INSTANTIATE_TEST_SUITE_P( MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_sse2, 10), MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_sse2, 10))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxHBDMseTest, + ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2, 12), + MseParams(3, 3, &aom_highbd_12_mse8x8_sse2, 12), + MseParams(4, 4, &aom_highbd_10_mse16x16_sse2, 10), + MseParams(3, 3, &aom_highbd_10_mse8x8_sse2, 10), + MseParams(4, 4, &aom_highbd_8_mse16x16_sse2, 8), + MseParams(3, 
3, &aom_highbd_8_mse8x8_sse2, 8))); #endif // HAVE_SSE2 #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( @@ -2820,14 +2829,6 @@ INSTANTIATE_TEST_SUITE_P( 12))); #endif // HAVE_SSE4_1 -INSTANTIATE_TEST_SUITE_P( - SSE2, AvxHBDMseTest, - ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2, 12), - MseParams(3, 3, &aom_highbd_12_mse8x8_sse2, 12), - MseParams(4, 4, &aom_highbd_10_mse16x16_sse2, 10), - MseParams(3, 3, &aom_highbd_10_mse8x8_sse2, 10), - MseParams(4, 4, &aom_highbd_8_mse16x16_sse2, 8), - MseParams(3, 3, &aom_highbd_8_mse8x8_sse2, 8))); #if HAVE_AVX2 INSTANTIATE_TEST_SUITE_P( AVX2, AvxHBDMseTest, -- GitLab From 0d1a79ca0a20f6030c4d4c048c0905f3a7b20021 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 29 Mar 2024 16:34:20 -0700 Subject: [PATCH 026/391] decodeframe.c: make av1_read_film_grain_params static This quiets a -Wmissing-prototypes warning. Bug: aomedia:3416 Change-Id: Ifb30648355a0ec28b90704465277fd8e033696a4 --- av1/decoder/decodeframe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index 650e44064e..c027308ff3 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -3894,8 +3894,8 @@ static AOM_INLINE void read_bitdepth( #endif } -void av1_read_film_grain_params(AV1_COMMON *cm, - struct aom_read_bit_buffer *rb) { +static void read_film_grain_params(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { aom_film_grain_t *pars = &cm->film_grain_params; const SequenceHeader *const seq_params = cm->seq_params; @@ -4063,7 +4063,7 @@ static AOM_INLINE void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { if (cm->seq_params->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { - av1_read_film_grain_params(cm, rb); + read_film_grain_params(cm, rb); } else { memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } -- GitLab From 827c8fd0dbfa95b79f19e1f8fd4c845762810c7f Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Sat, 30 Mar 2024 15:34:29 -0700 Subject: [PATCH 027/391] Add aom_usage_exit OBJECT library for usage_exit.c This removes usage_exit.c from the sources of the test_libaom and test_intra_pred_speed executable targets so that the sources of those two targets are all C++ files. Change-Id: I838c08a88e17371df4458278c42d8e67419ba147 --- CMakeLists.txt | 11 ++++++----- test/test.cmake | 12 ++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b975ec59f..00a7e2bca9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -374,6 +374,7 @@ file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.c" # if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES}) + add_library(aom_usage_exit OBJECT "${AOM_GEN_SRC_DIR}/usage_exit.c") set_property(TARGET ${example} PROPERTY FOLDER examples) if(CONFIG_AV1_DECODER) add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES}) @@ -508,10 +509,10 @@ if(CONFIG_AV1_ENCODER) # aom_entropy_optimizer.c won't work on macos, but dragging in all the # helper machinery allows the link to succeed. add_executable(aom_entropy_optimizer - "${AOM_GEN_SRC_DIR}/usage_exit.c" "${AOM_ROOT}/tools/aom_entropy_optimizer.c" $<TARGET_OBJECTS:aom_common_app_util> - $<TARGET_OBJECTS:aom_encoder_app_util>) + $<TARGET_OBJECTS:aom_encoder_app_util> + $<TARGET_OBJECTS:aom_usage_exit>) # Maintain a list of encoder tool targets. 
list(APPEND AOM_ENCODER_TOOL_TARGETS aom_entropy_optimizer) @@ -661,12 +662,12 @@ endif() if(ENABLE_TOOLS) if(CONFIG_AV1_DECODER) - add_executable(dump_obu "${AOM_GEN_SRC_DIR}/usage_exit.c" - "${AOM_ROOT}/tools/dump_obu.cc" + add_executable(dump_obu "${AOM_ROOT}/tools/dump_obu.cc" "${AOM_ROOT}/tools/obu_parser.cc" "${AOM_ROOT}/tools/obu_parser.h" $<TARGET_OBJECTS:aom_common_app_util> - $<TARGET_OBJECTS:aom_decoder_app_util>) + $<TARGET_OBJECTS:aom_decoder_app_util> + $<TARGET_OBJECTS:aom_usage_exit>) list(APPEND AOM_TOOL_TARGETS dump_obu) list(APPEND AOM_APP_TARGETS dump_obu) diff --git a/test/test.cmake b/test/test.cmake index a11f7583da..e2f5da570d 100644 --- a/test/test.cmake +++ b/test/test.cmake @@ -28,8 +28,7 @@ function(add_to_libaom_test_srcs src_list_name) set(AOM_TEST_SOURCE_VARS "${AOM_TEST_SOURCE_VARS}" PARENT_SCOPE) endfunction() -list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c" - "${AOM_ROOT}/test/test_libaom.cc") +list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_ROOT}/test/test_libaom.cc") add_to_libaom_test_srcs(AOM_UNIT_TEST_WRAPPER_SOURCES) list(APPEND AOM_UNIT_TEST_COMMON_SOURCES @@ -102,7 +101,7 @@ add_to_libaom_test_srcs(AOM_UNIT_TEST_ENCODER_SOURCES) list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc") list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h") add_to_libaom_test_srcs(AOM_UNIT_TEST_WEBM_SOURCES) -list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c" +list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_ROOT}/test/test_intra_pred_speed.cc") if(CONFIG_AV1_DECODER) @@ -462,6 +461,7 @@ function(setup_aom_test_targets) add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES} $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_usage_exit> $<TARGET_OBJECTS:test_aom_common>) set_property(TARGET test_libaom PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER}) list(APPEND AOM_APP_TARGETS test_libaom) @@ -484,9 +484,9 @@ function(setup_aom_test_targets) endif() if(NOT BUILD_SHARED_LIBS) - add_executable(test_intra_pred_speed - ${AOM_TEST_INTRA_PRED_SPEED_SOURCES} - $<TARGET_OBJECTS:aom_common_app_util>) + add_executable(test_intra_pred_speed ${AOM_TEST_INTRA_PRED_SPEED_SOURCES} + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_usage_exit>) set_property(TARGET test_intra_pred_speed PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER}) target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom -- GitLab From 1621d2c08f70d7992f8a84155f995090cd1623fd Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 29 Mar 2024 16:54:10 -0700 Subject: [PATCH 028/391] aom_dsp/x86/*.c: make some functions static This quiets some -Wmissing-prototypes warnings. 
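For context, -Wmissing-prototypes fires on any function definition with external linkage that has no prior prototype; giving file-local helpers internal linkage is the usual fix. A minimal illustration (hypothetical function, not from the tree):

    int scale2(int x) { return 2 * x; }             // warns: no previous prototype
    static int scale2_quiet(int x) { return 2 * x; }  // internal linkage: no warning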
Bug: aomedia:3416 Change-Id: I998fac81b84f61736f0c506a6c6a012aaf0d10a6 --- aom_dsp/x86/avg_intrin_sse2.c | 2 +- aom_dsp/x86/intrapred_ssse3.c | 8 ++--- aom_dsp/x86/masked_sad4d_ssse3.c | 50 +++++++++++++++++--------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c index 9ab9143eee..0b552b704b 100644 --- a/aom_dsp/x86/avg_intrin_sse2.c +++ b/aom_dsp/x86/avg_intrin_sse2.c @@ -133,7 +133,7 @@ unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { return (avg + 32) >> 6; } -void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { +static void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { __m128i sum0, sum1, s0, s1, s2, s3, u0; u0 = _mm_setzero_si128(); s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0); diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c index fd48260c6f..869f880bda 100644 --- a/aom_dsp/x86/intrapred_ssse3.c +++ b/aom_dsp/x86/intrapred_ssse3.c @@ -940,10 +940,10 @@ static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) { return _mm_unpacklo_epi16((x), _mm_setzero_si128()); } -void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, - const uint8_t *LIBAOM_RESTRICT top_row, - const uint8_t *LIBAOM_RESTRICT left_column, int width, - int height) { +static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column, + int width, int height) { const uint8_t *const sm_weights_h = smooth_weights + height - 4; const uint8_t *const sm_weights_w = smooth_weights + width - 4; const __m128i zero = _mm_setzero_si128(); diff --git a/aom_dsp/x86/masked_sad4d_ssse3.c b/aom_dsp/x86/masked_sad4d_ssse3.c index 799ce9ef44..d96a9dd23d 100644 --- a/aom_dsp/x86/masked_sad4d_ssse3.c +++ b/aom_dsp/x86/masked_sad4d_ssse3.c @@ -103,11 +103,12 @@ static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride, pred = _mm_packus_epi16(pred_l, pred_r); \ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); -void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height, - int inv_mask, unsigned sad_array[4]) { +static void masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height, int inv_mask, + unsigned sad_array[4]) { const uint8_t *ref0 = ref_array[0]; const uint8_t *ref1 = ref_array[1]; const uint8_t *ref2 = ref_array[2]; @@ -164,11 +165,12 @@ void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); -void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height, - int inv_mask, unsigned sad_array[4]) { +static void masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height, int inv_mask, + unsigned sad_array[4]) { const uint8_t *ref0 = ref_array[0]; const uint8_t *ref1 = ref_array[1]; const uint8_t *ref2 = ref_array[2]; @@ -224,22 +226,22 @@ void 
aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, msk_stride, m, n, inv_mask, sad_array); \ } -#define MASKSAD8XN_SSSE3(n) \ - void aom_masked_sad8x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ - 8, msk, msk_stride, n, inv_mask, sad_array); \ +#define MASKSAD8XN_SSSE3(n) \ + void aom_masked_sad8x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 8, \ + msk, msk_stride, n, inv_mask, sad_array); \ } -#define MASKSAD4XN_SSSE3(n) \ - void aom_masked_sad4x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ - 4, msk, msk_stride, n, inv_mask, sad_array); \ +#define MASKSAD4XN_SSSE3(n) \ + void aom_masked_sad4x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 4, \ + msk, msk_stride, n, inv_mask, sad_array); \ } MASKSADMXN_SSSE3(128, 128) -- GitLab From 882b392a28bae79bc89f247d2917ba3a31a9caaa Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 29 Mar 2024 16:49:09 -0700 Subject: [PATCH 029/391] fwd_txfm_impl_sse2.h: add missing #if defined(FDCT* Not all of the macros are defined depending on whether the file is included for high-bitdepth or not. This quiets some -Wmissing-prototypes warnings. Bug: aomedia:3416 Change-Id: If8043406dde2cfce045ae796aa462db67620a251 --- aom_dsp/x86/fwd_txfm_impl_sse2.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/aom_dsp/x86/fwd_txfm_impl_sse2.h b/aom_dsp/x86/fwd_txfm_impl_sse2.h index 7ee8ba330e..e1db3b950c 100644 --- a/aom_dsp/x86/fwd_txfm_impl_sse2.h +++ b/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -30,6 +30,7 @@ #define SUB_EPI16 _mm_sub_epi16 #endif +#if defined(FDCT4x4_2D_HELPER) static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, __m128i *in1) { // Constants @@ -185,7 +186,9 @@ static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, } } } +#endif // defined(FDCT4x4_2D_HELPER) +#if defined(FDCT4x4_2D) void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { // This 2D transform implements 4 vertical 1D transforms followed // by 4 horizontal 1D transforms. 
The multiplies and adds are as given @@ -205,13 +208,16 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { storeu_output(&in0, output + 0 * 4); storeu_output(&in1, output + 2 * 4); } +#endif // defined(FDCT4x4_2D) +#if defined(FDCT4x4_2D_LP) void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { __m128i in0, in1; FDCT4x4_2D_HELPER(input, stride, &in0, &in1); _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); } +#endif // defined(FDCT4x4_2D_LP) #if CONFIG_INTERNAL_STATS void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { -- GitLab From b47c4fc714e9732f373c98bcdf64037813569497 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 29 Mar 2024 16:36:59 -0700 Subject: [PATCH 030/391] av1/encoder/*.c: make some functions static This quiets some -Wmissing-prototypes warnings. Bug: aomedia:3416 Change-Id: I3d9bd72fdef1cab606f6c31702a814262f374cab --- av1/encoder/bitstream.c | 19 +++++++++---------- av1/encoder/encodetxb.c | 26 +++++++++++++------------- av1/encoder/ethread.c | 4 ++-- av1/encoder/palette.c | 2 +- av1/encoder/partition_strategy.c | 2 +- av1/encoder/temporal_filter.c | 19 ++++++++++--------- av1/encoder/tx_search.c | 23 +++++++++++------------ 7 files changed, 47 insertions(+), 48 deletions(-) diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index 219784fedf..9981871147 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -3391,8 +3391,8 @@ int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, return AOM_CODEC_OK; } -size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size, - uint8_t *data) { +static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size, + uint8_t *data) { const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); const size_t move_dst_offset = length_field_size + obu_header_size; const size_t move_src_offset = obu_header_size; @@ -3581,7 +3581,7 @@ static void write_large_scale_tile_obu_size( *total_size += lst_obu->tg_hdr_size; const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size; const size_t length_field_size = - av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); + obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) != AOM_CODEC_OK) assert(0); @@ -3806,7 +3806,7 @@ void av1_write_last_tile_info( const uint32_t obu_payload_size = (uint32_t)(*curr_tg_data_size) - obu_header_size; const size_t length_field_size = - av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start); + obu_memmove(obu_header_size, obu_payload_size, curr_tg_start); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, curr_tg_start) != AOM_CODEC_OK) { assert(0); @@ -4015,8 +4015,8 @@ static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst, // to pack the smaller bitstream of such frames. This function computes the // number of required number of workers based on setup time overhead and job // dispatch time overhead for given tiles and available workers. 
-int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles, - int avail_workers, bool pack_bs_mt_enabled) { +static int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles, + int avail_workers, bool pack_bs_mt_enabled) { if (!pack_bs_mt_enabled) return 1; uint64_t frame_abs_sum_level = 0; @@ -4141,8 +4141,7 @@ static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) { OBU_METADATA, 0, dst); obu_payload_size = av1_write_metadata_obu(current_metadata, dst + obu_header_size); - length_field_size = - av1_obu_memmove(obu_header_size, obu_payload_size, dst); + length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) == AOM_CODEC_OK) { const size_t obu_size = obu_header_size + obu_payload_size; @@ -4192,7 +4191,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, obu_payload_size = av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size); const size_t length_field_size = - av1_obu_memmove(obu_header_size, obu_payload_size, data); + obu_memmove(obu_header_size, obu_payload_size, data); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; @@ -4217,7 +4216,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb, data + obu_header_size, 1); - length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data); + length_field = obu_memmove(obu_header_size, obu_payload_size, data); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c index 5fe2a497c7..701c5489fe 100644 --- a/av1/encoder/encodetxb.c +++ b/av1/encoder/encodetxb.c @@ -134,14 +134,14 @@ int av1_get_eob_pos_token(const int eob, int *const extra) { } #if CONFIG_ENTROPY_STATS -void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, - TX_CLASS tx_class, PLANE_TYPE plane, - FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, - uint8_t allow_update_cdf) { +static void update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, PLANE_TYPE plane, + FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { #else -void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, - PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, - uint8_t allow_update_cdf) { +static void update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, + PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, + uint8_t allow_update_cdf) { #endif int eob_extra; const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); @@ -623,11 +623,11 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, td->rd_counts.tx_type_used[tx_size][tx_type]++; #if CONFIG_ENTROPY_STATS - av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, - td->counts, allow_update_cdf); + update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, allow_update_cdf); #else - av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, - allow_update_cdf); + update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, + allow_update_cdf); #endif DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); @@ -785,8 +785,8 @@ void av1_record_txb_context(int plane, int block, int blk_row, int blk_col, #if CONFIG_ENTROPY_STATS FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - 
av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, - td->counts, 0 /*allow_update_cdf*/); + update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, 0 /*allow_update_cdf*/); DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c index 5287ded3da..755535ba51 100644 --- a/av1/encoder/ethread.c +++ b/av1/encoder/ethread.c @@ -2229,8 +2229,8 @@ void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) { } // Allocate memory for tpl row synchronization. -void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm, - int mb_rows) { +static void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm, + int mb_rows) { tpl_sync->rows = mb_rows; #if CONFIG_MULTITHREAD { diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c index 7f79e9596e..45b56199c6 100644 --- a/av1/encoder/palette.c +++ b/av1/encoder/palette.c @@ -480,7 +480,7 @@ struct ColorCount { int count; }; -int color_count_comp(const void *c1, const void *c2) { +static int color_count_comp(const void *c1, const void *c2) { const struct ColorCount *color_count1 = (const struct ColorCount *)c1; const struct ColorCount *color_count2 = (const struct ColorCount *)c2; if (color_count1->count > color_count2->count) return -1; diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c index ce06313579..1d62f128c7 100644 --- a/av1/encoder/partition_strategy.c +++ b/av1/encoder/partition_strategy.c @@ -1761,7 +1761,7 @@ void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc, // Decide whether to evaluate the AB partition specified by part_type based on // split and HORZ/VERT info -int evaluate_ab_partition_based_on_split( +static int evaluate_ab_partition_based_on_split( const PC_TREE *pc_tree, PARTITION_TYPE rect_part, const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1, int split_idx2) { diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c index 2c19009be3..e8cc145030 100644 --- a/av1/encoder/temporal_filter.c +++ b/av1/encoder/temporal_filter.c @@ -463,12 +463,12 @@ static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame, // Returns: // Nothing will be returned. But the content to which `accum` and `pred` // point will be modified. -void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame, - const MACROBLOCKD *mbd, - const BLOCK_SIZE block_size, - const int mb_row, const int mb_col, - const int num_planes, uint32_t *accum, - uint16_t *count) { +static void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame, + const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, + const int mb_row, const int mb_col, + const int num_planes, uint32_t *accum, + uint16_t *count) { // Block information. const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; @@ -564,9 +564,10 @@ static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset, // Returns: // Nothing will be returned. But the content to which `luma_sse_sum` points // will be modified. 
-void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum, - int block_height, int block_width, - int ss_x_shift, int ss_y_shift) { +static void compute_luma_sq_error_sum(uint32_t *square_diff, + uint32_t *luma_sse_sum, int block_height, + int block_width, int ss_x_shift, + int ss_y_shift) { for (int i = 0; i < block_height; ++i) { for (int j = 0; j < block_width; ++j) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c index 7292c01191..5dcc08c0ff 100644 --- a/av1/encoder/tx_search.c +++ b/av1/encoder/tx_search.c @@ -1109,13 +1109,11 @@ static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); } -uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - int block, TX_SIZE tx_size, int blk_row, - int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, - int16_t allowed_tx_mask, int prune_factor, - const TXB_CTX *const txb_ctx, - int reduced_tx_set_used, int64_t ref_best_rd, - int num_sel) { +static uint16_t prune_txk_type_separ( + const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, + int16_t allowed_tx_mask, int prune_factor, const TXB_CTX *const txb_ctx, + int reduced_tx_set_used, int64_t ref_best_rd, int num_sel) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; @@ -1255,11 +1253,12 @@ uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane, return prune; } -uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - int block, TX_SIZE tx_size, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, int *txk_map, - uint16_t allowed_tx_mask, int prune_factor, - const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { +static uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + int *txk_map, uint16_t allowed_tx_mask, + int prune_factor, const TXB_CTX *const txb_ctx, + int reduced_tx_set_used) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; int tx_type; -- GitLab From 94dbb8911d656e2525044bc996d0f8fb5c40176a Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 29 Mar 2024 13:59:43 -0700 Subject: [PATCH 031/391] encode_strategy.c: remove unused setup_mi() The reference to this function was removed in: 65fedadcfc Simplify temporal filtering preparation Bug: aomedia:3416 Change-Id: I4dcd2893744d8a926a4f19dcf20b623063cdb453 --- av1/encoder/encode_strategy.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c index 622fd64a67..db77dc0e3c 100644 --- a/av1/encoder/encode_strategy.c +++ b/av1/encoder/encode_strategy.c @@ -712,20 +712,6 @@ int av1_get_refresh_frame_flags( } #if !CONFIG_REALTIME_ONLY -void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) { - AV1_COMMON *const cm = &cpi->common; - const int num_planes = av1_num_planes(cm); - MACROBLOCK *const x = &cpi->td.mb; - MACROBLOCKD *const xd = &x->e_mbd; - - av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size); - - av1_setup_block_planes(xd, cm->seq_params->subsampling_x, - cm->seq_params->subsampling_y, num_planes); - - set_mi_offsets(&cm->mi_params, xd, 0, 0); -} - // Apply temporal filtering to source frames and encode the filtered frame. 
// If the current frame does not require filtering, this function is identical // to av1_encode() except that tpl is not performed. -- GitLab From 874b7ca3adae0c408b06ce3982e8c4ad432bdd98 Mon Sep 17 00:00:00 2001 From: Rachel Barker <rachelbarker@google.com> Date: Tue, 2 Apr 2024 17:43:11 +0000 Subject: [PATCH 032/391] disflow_avx2.c: Make compatible with gcc <= 9 Per the linked bug report, the _mm256_loadu2_m128i() intrinsic was only added in gcc 10. Therefore, for compatibility with gcc 9 and earlier, we must instead use our own implementation of this intrinsic, which we call yy_loadu2_128(). Bug: aomedia:3550 Change-Id: I8a4220acaaddeb6dcdd8fd918cd386c432a56bfc --- aom_dsp/flow_estimation/x86/disflow_avx2.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/aom_dsp/flow_estimation/x86/disflow_avx2.c b/aom_dsp/flow_estimation/x86/disflow_avx2.c index e210042d6f..ad5a1bd7c6 100644 --- a/aom_dsp/flow_estimation/x86/disflow_avx2.c +++ b/aom_dsp/flow_estimation/x86/disflow_avx2.c @@ -145,7 +145,7 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, // for a total of 11 pixels. Here we load 16 pixels, but only use // the first 11. __m256i row = - _mm256_loadu2_m128i((__m128i *)(ref_row + stride), (__m128i *)ref_row); + yy_loadu2_128((__m128i *)(ref_row + stride), (__m128i *)ref_row); // Expand pixels to int16s // We must use unpacks here, as we have one row in each 128-bit lane @@ -273,8 +273,8 @@ static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, // Loop setup: Load the first two rows (of 10 input rows) and apply // the horizontal parts of the two filters - __m256i row_m1_0 = _mm256_loadu2_m128i((__m128i *)(src - 1), - (__m128i *)(src - src_stride - 1)); + __m256i row_m1_0 = + yy_loadu2_128((__m128i *)(src - 1), (__m128i *)(src - src_stride - 1)); __m256i row_m1_0_a = _mm256_unpacklo_epi8(row_m1_0, zero); __m256i row_m1_0_b = _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 1), zero); @@ -293,8 +293,8 @@ static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { // Load rows (i+1, i+2) and apply both horizontal filters const __m256i row_p1_p2 = - _mm256_loadu2_m128i((__m128i *)(src + (i + 2) * src_stride - 1), - (__m128i *)(src + (i + 1) * src_stride - 1)); + yy_loadu2_128((__m128i *)(src + (i + 2) * src_stride - 1), + (__m128i *)(src + (i + 1) * src_stride - 1)); const __m256i row_p1_p2_a = _mm256_unpacklo_epi8(row_p1_p2, zero); const __m256i row_p1_p2_b = _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 1), zero); -- GitLab From 879d14159441796c92f3bbba7f8965e1bcf320ca Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 29 Mar 2024 14:02:48 -0700 Subject: [PATCH 033/391] cnn{,_avx2}.c: make some functions static This quiets some -Wmissing-prototypes warnings. 
Bug: aomedia:3416 Change-Id: Ifadbf15b5ee30636f89fe56ee09a40b0f3c3f362 --- av1/encoder/cnn.c | 10 ++++++---- av1/encoder/x86/cnn_avx2.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/av1/encoder/cnn.c b/av1/encoder/cnn.c index 598b362753..b019ace685 100644 --- a/av1/encoder/cnn.c +++ b/av1/encoder/cnn.c @@ -138,14 +138,16 @@ static bool concat_tensor(const TENSOR *src, TENSOR *dst) { return true; } -int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { +#ifndef NDEBUG +static int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { return (t1->width == t2->width && t1->height == t2->height); } -int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { +static int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { return (t1->channels == t2->channels && t1->width == t2->width && t1->height == t2->height); } +#endif // NDEBUG void av1_find_cnn_layer_output_size(int in_width, int in_height, const CNN_LAYER_CONFIG *layer_config, @@ -189,8 +191,8 @@ void av1_find_cnn_layer_output_size(int in_width, int in_height, } } -void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, - int channels_per_branch[]) { +static void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, + int channels_per_branch[]) { int branch = layer_config->branch; const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { diff --git a/av1/encoder/x86/cnn_avx2.c b/av1/encoder/x86/cnn_avx2.c index ee93b3d5a0..9c26a56641 100644 --- a/av1/encoder/x86/cnn_avx2.c +++ b/av1/encoder/x86/cnn_avx2.c @@ -466,7 +466,7 @@ static INLINE void cnn_convolve_no_maxpool_padding_valid_layer2_avx2( // As per the layer config set by av1_intra_mode_cnn_partition_cnn_config, // the filter_width and filter_height are equal to 2 for layer >= 1. So // convolution happens at 2x2 for layer >= 1. -void cnn_convolve_no_maxpool_padding_valid_2x2_avx2( +static void cnn_convolve_no_maxpool_padding_valid_2x2_avx2( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int channel_step) { -- GitLab From 04d6253b0c1689f49563a6da9ce047bd3e1b584f Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 2 Apr 2024 18:06:24 +0000 Subject: [PATCH 034/391] Move force_zeromv_skip logic up in nonrd_pickmode Move the skip logic for force_zeromv_skip_for_blk further up in the nonrd_pickmode, to avoid possible conflict with the sb_me_block feature, which has an early exit that conflicts with force_zeromv_skip. Currently this has no effect, but change it now in case the sb_me_block feature is modified. Bitexact on speed 6-11 for screen, with neutral IC speedup. Change-Id: I94160ac6c6dd2b7a4025c20bcf62d2c9e97eff65 --- av1/encoder/nonrd_pickmode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 57c74f66d5..ff8f6af564 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -2361,6 +2361,15 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( (*this_mode != GLOBALMV || *ref_frame != LAST_FRAME)) return true; + // Skip mode for some modes and reference frames when + // force_zeromv_skip_for_blk flag is true. 
+ if (x->force_zeromv_skip_for_blk && + ((!(*this_mode == NEARESTMV && + search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && + *this_mode != GLOBALMV) || + *ref_frame != LAST_FRAME)) + return true; + if (x->sb_me_block && *ref_frame == LAST_FRAME) { // We want to make sure to test the superblock MV: // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they @@ -2403,15 +2412,6 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( // Skip the mode if use reference frame mask flag is not set. if (!search_state->use_ref_frame_mask[*ref_frame]) return true; - // Skip mode for some modes and reference frames when - // force_zeromv_skip_for_blk flag is true. - if (x->force_zeromv_skip_for_blk && - ((!(*this_mode == NEARESTMV && - search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && - *this_mode != GLOBALMV) || - *ref_frame != LAST_FRAME)) - return true; - // Skip compound mode based on variance of previously evaluated single // reference modes. if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred && -- GitLab From 7aa2edc2b09f98c32820923d813fd73eb23b5861 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Wed, 3 Apr 2024 20:08:16 +0000 Subject: [PATCH 035/391] Fix integer overflows in calc of stride_in_bytes Fix unsigned integer overflows in the calculation of stride_in_bytes in img_alloc_helper() when d_w is huge. Change the type of stride_in_bytes from unsigned int to int because it will be assigned to img->stride[AOM_PLANE_Y], which is of the int type. Test: cmake ../aom -G Ninja -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug \ -DSANITIZE=unsigned-integer-overflow ninja ./test_libaom --gtest_filter=AomImageTest.AomImgAllocHugeWidth Bug: chromium:332382766 Change-Id: Iaccb83bcd13ddc3ea5e6f01da91bb01215ddb461 --- aom/src/aom_image.c | 15 ++++++++------- test/aom_image_test.cc | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c index 3b1c33d056..b68dc4c8fd 100644 --- a/aom/src/aom_image.c +++ b/aom/src/aom_image.c @@ -36,8 +36,7 @@ static aom_image_t *img_alloc_helper( /* NOTE: In this function, bit_depth is either 8 or 16 (if * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12. */ - unsigned int h, w, s, xcs, ycs, bps, bit_depth; - unsigned int stride_in_bytes; + unsigned int h, w, xcs, ycs, bps, bit_depth; if (img != NULL) memset(img, 0, sizeof(aom_image_t)); @@ -108,9 +107,11 @@ static aom_image_t *img_alloc_helper( w = align_image_dimension(d_w, xcs, size_align); h = align_image_dimension(d_h, ycs, size_align); - s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / bit_depth; + uint64_t s = (fmt & AOM_IMG_FMT_PLANAR) ? 
w : (uint64_t)bps * w / bit_depth; s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1); - stride_in_bytes = s * bit_depth / 8; + s = s * bit_depth / 8; + if (s > INT_MAX) goto fail; + const int stride_in_bytes = (int)s; /* Allocate the new image */ if (!img) { @@ -232,7 +233,7 @@ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, img->planes[AOM_PLANE_Y] = data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y]; - data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y]; + data += ((size_t)img->h + 2 * border) * img->stride[AOM_PLANE_Y]; unsigned int uv_border_h = border >> img->y_chroma_shift; unsigned int uv_x = x >> img->x_chroma_shift; @@ -244,14 +245,14 @@ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_U]; img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; } else { img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_V]; img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; diff --git a/test/aom_image_test.cc b/test/aom_image_test.cc index 03f4373f35..62f3c12747 100644 --- a/test/aom_image_test.cc +++ b/test/aom_image_test.cc @@ -70,3 +70,39 @@ TEST(AomImageTest, AomImgAllocNv12) { EXPECT_EQ(img.planes[AOM_PLANE_V], nullptr); aom_img_free(&img); } + +TEST(AomImageTest, AomImgAllocHugeWidth) { + // The stride (0x80000000 * 2) would overflow unsigned int. + aom_image_t *image = + aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The stride (0x80000000) would overflow int. + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x7ffffffe, 1, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_NV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_YV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 285245883, 2, 1); + if (image) { + aom_img_free(image); + } +} -- GitLab From b53a6307466418464605c858a5f5e2347ce805b0 Mon Sep 17 00:00:00 2001 From: Jingning Han <jingning@google.com> Date: Mon, 1 Apr 2024 12:13:24 -0700 Subject: [PATCH 036/391] Increase filter buffer size to silence warnings ASan reports a global-buffer-overflow on the address of the filter coefficient buffer. 
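The out-of-bounds read comes from a 128-bit load taken at an offset into the tap table (see the follow-up comment patch below): loading eight int16_t taps starting at gauss_filter + 1 touches elements [1..8], i.e. two bytes past an 8-element array, so the table is padded to 9 entries. A sketch of the access pattern (illustrative helper name):

    #include <emmintrin.h>
    #include <stdint.h>
    // Reads taps[1]..taps[8]; with only 8 elements (16 bytes) the last two
    // bytes of this load would be out of bounds.
    static __m128i load_taps_offset1(const int16_t taps[9]) {
      return _mm_loadu_si128((const __m128i *)(taps + 1));
    }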
BUG=b/324147074 Change-Id: Idf6db1394f2df26e6802f13073d34cd97a7ce8dc --- av1/encoder/tune_vmaf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c index 91db3db726..847571f896 100644 --- a/av1/encoder/tune_vmaf.c +++ b/av1/encoder/tune_vmaf.c @@ -247,7 +247,7 @@ static AOM_INLINE void unsharp(const AV1_COMP *const cpi, // 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128, // all co-efficients must be even. -DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52, +DECLARE_ALIGNED(16, static const int16_t, gauss_filter[9]) = { 0, 8, 30, 52, 30, 8, 0, 0 }; static AOM_INLINE void gaussian_blur(const int bit_depth, const YV12_BUFFER_CONFIG *source, -- GitLab From 19924274ab14db1a1132a8128e5ae7b1227743c5 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 4 Apr 2024 10:50:54 -0700 Subject: [PATCH 037/391] Add missing header for EBUSY on mingw The `error: use of undeclared identifier 'EBUSY'` in vpx_util/vpx_pthread.h was found in Mozilla's bug 1886318 [1]. This patch addresses the issue by adding the `<errno.h>` header to introduce the `EBUSY` identifier, resolving the problem. This patch is based on the change in libvpx: 0752960c6 Add missing header for EBUSY on mingw https://chromium-review.googlesource.com/c/webm/libvpx/+/5425372 [1] https://bugzilla.mozilla.org/show_bug.cgi?id=1886318#c1 Change-Id: I90bd6b605613094fe2decfa48831f98941108a6c --- aom_util/aom_pthread.h | 1 + 1 file changed, 1 insertion(+) diff --git a/aom_util/aom_pthread.h b/aom_util/aom_pthread.h index 827b9c2316..1a97a0a9db 100644 --- a/aom_util/aom_pthread.h +++ b/aom_util/aom_pthread.h @@ -28,6 +28,7 @@ extern "C" { #define NOMINMAX #undef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#include <errno.h> // NOLINT #include <process.h> // NOLINT #include <stddef.h> // NOLINT #include <windows.h> // NOLINT -- GitLab From 4c76a0d534d0e3a787d94d6803665f7383928798 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 4 Apr 2024 11:51:23 -0700 Subject: [PATCH 038/391] Rename JobInfo as GlobalMotionJobInfo The JobInfo struct defined in global_motion.h is renamed GlobalMotionJobInfo. 
Change-Id: I607d2d647517f80a36eba59d6365d1a625c57dce --- av1/encoder/ethread.c | 6 +++--- av1/encoder/global_motion.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c index 755535ba51..3cc8e20f97 100644 --- a/av1/encoder/ethread.c +++ b/av1/encoder/ethread.c @@ -2520,7 +2520,7 @@ void av1_tf_do_filtering_mt(AV1_COMP *cpi) { static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, int cur_dir) { GlobalMotionInfo *gm_info = &cpi->gm_info; - JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; int total_refs = gm_info->num_ref_frames[cur_dir]; int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir]; @@ -2551,7 +2551,7 @@ static int gm_mt_worker_hook(void *arg1, void *unused) { AV1_COMP *cpi = thread_data->cpi; GlobalMotionInfo *gm_info = &cpi->gm_info; AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync; - JobInfo *job_info = &gm_sync->job_info; + GlobalMotionJobInfo *job_info = &gm_sync->job_info; int thread_id = thread_data->thread_id; GlobalMotionData *gm_thread_data = &thread_data->td->gm_data; #if CONFIG_MULTITHREAD @@ -2689,7 +2689,7 @@ static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) { // Implements multi-threading for global motion. void av1_global_motion_estimation_mt(AV1_COMP *cpi) { - JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; av1_zero(*job_info); diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h index de46a0e1f2..dc18940752 100644 --- a/av1/encoder/global_motion.h +++ b/av1/encoder/global_motion.h @@ -58,11 +58,11 @@ typedef struct { // next_frame_to_process[i] will hold the count of next reference frame to be // processed in the direction 'i'. int8_t next_frame_to_process[MAX_DIRECTIONS]; -} JobInfo; +} GlobalMotionJobInfo; typedef struct { // Data related to assigning jobs for global motion multi-threading. - JobInfo job_info; + GlobalMotionJobInfo job_info; #if CONFIG_MULTITHREAD // Mutex lock used while dispatching jobs. -- GitLab From a30448b2b1f5faa58913033a4529b07666f0da34 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Thu, 4 Apr 2024 14:06:32 -0700 Subject: [PATCH 039/391] Fix keyframe logic for use_fast_part_feature for svc This CL also fixes an issue with spatial layers on key frames for the rtc_sf->use_fast_fixed_part feature: avoid entering the fixed partition for layers whose base is a key frame. 
Change-Id: I2adc953d8ac6467a7b5dd47b75f1a418b7f32006 --- av1/encoder/encodeframe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index a9214f77c2..07382eb6cc 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -537,7 +537,9 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, // Set the partition if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || (sf->rt_sf.use_fast_fixed_part && x->sb_force_fixed_part == 1 && - !frame_is_intra_only(cm))) { + (!frame_is_intra_only(cm) && + (!cpi->ppi->use_svc || + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)))) { // set a fixed-size partition av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size; -- GitLab From cb359e533a8f81b65f50d07f936d8e1146e19618 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Wed, 3 Apr 2024 14:41:12 -0700 Subject: [PATCH 040/391] rtc: Increase loopfilter strength for speed 11 This is for speed 11 video mode, vga, where the speed feature rt_sf->use_fast_fixed_part is used. Increase the loopfilter strength to reduce artifacts. Stats change for speed 11 rtc (480/360p only): avg/ovr/ssim, IC speedup -2.98/-2.98/-2.57, -0.325 Change-Id: I59bb2594168e93687e80a2b60b4848c105fa066e --- av1/encoder/picklpf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c index a504535028..ce0357163d 100644 --- a/av1/encoder/picklpf.c +++ b/av1/encoder/picklpf.c @@ -257,6 +257,8 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, inter_frame_multiplier = inter_frame_multiplier << 1; else if (cpi->rc.frame_source_sad > 50000) inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1); + } else if (cpi->sf.rt_sf.use_fast_fixed_part) { + inter_frame_multiplier = inter_frame_multiplier << 1; } // These values were determined by linear fitting the result of the // searched level for 8 bit depth: -- GitLab From e937f26e1e6cd69401070ba68a1e73857d3e1132 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 15 Feb 2024 15:44:28 -0800 Subject: [PATCH 041/391] Define the MAX_NUM_THREADS macro in enc_enums.h The MAX_NUM_THREADS macro is unrelated to the AVxWorkerInterface, so it doesn't need to be defined in aom_util/aom_thread.h. Move the definition of the MAX_NUM_THREADS macro to av1/encoder/enc_enums.h. Note: The ideal place to define the MAX_NUM_THREADS macro is av1/encoder/ethread.h. Unfortunately av1/encoder/ethread.h must be included after av1/encoder/encoder.h, but av1/encoder/encoder.h uses the MAX_NUM_THREADS macro. I could not fix the mutual dependency. 
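The shape of the cycle, sketched with hypothetical headers (a.h playing the role of encoder.h, b.h of ethread.h): the macro must live in a leaf header that both can include first.

    /* leaf.h -- depends on nothing, so it can be included from anywhere */
    #define MAX_NUM_THREADS 64

    /* a.h -- uses the macro in its type declarations */
    #include "leaf.h"
    typedef struct { int per_thread_data[MAX_NUM_THREADS]; } EncoderContext;

    /* b.h -- needs EncoderContext, so it must be included after a.h and
       therefore cannot be the macro's home */
    #include "a.h"
    void run_workers(EncoderContext *ctx);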
Bug: aomedia:3554 Change-Id: I6add9d8a920fe7a0095b79aeebd2abe4a4f625be --- aom_util/aom_thread.h | 2 -- av1/av1_cx_iface.c | 1 + av1/encoder/enc_enums.h | 4 ++++ av1/encoder/encoder.h | 2 +- av1/encoder/ethread.c | 1 + av1/encoder/global_motion.h | 3 +-- test/ethread_test.cc | 5 ++--- 7 files changed, 10 insertions(+), 8 deletions(-) diff --git a/aom_util/aom_thread.h b/aom_util/aom_thread.h index 92e162f121..80ed314752 100644 --- a/aom_util/aom_thread.h +++ b/aom_util/aom_thread.h @@ -21,8 +21,6 @@ extern "C" { #endif -#define MAX_NUM_THREADS 64 - // State of the worker thread object typedef enum { AVX_WORKER_STATUS_NOT_OK = 0, // object is unusable diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 2b6b1504e6..39c03c9ecb 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -32,6 +32,7 @@ #include "av1/common/enums.h" #include "av1/common/scale.h" #include "av1/encoder/bitstream.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/encoder_utils.h" diff --git a/av1/encoder/enc_enums.h b/av1/encoder/enc_enums.h index 20cefa16a5..0a8b0f258a 100644 --- a/av1/encoder/enc_enums.h +++ b/av1/encoder/enc_enums.h @@ -12,10 +12,14 @@ #ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_ #define AOM_AV1_ENCODER_ENC_ENUMS_H_ +#include "aom_ports/mem.h" + #ifdef __cplusplus extern "C" { #endif +#define MAX_NUM_THREADS 64 + // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code. enum { diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h index 4de5d426ce..a919bd906a 100644 --- a/av1/encoder/encoder.h +++ b/av1/encoder/encoder.h @@ -37,6 +37,7 @@ #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" @@ -74,7 +75,6 @@ #endif #include "aom/internal/aom_codec_internal.h" -#include "aom_util/aom_thread.h" #ifdef __cplusplus extern "C" { diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c index 3cc8e20f97..1d0092a5ed 100644 --- a/av1/encoder/ethread.c +++ b/av1/encoder/ethread.c @@ -19,6 +19,7 @@ #include "av1/encoder/allintra_vis.h" #include "av1/encoder/bitstream.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encoder.h" diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h index dc18940752..2645f93e3c 100644 --- a/av1/encoder/global_motion.h +++ b/av1/encoder/global_motion.h @@ -14,9 +14,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/flow_estimation/flow_estimation.h" -#include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" -#include "aom_util/aom_thread.h" +#include "av1/encoder/enc_enums.h" #ifdef __cplusplus extern "C" { diff --git a/test/ethread_test.cc b/test/ethread_test.cc index ce45394eb8..415f5de269 100644 --- a/test/ethread_test.cc +++ b/test/ethread_test.cc @@ -18,6 +18,7 @@ #include "test/util.h" #include "test/y4m_video_source.h" #include "test/yuv_video_source.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/firstpass.h" namespace { @@ -411,9 +412,7 @@ class AVxEncoderThreadTest const std::vector<size_t> ref_size_enc, const std::vector<std::string> ref_md5_enc, const std::vector<std::string> ref_md5_dec) { - // This value should be kept the same as MAX_NUM_THREADS - // in 
aom_thread.h - cfg_.g_threads = 64; + cfg_.g_threads = MAX_NUM_THREADS; ASSERT_NO_FATAL_FAILURE(RunLoop(video)); std::vector<size_t> multi_thr_max_row_mt_size_enc; std::vector<std::string> multi_thr_max_row_mt_md5_enc; -- GitLab From d3d5e945a147cc184efee39093414ca3d72cd724 Mon Sep 17 00:00:00 2001 From: Jingning Han <jingning@google.com> Date: Thu, 4 Apr 2024 11:22:43 -0700 Subject: [PATCH 042/391] Adding comment to gaussian filter buffer alloc Bug: b:324147074 Change-Id: I1a2ca0f17f171ef1b4d447ef8421da3c765df28d --- av1/encoder/tune_vmaf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c index 847571f896..fdb7c77ebc 100644 --- a/av1/encoder/tune_vmaf.c +++ b/av1/encoder/tune_vmaf.c @@ -247,6 +247,8 @@ static AOM_INLINE void unsharp(const AV1_COMP *const cpi, // 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128, // all co-efficients must be even. +// The array is of size 9 to allow passing gauss_filter + 1 to +// _mm_loadu_si128() in prepare_coeffs_6t(). DECLARE_ALIGNED(16, static const int16_t, gauss_filter[9]) = { 0, 8, 30, 52, 30, 8, 0, 0 }; static AOM_INLINE void gaussian_blur(const int bit_depth, -- GitLab From 6bd02548ed8f711504f660bdc829f9ba07af6d22 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Fri, 5 Apr 2024 02:51:35 +0000 Subject: [PATCH 043/391] Revert "Move force_zeromv_skip logic up in nonrd_pickmode" This reverts commit 04d6253b0c1689f49563a6da9ce047bd3e1b584f. Reason for revert: SVC3TL1SLScreen valgrind failure. Will look into it later. Bug: aomedia:3555 Change-Id: Ibc60781237e05cd1c47b7f3620ecbf86512128c2 --- av1/encoder/nonrd_pickmode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index ff8f6af564..57c74f66d5 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -2361,15 +2361,6 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( (*this_mode != GLOBALMV || *ref_frame != LAST_FRAME)) return true; - // Skip mode for some modes and reference frames when - // force_zeromv_skip_for_blk flag is true. - if (x->force_zeromv_skip_for_blk && - ((!(*this_mode == NEARESTMV && - search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && - *this_mode != GLOBALMV) || - *ref_frame != LAST_FRAME)) - return true; - if (x->sb_me_block && *ref_frame == LAST_FRAME) { // We want to make sure to test the superblock MV: // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they @@ -2412,6 +2403,15 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( // Skip the mode if use reference frame mask flag is not set. if (!search_state->use_ref_frame_mask[*ref_frame]) return true; + // Skip mode for some modes and reference frames when + // force_zeromv_skip_for_blk flag is true. + if (x->force_zeromv_skip_for_blk && + ((!(*this_mode == NEARESTMV && + search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && + *this_mode != GLOBALMV) || + *ref_frame != LAST_FRAME)) + return true; + // Skip compound mode based on variance of previously evaluated single // reference modes. if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred && -- GitLab From f53607cb4194cdb3152677fecf4e024dcb128ef5 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 21 Mar 2024 10:18:01 +0000 Subject: [PATCH 044/391] Add SVE implementation of av1_compute_stats Add SVE implementation of av1_compute_stats as well as the corresponding tests. 
This gives between 30% and 45% uplift over the Neon implementation. Change-Id: I45ba993e85d9b43ba1beefcc3f75a6b1e44ba280 --- av1/av1.cmake | 1 + av1/common/av1_rtcd_defs.pl | 2 +- av1/encoder/arm/neon/pickrst_sve.c | 590 +++++++++++++++++++++++++++++ test/wiener_test.cc | 6 + 4 files changed, 598 insertions(+), 1 deletion(-) create mode 100644 av1/encoder/arm/neon/pickrst_sve.c diff --git a/av1/av1.cmake b/av1/av1.cmake index 32645f6065..c1206e9d68 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -375,6 +375,7 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD list(APPEND AOM_AV1_ENCODER_INTRIN_SVE "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c" + "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_sve.c" "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_sve.c") list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index c0831330d1..8bf4b0709c 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -458,7 +458,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats"; - specialize qw/av1_compute_stats sse4_1 avx2 neon/; + specialize qw/av1_compute_stats sse4_1 avx2 neon sve/; add_proto qw/void av1_calc_proj_params/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params"; specialize qw/av1_calc_proj_params sse4_1 avx2 neon/; add_proto qw/int64_t av1_lowbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; diff --git a/av1/encoder/arm/neon/pickrst_sve.c b/av1/encoder/arm/neon/pickrst_sve.c new file mode 100644 index 0000000000..a519ecc5f5 --- /dev/null +++ b/av1/encoder/arm/neon/pickrst_sve.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <arm_sve.h> +#include <string.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE uint8_t find_average_sve(const uint8_t *src, int src_stride, + int width, int height) { + uint32x4_t avg_u32 = vdupq_n_u32(0); + uint8x16_t ones = vdupq_n_u8(1); + + // Use a predicate to compute the last columns. 
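+  // svwhilelt_b8_u32(0, n) enables exactly the first n lanes of the +  // predicate, so the predicated tail load below fetches only the last +  // width % 16 columns and zeroes the inactive lanes before the dot-product.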
+ svbool_t pattern = svwhilelt_b8_u32(0, width % 16); + + int h = height; + do { + int j = width; + const uint8_t *src_ptr = src; + while (j >= 16) { + uint8x16_t s = vld1q_u8(src_ptr); + avg_u32 = vdotq_u32(avg_u32, s, ones); + + j -= 16; + src_ptr += 16; + } + uint8x16_t s_end = svget_neonq_u8(svld1_u8(pattern, src_ptr)); + avg_u32 = vdotq_u32(avg_u32, s_end, ones); + + src += src_stride; + } while (--h != 0); + return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height)); +} + +static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, + int16_t *buf_avg, int buf_avg_stride, + int width, int height, + int downsample_factor) { + uint8x8_t avg_u8 = vdup_n_u8(avg); + + // Use a predicate to compute the last columns. + svbool_t pattern = svwhilelt_b8_u32(0, width % 8); + + uint8x8_t avg_end = vget_low_u8(svget_neonq_u8(svdup_n_u8_z(pattern, avg))); + + do { + int j = width; + const uint8_t *buf_ptr = buf; + int16_t *buf_avg_ptr = buf_avg; + while (j >= 8) { + uint8x8_t d = vld1_u8(buf_ptr); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8))); + + j -= 8; + buf_ptr += 8; + buf_avg_ptr += 8; + } + uint8x8_t d_end = vget_low_u8(svget_neonq_u8(svld1_u8(pattern, buf_ptr))); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d_end, avg_end))); + + buf += buf_stride; + buf_avg += buf_avg_stride; + height -= downsample_factor; + } while (height > 0); +} + +static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp, + const int wiener_win2, const int scale) { + for (int i = 0; i < wiener_win2 - 2; i = i + 2) { + // Transpose the first 2x2 square. It needs a special case as the element + // of the bottom left is on the diagonal. + int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1); + int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1); + + int64x2_t tr_row = aom_vtrn2q_s64(row0, row1); + + vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0)); + vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row); + + // Transpose and store all the remaining 2x2 squares of the line. + for (int j = i + 3; j < wiener_win2; j = j + 2) { + row0 = vld1q_s64(H_tmp + i * wiener_win2 + j); + row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j); + + int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1); + int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1); + + vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0); + vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1); + } + } + for (int i = 0; i < wiener_win2 * wiener_win2; i++) { + H[i] += H_tmp[i] * scale; + } +} + +// Transpose the matrix that has just been computed and accumulate it in M. +static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn, + const int wiener_win, int scale) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *M++ += (int64_t)(M_trn[tr_idx] * scale); + } + } +} + +// Swap each half of the dgd vectors so that we can accumulate the result of +// the dot-products directly in the destination matrix. 
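+// For example, dgd0 = { a0, ..., a7 } and dgd1 = { b0, ..., b7 } become
+// { a0, ..., a3, b0, ..., b3 } and { a4, ..., a7, b4, ..., b7 }.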
+static INLINE int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { + int16x8_t dgd_trn0 = vreinterpretq_s16_s64( + vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + int16x8_t dgd_trn1 = vreinterpretq_s16_s64( + vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + + return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 }; +} + +static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], + int64_t *M, int row) { + const int wiener_win = 5; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, cross_corr23); + + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]); + M[row * wiener_win + 4] += vaddvq_s64(m4); +} + +static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], + int64_t *M, int row) { + const int wiener_win = 7; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, cross_corr23); + + int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]); + + int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0); + cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 4, cross_corr45); + + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]); + M[row * wiener_win + 6] += vaddvq_s64(m6); +} + +static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, + const int wiener_win, + const int wiener_win2) { + for (int row0 = 0; row0 < wiener_win; row0++) { + for (int row1 = row0; row1 < wiener_win; row1++) { + int auto_cov_idx = + (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; + + int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]); + H[auto_cov_idx] += vaddvq_s64(auto_cov); + } + } +} + +static INLINE void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 5; col0++) { + int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5); + + int64x2_t h01 = vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, 
dgd23.val[0], dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]); + H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4); + } +} + +static INLINE void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 7; col0++) { + int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7); + + int64x2_t h01 = vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]); + + int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0); + auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 4, auto_cov45); + + int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]); + H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6); + } +} + +// This function computes two matrices: the cross-correlation between the src +// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). +// +// M is of size 7 * 7. It needs to be filled such that multiplying one element +// from src with each element of a row of the wiener window will fill one +// column of M. However this is not very convenient in terms of memory +// accesses, as it means we do contiguous loads of dgd but strided stores to M. +// As a result, we use an intermediate matrix M_trn which is instead filled +// such that one row of the wiener window gives one row of M_trn. Once fully +// computed, M_trn is then transposed to return M. +// +// H is of size 49 * 49. It is filled by multiplying every pair of elements of +// the wiener window together. Since it is a symmetric matrix, we only compute +// the upper triangle, and then copy it down to the lower one. Here we fill it +// by taking each different pair of columns, and multiplying all the elements of +// the first one with all the elements of the second one, with a special case +// when multiplying a column by itself. +static INLINE void compute_stats_win7_sve(int16_t *dgd_avg, int dgd_avg_stride, + int16_t *src_avg, int src_avg_stride, + int width, int height, int64_t *M, + int64_t *H, int downsample_factor) { + const int wiener_win = 7; + const int wiener_win2 = wiener_win * wiener_win; + + // Use a predicate to compute the last columns of the block for H. + svbool_t pattern = svwhilelt_b16_u32(0, width % 8); + + // Use intermediate matrices for H and M to perform the computation, they + // will be accumulated into the original H and M at the end. + int64_t M_trn[49]; + memset(M_trn, 0, sizeof(M_trn)); + + int64_t H_tmp[49 * 49]; + memset(H_tmp, 0, sizeof(H_tmp)); + + do { + // Cross-correlation (M). 
+ for (int row = 0; row < wiener_win; row++) { + int j = 0; + while (j < width) { + int16x8_t dgd[7]; + load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], + &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]); + int16x8_t s = vld1q_s16(src_avg + j); + + // Compute all the elements of one row of M. + compute_M_one_row_win7(s, dgd, M_trn, row); + + j += 8; + } + } + + // Auto-covariance (H). + int j = 0; + while (j <= width - 8) { + for (int col0 = 0; col0 < wiener_win; col0++) { + int16x8_t dgd0[7]; + load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], + &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]); + + // Perform computation of the first column with itself (28 elements). + // For the first column this will fill the upper triangle of the 7x7 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 7x7 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[7]; + load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); + + // Compute all elements from the combination of both columns (49 + // elements). + compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); + } + } + j += 8; + } + + if (j < width) { + // Process remaining columns using a predicate to discard excess elements. + for (int col0 = 0; col0 < wiener_win; col0++) { + // Load first column. + int16x8_t dgd0[7]; + dgd0[0] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); + dgd0[1] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); + dgd0[2] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); + dgd0[3] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); + dgd0[4] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); + dgd0[5] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0)); + dgd0[6] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0)); + + // Perform computation of the first column with itself (28 elements). + // For the first column this will fill the upper triangle of the 7x7 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 7x7 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[7]; + load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); + + // Compute all elements from the combination of both columns (49 + // elements). + compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); + } + } + } + dgd_avg += downsample_factor * dgd_avg_stride; + src_avg += src_avg_stride; + } while (--height != 0); + + // Transpose M_trn. + acc_transpose_M(M, M_trn, 7, downsample_factor); + + // Copy upper triangle of H in the lower one. 
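+  // Note: the helper also applies the downsample-factor scaling while
+  // accumulating H_tmp into H.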
+ copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); +} + +// This function computes two matrices: the cross-correlation between the src +// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). +// +// M is of size 5 * 5. It needs to be filled such that multiplying one element +// from src with each element of a row of the wiener window will fill one +// column of M. However this is not very convenient in terms of memory +// accesses, as it means we do contiguous loads of dgd but strided stores to M. +// As a result, we use an intermediate matrix M_trn which is instead filled +// such that one row of the wiener window gives one row of M_trn. Once fully +// computed, M_trn is then transposed to return M. +// +// H is of size 25 * 25. It is filled by multiplying every pair of elements of +// the wiener window together. Since it is a symmetric matrix, we only compute +// the upper triangle, and then copy it down to the lower one. Here we fill it +// by taking each different pair of columns, and multiplying all the elements of +// the first one with all the elements of the second one, with a special case +// when multiplying a column by itself. +static INLINE void compute_stats_win5_sve(int16_t *dgd_avg, int dgd_avg_stride, + int16_t *src_avg, int src_avg_stride, + int width, int height, int64_t *M, + int64_t *H, int downsample_factor) { + const int wiener_win = 5; + const int wiener_win2 = wiener_win * wiener_win; + + // Use a predicate to compute the last columns of the block for H. + svbool_t pattern = svwhilelt_b16_u32(0, width % 8); + + // Use intermediate matrices for H and M to perform the computation, they + // will be accumulated into the original H and M at the end. + int64_t M_trn[25]; + memset(M_trn, 0, sizeof(M_trn)); + + int64_t H_tmp[25 * 25]; + memset(H_tmp, 0, sizeof(H_tmp)); + + do { + // Cross-correlation (M). + for (int row = 0; row < wiener_win; row++) { + int j = 0; + while (j < width) { + int16x8_t dgd[5]; + load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], + &dgd[2], &dgd[3], &dgd[4]); + int16x8_t s = vld1q_s16(src_avg + j); + + // Compute all the elements of one row of M. + compute_M_one_row_win5(s, dgd, M_trn, row); + + j += 8; + } + } + + // Auto-covariance (H). + int j = 0; + while (j <= width - 8) { + for (int col0 = 0; col0 < wiener_win; col0++) { + // Load first column. + int16x8_t dgd0[5]; + load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], + &dgd0[2], &dgd0[3], &dgd0[4]); + + // Perform computation of the first column with itself (15 elements). + // For the first column this will fill the upper triangle of the 5x5 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 5x5 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[5]; + load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4]); + + // Compute all elements from the combination of both columns (25 + // elements). + compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); + } + } + j += 8; + } + + // Process remaining columns using a predicate to discard excess elements. 
+ if (j < width) { + for (int col0 = 0; col0 < wiener_win; col0++) { + int16x8_t dgd0[5]; + dgd0[0] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); + dgd0[1] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); + dgd0[2] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); + dgd0[3] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); + dgd0[4] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); + + // Perform computation of the first column with itself (15 elements). + // For the first column this will fill the upper triangle of the 5x5 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 5x5 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[5]; + load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4]); + + // Compute all elements from the combination of both columns (25 + // elements). + compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); + } + } + } + dgd_avg += downsample_factor * dgd_avg_stride; + src_avg += src_avg_stride; + } while (--height != 0); + + // Transpose M_trn. + acc_transpose_M(M, M_trn, 5, downsample_factor); + + // Copy upper triangle of H in the lower one. + copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); +} + +void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); + + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = wiener_win >> 1; + const int32_t width = h_end - h_start; + const int32_t height = v_end - v_start; + const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start]; + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + memset(M, 0, sizeof(*M) * wiener_win * wiener_win); + + const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height); + const int downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + + // dgd_avg and src_avg have been memset to zero before calling this + // function, so round up the stride to the next multiple of 8 so that we + // don't have to worry about a tail loop when computing M. + const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8; + const int src_avg_stride = (width & ~7) + 8; + + // Compute (dgd - avg) and store it in dgd_avg. + // The wiener window will slide along the dgd frame, centered on each pixel. + // For the top left pixel and all the pixels on the side of the frame this + // means half of the window will be outside of the frame. As such the actual + // buffer that we need to subtract the avg from will be 2 * wiener_halfwin + // wider and 2 * wiener_halfwin higher than the original dgd buffer. 
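+  // For example, with wiener_win == 7 we have wiener_halfwin == 3, so the
+  // buffer is (width + 6) x (height + 6).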
+  const int vert_offset = v_start - wiener_halfwin;
+  const int horiz_offset = h_start - wiener_halfwin;
+  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
+                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);
+
+  // Compute (src - avg), downsample if necessary and store in src_avg.
+  const uint8_t *src_start = src + h_start + v_start * src_stride;
+  compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg,
+                  src_avg_stride, width, height, downsample_factor);
+
+  const int downsample_height = height / downsample_factor;
+
+  // Since the height is not necessarily a multiple of the downsample factor,
+  // the last line of src will be scaled according to how many rows remain.
+  const int downsample_remainder = height % downsample_factor;
+
+  if (wiener_win == WIENER_WIN) {
+    compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
+                           width, downsample_height, M, H, downsample_factor);
+  } else {
+    compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
+                           width, downsample_height, M, H, downsample_factor);
+  }
+
+  if (downsample_remainder > 0) {
+    const int remainder_offset = height - downsample_remainder;
+    if (wiener_win == WIENER_WIN) {
+      compute_stats_win7_sve(
+          dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
+          src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
+          1, M, H, downsample_remainder);
+    } else {
+      compute_stats_win5_sve(
+          dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
+          src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
+          1, M, H, downsample_remainder);
+    }
+  }
+}
diff --git a/test/wiener_test.cc b/test/wiener_test.cc
index b995c84d8f..2886ed77df 100644
--- a/test/wiener_test.cc
+++ b/test/wiener_test.cc
@@ -397,6 +397,12 @@ INSTANTIATE_TEST_SUITE_P(NEON, WienerTest,
                          ::testing::Values(av1_compute_stats_neon));
 #endif  // HAVE_NEON
 
+#if HAVE_SVE
+
+INSTANTIATE_TEST_SUITE_P(SVE, WienerTest,
+                         ::testing::Values(av1_compute_stats_sve));
+#endif  // HAVE_SVE
+
 }  // namespace wiener_lowbd
 
 #if CONFIG_AV1_HIGHBITDEPTH
--
GitLab


From 608f9241e2d426b8817b7036457f10c336a69965 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Mon, 8 Apr 2024 10:34:11 -0700
Subject: [PATCH 045/391] rtc: Move force_zeromv_skip logic up in nonrd_pickmode

Move the skip logic for force_zeromv_skip_for_blk further up in
nonrd_pickmode, to avoid a possible conflict with the sb_me_block
feature, which has an early exit that conflicts with force_zeromv_skip.
Currently this has no effect, but change it now in case the
sb_me_block feature is modified.

We also need to move up the check on use_ref_frame_mask, otherwise the
frame_mv (in the force_zeromv_skip_for_blk check) may not be set.

This is a re-application of the patch
https://aomedia-review.googlesource.com/c/aom/+/188741, with the fix
for the valgrind issue.

Bitexact on screen, with neutral IC speedup.
Change-Id: I33d4fcb25022abf43a109e7412cf7ea6b12f053b --- av1/encoder/nonrd_pickmode.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 57c74f66d5..fdc10ded57 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -2361,6 +2361,18 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( (*this_mode != GLOBALMV || *ref_frame != LAST_FRAME)) return true; + // Skip the mode if use reference frame mask flag is not set. + if (!search_state->use_ref_frame_mask[*ref_frame]) return true; + + // Skip mode for some modes and reference frames when + // force_zeromv_skip_for_blk flag is true. + if (x->force_zeromv_skip_for_blk && + ((!(*this_mode == NEARESTMV && + search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && + *this_mode != GLOBALMV) || + *ref_frame != LAST_FRAME)) + return true; + if (x->sb_me_block && *ref_frame == LAST_FRAME) { // We want to make sure to test the superblock MV: // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they @@ -2400,18 +2412,6 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( mi->ref_frame[0] = *ref_frame; mi->ref_frame[1] = *ref_frame2; - // Skip the mode if use reference frame mask flag is not set. - if (!search_state->use_ref_frame_mask[*ref_frame]) return true; - - // Skip mode for some modes and reference frames when - // force_zeromv_skip_for_blk flag is true. - if (x->force_zeromv_skip_for_blk && - ((!(*this_mode == NEARESTMV && - search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && - *this_mode != GLOBALMV) || - *ref_frame != LAST_FRAME)) - return true; - // Skip compound mode based on variance of previously evaluated single // reference modes. if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred && -- GitLab From 18558f8724d204f60423e9123b0a3570840adcbc Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Tue, 2 Apr 2024 12:20:33 +0100 Subject: [PATCH 046/391] Add SVE implementation of aom_compute_flow_at_point Add SVE implementation of aom_compute_flow_at_point, as well as the corresponding tests. This gives around 20% uplift for this function over the Neon implementation. Some functions are common between the Neon and SVE implementations, so move them to a separate header file. 
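In essence, for each flow point the kernel builds a 2x2 gradient matrix
M and a residual vector b over an 8x8 patch, then iteratively refines
(u, v) by steps of M^-1 * b; the dot products that dominate this work
are what map onto the SVE SDOT instruction here.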
Change-Id: Ia7afd60626c3883b5e49250ca9b068da40ad1374 --- aom_dsp/aom_dsp.cmake | 3 + aom_dsp/aom_dsp_rtcd_defs.pl | 2 +- aom_dsp/flow_estimation/arm/disflow_neon.c | 104 +------- aom_dsp/flow_estimation/arm/disflow_neon.h | 127 ++++++++++ aom_dsp/flow_estimation/arm/disflow_sve.c | 268 +++++++++++++++++++++ test/disflow_test.cc | 5 + 6 files changed, 405 insertions(+), 104 deletions(-) create mode 100644 aom_dsp/flow_estimation/arm/disflow_neon.h create mode 100644 aom_dsp/flow_estimation/arm/disflow_sve.c diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index de987cbd23..27099d36b2 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -205,6 +205,9 @@ if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_sve.c") endif() list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 7e746e9cb9..b75bdc5a19 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -1799,7 +1799,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_compute_correlation sse4_1 avx2/; add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v"; - specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon/; + specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon sve/; } } # CONFIG_AV1_ENCODER diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.c b/aom_dsp/flow_estimation/arm/disflow_neon.c index 62729133e3..5758d2887f 100644 --- a/aom_dsp/flow_estimation/arm/disflow_neon.c +++ b/aom_dsp/flow_estimation/arm/disflow_neon.c @@ -16,36 +16,10 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/flow_estimation/arm/disflow_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { - // Check that the fractional position is in range. - // - // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. - // Mathematically, this implies that 0 <= x < 1. However, in practice it is - // possible to have x == 1 due to floating point rounding. This is fine, - // and we still interpolate correctly if we allow x = 1. - assert(0 <= x && x <= 1); - - double x2 = x * x; - double x3 = x2 * x; - kernel[0] = -0.5 * x + x2 - 0.5 * x3; - kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; - kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; - kernel[3] = -0.5 * x2 + 0.5 * x3; -} - -static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { - double kernel_dbl[4]; - get_cubic_kernel_dbl(x, kernel_dbl); - - kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); - kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); - kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); - kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); -} - // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. 
// This function returns the sum of squared pixel differences between @@ -157,82 +131,6 @@ static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, } } -static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - - // Horizontal filter, using kernel {1, 0, -1}. - const uint8_t *src_start = src - 1 * src_stride - 1; - - for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { - uint8x16_t s = vld1q_u8(src_start + i * src_stride); - uint8x8_t s0 = vget_low_u8(s); - uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); - - // Given that the kernel is {1, 0, -1} the convolution is a simple - // subtraction. - int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); - - vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); - } - - // Vertical filter, using kernel {1, 2, 1}. - // This kernel can be split into two 2-taps kernels of value {1, 1}. - // That way we need only 3 add operations to perform the convolution, one of - // which can be reused for the next line. - int16x8_t s0 = vld1q_s16(tmp); - int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); - int16x8_t sum01 = vaddq_s16(s0, s1); - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); - - int16x8_t sum12 = vaddq_s16(s1, s2); - int16x8_t sum = vaddq_s16(sum01, sum12); - - vst1q_s16(dst + i * dst_stride, sum); - - sum01 = sum12; - s1 = s2; - } -} - -static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - - // Horizontal filter, using kernel {1, 2, 1}. - // This kernel can be split into two 2-taps kernels of value {1, 1}. - // That way we need only 3 add operations to perform the convolution. - const uint8_t *src_start = src - 1 * src_stride - 1; - - for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { - uint8x16_t s = vld1q_u8(src_start + i * src_stride); - uint8x8_t s0 = vget_low_u8(s); - uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); - uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); - - uint16x8_t sum01 = vaddl_u8(s0, s1); - uint16x8_t sum12 = vaddl_u8(s1, s2); - uint16x8_t sum = vaddq_u16(sum01, sum12); - - vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); - } - - // Vertical filter, using kernel {1, 0, -1}. - // Load the whole block at once to avoid redundant loads during convolution. - int16x8_t t[10]; - load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], - &t[5], &t[6], &t[7], &t[8], &t[9]); - - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - // Given that the kernel is {1, 0, -1} the convolution is a simple - // subtraction. - int16x8_t diff = vsubq_s16(t[i], t[i + 2]); - - vst1q_s16(dst + i * dst_stride, diff); - } -} - // Computes the components of the system of equations used to solve for // a flow vector. // diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.h b/aom_dsp/flow_estimation/arm/disflow_neon.h new file mode 100644 index 0000000000..d991a13460 --- /dev/null +++ b/aom_dsp/flow_estimation/arm/disflow_neon.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ +#define AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ + +#include "aom_dsp/flow_estimation/disflow.h" + +#include <arm_neon.h> +#include <math.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { + // Check that the fractional position is in range. + // + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. + // Mathematically, this implies that 0 <= x < 1. However, in practice it is + // possible to have x == 1 due to floating point rounding. This is fine, + // and we still interpolate correctly if we allow x = 1. + assert(0 <= x && x <= 1); + + double x2 = x * x; + double x3 = x2 * x; + kernel[0] = -0.5 * x + x2 - 0.5 * x3; + kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; + kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; + kernel[3] = -0.5 * x2 + 0.5 * x3; +} + +static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { + double kernel_dbl[4]; + get_cubic_kernel_dbl(x, kernel_dbl); + + kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); + kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); + kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); + kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); +} + +static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 0, -1}. + const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); + } + + // Vertical filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution, one of + // which can be reused for the next line. + int16x8_t s0 = vld1q_s16(tmp); + int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); + int16x8_t sum01 = vaddq_s16(s0, s1); + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); + + int16x8_t sum12 = vaddq_s16(s1, s2); + int16x8_t sum = vaddq_s16(sum01, sum12); + + vst1q_s16(dst + i * dst_stride, sum); + + sum01 = sum12; + s1 = s2; + } +} + +static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution. 
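+  // For instance, s0 + 2 * s1 + s2 == (s0 + s1) + (s1 + s2): two widening
+  // adds of adjacent columns followed by a single 16-bit add.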
+ const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + uint16x8_t sum01 = vaddl_u8(s0, s1); + uint16x8_t sum12 = vaddl_u8(s1, s2); + uint16x8_t sum = vaddq_u16(sum01, sum12); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); + } + + // Vertical filter, using kernel {1, 0, -1}. + // Load the whole block at once to avoid redundant loads during convolution. + int16x8_t t[10]; + load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], + &t[5], &t[6], &t[7], &t[8], &t[9]); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. + int16x8_t diff = vsubq_s16(t[i], t[i + 2]); + + vst1q_s16(dst + i * dst_stride, diff); + } +} + +#endif // AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ diff --git a/aom_dsp/flow_estimation/arm/disflow_sve.c b/aom_dsp/flow_estimation/arm/disflow_sve.c new file mode 100644 index 0000000000..7b01e90d12 --- /dev/null +++ b/aom_dsp/flow_estimation/arm/disflow_sve.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/flow_estimation/disflow.h" + +#include <arm_neon.h> +#include <arm_sve.h> +#include <math.h> + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/flow_estimation/arm/disflow_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. +static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, int16_t *dt) { + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + int h_kernel[4]; + int v_kernel[4]; + get_cubic_kernel_int(u_frac, h_kernel); + get_cubic_kernel_int(v_frac, v_kernel); + + int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). 
Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). + const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution. + const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1); + const int16x4_t h_kernel_s16 = vmovn_s32(vld1q_s32(h_kernel)); + const int16x8_t h_filter = vcombine_s16(h_kernel_s16, vdup_n_s16(0)); + const uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) { + svuint16_t r0 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 0); + svuint16_t r1 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 1); + svuint16_t r2 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 2); + svuint16_t r3 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 3); + + int16x8_t s0 = vreinterpretq_s16_u16(svget_neonq_u16(r0)); + int16x8_t s1 = vreinterpretq_s16_u16(svget_neonq_u16(r1)); + int16x8_t s2 = vreinterpretq_s16_u16(svget_neonq_u16(r2)); + int16x8_t s3 = vreinterpretq_s16_u16(svget_neonq_u16(r3)); + + int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s0, h_filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s1, h_filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(vdupq_n_s64(0), s2, h_filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(vdupq_n_s64(0), s3, h_filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + // 6 is the maximum allowable number of extra bits which will avoid + // the intermediate values overflowing an int16_t. The most extreme + // intermediate value occurs when: + // * The input pixels are [0, 255, 255, 0] + // * u_frac = 0.5 + // In this case, the un-scaled output is 255 * 1.125 = 286.875. + // As an integer with 6 fractional bits, that is 18360, which fits + // in an int16_t. But with 7 fractional bits it would be 36720, + // which is too large. + int16x8_t res = vcombine_s16(vrshrn_n_s32(res0, DISFLOW_INTERP_BITS - 6), + vrshrn_n_s32(res1, DISFLOW_INTERP_BITS - 6)); + + res = aom_tbl_s16(res, idx); + + vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, res); + } + + // Vertical convolution. 
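+  // Each output row i uses rows (i - 1) .. (i + 2) of the horizontally
+  // filtered buffer, hence tmp_start points one row into tmp_.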
+ int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel)); + int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { + int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE); + int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE); + int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE); + int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE); + + int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3); + + int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3); + + uint8x8_t s = vld1_u8(src + (i + y) * stride + x); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3)); + + // This time, we have to round off the 6 extra bits which were kept + // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits + // of precision to match the scale of the dx and dy arrays. + sum_lo = vrshrq_n_s32(sum_lo, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + sum_hi = vrshrq_n_s32(sum_hi, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16)); + int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16)); + vst1q_s16(dt + i * DISFLOW_PATCH_SIZE, + vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi))); + } +} + +// Computes the components of the system of equations used to solve for +// a flow vector. +// +// The flow equations are a least-squares system, derived as follows: +// +// For each pixel in the patch, we calculate the current error `dt`, +// and the x and y gradients `dx` and `dy` of the source patch. +// This means that, to first order, the squared error for this pixel is +// +// (dt + u * dx + v * dy)^2 +// +// where (u, v) are the incremental changes to the flow vector. +// +// We then want to find the values of u and v which minimize the sum +// of the squared error across all pixels. Conveniently, this fits exactly +// into the form of a least squares problem, with one equation +// +// u * dx + v * dy = -dt +// +// for each pixel. +// +// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE, +// and absorbing the - sign elsewhere, this results in the least squares system +// +// M = |sum(dx * dx) sum(dx * dy)| +// |sum(dx * dy) sum(dy * dy)| +// +// b = |sum(dx * dt)| +// |sum(dy * dt)| +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M_inv) { + int64x2_t sum[3] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t x = vld1q_s16(dx + i * dx_stride); + int16x8_t y = vld1q_s16(dy + i * dy_stride); + + sum[0] = aom_sdotq_s16(sum[0], x, x); + sum[1] = aom_sdotq_s16(sum[1], x, y); + sum[2] = aom_sdotq_s16(sum[2], y, y); + } + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[1], sum[2]); + int32x4_t res = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. 
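+  // In other words we invert (M + k * I), which here amounts to adding k to
+  // the sum(dx * dx) and sum(dy * dy) terms on the diagonal.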
+  //
+  // Setting the regularization strength k to 1 seems to work well here, as
+  // typical values coming from the other equations are very large (1e5 to
+  // 1e6, with an upper limit of around 6e7, at the time of writing).
+  // It also preserves the property that all matrix values are whole numbers,
+  // which is convenient for integerized SIMD implementation.
+
+  double M0 = (double)vgetq_lane_s32(res, 0) + 1;
+  double M1 = (double)vgetq_lane_s32(res, 1);
+  double M2 = (double)vgetq_lane_s32(res, 2);
+  double M3 = (double)vgetq_lane_s32(res, 3) + 1;
+
+  // Invert matrix M.
+  double det = (M0 * M3) - (M1 * M2);
+  assert(det >= 1);
+  const double det_inv = 1 / det;
+
+  M_inv[0] = M3 * det_inv;
+  M_inv[1] = -M1 * det_inv;
+  M_inv[2] = -M2 * det_inv;
+  M_inv[3] = M0 * det_inv;
+}
+
+static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride,
+                                       const int16_t *dy, int dy_stride,
+                                       const int16_t *dt, int dt_stride,
+                                       int *b) {
+  int64x2_t b_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+    int16x8_t dx16 = vld1q_s16(dx + i * dx_stride);
+    int16x8_t dy16 = vld1q_s16(dy + i * dy_stride);
+    int16x8_t dt16 = vld1q_s16(dt + i * dt_stride);
+
+    b_s64[0] = aom_sdotq_s16(b_s64[0], dx16, dt16);
+    b_s64[1] = aom_sdotq_s16(b_s64[1], dy16, dt16);
+  }
+
+  b_s64[0] = vpaddq_s64(b_s64[0], b_s64[1]);
+  vst1_s32(b, vmovn_s64(b_s64[0]));
+}
+
+void aom_compute_flow_at_point_sve(const uint8_t *src, const uint8_t *ref,
+                                   int x, int y, int width, int height,
+                                   int stride, double *u, double *v) {
+  double M_inv[4];
+  int b[2];
+  int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+  int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+  int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+
+  // Compute gradients within this patch
+  const uint8_t *src_patch = &src[y * stride + x];
+  sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE);
+  sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE);
+
+  compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv);
+
+  for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+    compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt);
+    compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt,
+                        DISFLOW_PATCH_SIZE, b);
+
+    // Solve flow equations to find a better estimate for the flow vector
+    // at this point
+    const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1];
+    const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1];
+    *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2);
+    *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2);
+
+    if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) {
+      // Stop iteration when we're close to convergence
+      break;
+    }
+  }
+}
diff --git a/test/disflow_test.cc b/test/disflow_test.cc
index 4f004480e2..bee9e1261c 100644
--- a/test/disflow_test.cc
+++ b/test/disflow_test.cc
@@ -124,4 +124,9 @@ INSTANTIATE_TEST_SUITE_P(NEON, ComputeFlowTest,
                          ::testing::Values(aom_compute_flow_at_point_neon));
 #endif
 
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(SVE, ComputeFlowTest,
+                         ::testing::Values(aom_compute_flow_at_point_sve));
+#endif
+
 }  // namespace
--
GitLab


From f969060afe8cd567133462b491238815d25c0359 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Mon, 8 Apr 2024 13:53:02 -0700
Subject: [PATCH 047/391] rtc-speed 11: Condition skip_cdef on source_variance

For speed 11 video mode, resolutions <= VGA: keep skip_cdef = 1 for
speed 11, but condition the skip logic on the block spatial variance.
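Roughly, for skip_cdef_sb = 1 the new gating (see the
partition_search.c diff below) is:

  cdef_strength = cdef_strength && allow_cdef_skipping &&
                  !(source_variance < 400 &&
                    (mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV));

where the 400 threshold only applies at speed >= 11, below 720p, for
non-screen content (it is UINT_MAX otherwise).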
This improves visual quality near low texture areas, like faces.
To compensate for the speed loss, skip testing the NEARMV mode.

Update the comments on skip_cdef.

stats change for speed 11: avg/ovr/ssim, IC speedup:
rtc_derf: -3.86/-3.80/-6.06, -1.43
rtc (vga only): -3.06/-2.70/-5.79, -1.11

Change-Id: I1d51bd37c6914b4d9df630f95265171c7a53133a
---
 av1/encoder/nonrd_pickmode.c   | 10 +++++++---
 av1/encoder/partition_search.c | 20 +++++++++++++++-----
 av1/encoder/speed_features.c   |  2 +-
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index fdc10ded57..08ecb8495a 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -1886,14 +1886,17 @@ static AOM_INLINE int skip_mode_by_low_temp(
 
 static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
     PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
-    int extra_prune, unsigned int sse_zeromv_norm, int more_prune) {
+    int extra_prune, unsigned int sse_zeromv_norm, int more_prune,
+    int skip_nearmv) {
   const unsigned int thresh_skip_golden = 500;
 
   if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden &&
       mode == NEWMV)
     return 1;
 
-  if (bsize == BLOCK_128X128 && mode == NEWMV) return 1;
+  if ((bsize == BLOCK_128X128 && mode == NEWMV) ||
+      (skip_nearmv && mode == NEARMV))
+    return 1;
 
   // Skip testing non-LAST if this flag is set.
   if (extra_prune) {
@@ -2478,7 +2481,8 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
   // properties.
   if (skip_mode_by_bsize_and_ref_frame(
           *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search,
-          sse_zeromv_norm, rt_sf->nonrd_aggressive_skip))
+          sse_zeromv_norm, rt_sf->nonrd_aggressive_skip,
+          rt_sf->increase_source_sad_thresh))
     return true;
 
   // Skip mode based on low temporal variance and souce sad.
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 61d49a23f2..30ea7d9140 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -2323,8 +2323,9 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
   }
   if (cpi->sf.rt_sf.skip_cdef_sb) {
     // cdef_strength is initialized to 1 which means skip_cdef, and is updated
-    // here. Check to see is skipping cdef is allowed.
-    // Always allow cdef_skip for seg_skip = 1.
+    // here. Check to see if skipping cdef is allowed. Never skip on
+    // slide/scene change, near a key frame, or when color sensitivity is set.
+    // Always allow cdef_skip for seg_skip = 1.
     const int allow_cdef_skipping =
         seg_skip ||
         (cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
@@ -2338,8 +2339,16 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
     MB_MODE_INFO **mi_sb =
         cm->mi_params.mi_grid_base +
         get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
-    // Do not skip if intra or new mv is picked, or color sensitivity is set.
-    // Never skip on slide/scene change.
+    const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+    unsigned int thresh_spatial_var =
+        (cpi->oxcf.speed >= 11 && !is_720p_or_larger &&
+         cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN)
+            ? 400
+            : UINT_MAX;
+    // For skip_cdef_sb = 1: do not skip if allow_cdef_skipping is false or
+    // intra or new mv is picked, with a possible condition on spatial
+    // variance.
+    // For skip_cdef_sb >= 2: more aggressive mode to always skip unless
+    // allow_cdef_skipping is false and source_variance is non-zero.
if (cpi->sf.rt_sf.skip_cdef_sb >= 2) { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && @@ -2347,7 +2356,8 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, } else { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && allow_cdef_skipping && - !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV); + !(x->source_variance < thresh_spatial_var && + (mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV)); } // Store in the pickmode context. ctx->mic.cdef_strength = mi_sb[0]->cdef_strength; diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index 256b6fc9eb..9a00042520 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -1461,7 +1461,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, // for resolutions below 720p. if (speed >= 11 && !is_720p_or_larger && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { - sf->rt_sf.skip_cdef_sb = 2; + sf->rt_sf.skip_cdef_sb = 1; sf->rt_sf.force_only_last_ref = 1; sf->rt_sf.selective_cdf_update = 1; sf->rt_sf.use_nonrd_filter_search = 0; -- GitLab From f1f1bf450f8995249ee393605d977c781e1037f8 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Mon, 8 Apr 2024 17:23:31 +0100 Subject: [PATCH 048/391] Simplify Armv8.4 DotProd correction constant computation Simplify the computation of the Armv8.4 DotProd convolution correction constant. Summing 128 * filter_tap[0,7] is always the same as 128 * 128 since the filter taps always sum to 128. Change-Id: Ie0191b764809963c2be8f5032e6196725e20f0d9 --- aom_dsp/arm/aom_convolve8_neon_dotprod.c | 6 +- .../arm/compound_convolve_neon_dotprod.c | 55 +++++++++---------- av1/common/arm/convolve_neon_dotprod.c | 49 +++++++---------- 3 files changed, 47 insertions(+), 63 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c index c82125ba17..9fd94cd21d 100644 --- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -108,8 +108,7 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const int32x4_t correction = vdupq_n_s32(128 << FILTER_BITS); const uint8x16_t range_limit = vdupq_n_u8(128); uint8x16_t s0, s1, s2, s3; @@ -263,8 +262,7 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const int32x4_t correction = vdupq_n_s32(128 << FILTER_BITS); const uint8x8_t range_limit = vdup_n_u8(128); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); int8x16x2_t samples_LUT; diff --git a/av1/common/arm/compound_convolve_neon_dotprod.c b/av1/common/arm/compound_convolve_neon_dotprod.c index 3aeffbb0e6..40befdf44e 100644 --- a/av1/common/arm/compound_convolve_neon_dotprod.c +++ b/av1/common/arm/compound_convolve_neon_dotprod.c @@ -80,17 +80,15 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const 
int im_h, int w) { const int bd = 8; - const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2)); // Dot product constants and other shims. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - // Fold horiz_const into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const + - (1 << ((ROUND0_BITS - 1) - 1))); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we will halve the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; @@ -334,15 +332,14 @@ static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -455,15 +452,14 @@ static INLINE void dist_wtd_convolve_x_avg_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -574,15 +570,14 @@ static INLINE void dist_wtd_convolve_x_neon_dotprod( // Dot-product constants and other shims. 
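+  // Note: since the 8 filter taps always sum to 128, the signed-to-unsigned
+  // correction term sum(128 * tap[k]) folded in below is simply
+  // 128 * 128 == 128 << FILTER_BITS.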
   const uint8x16_t range_limit = vdupq_n_u8(128);
-  const int32_t correction_s32 =
-      vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
   // Fold round_offset into the dot-product filter correction constant. The
-  // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
-  // rounding shifts - which are generally faster than rounding shifts on
-  // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+  // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+  // shifts - which are generally faster than rounding shifts on modern CPUs.
+  // Halve the total because we will halve the filter values.
   int32x4_t correction =
-      vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
-                  (1 << ((ROUND0_BITS - 1) - 1)));
+      vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) +
+                   (1 << (ROUND0_BITS - 1))) /
+                  2);
 
   const int horiz_offset = filter_params_x->taps / 2 - 1;
   const uint8_t *src_ptr = src - horiz_offset;
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
index c29229eb09..132da2442b 100644
--- a/av1/common/arm/convolve_neon_dotprod.c
+++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -102,14 +102,12 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod(
   const int8x16_t filter =
       vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
 
-  const int32_t correction_s32 =
-      vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)),
-                           vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS))));
-  // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
-  // shift by FILTER_BITS - instead of a first rounding right shift by
+  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
+  // right shift by FILTER_BITS - instead of a first rounding right shift by
   // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
   // ROUND0_BITS.
-  int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1)));
+  int32x4_t correction =
+      vdupq_n_s32((128 << FILTER_BITS) + (1 << (ROUND0_BITS - 1)));
 
   const uint8x16_t range_limit = vdupq_n_u8(128);
   const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
@@ -274,16 +272,13 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride,
   }
 
   const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
-  // Dot product constants.
-  const int32_t correction_s32 =
-      vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
-  // This shim of (1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
-  // rounding right shift by FILTER_BITS - instead of a first rounding right
-  // shift by ROUND0_BITS, followed by second rounding right shift by
-  // FILTER_BITS - ROUND0_BITS.
-  // The outermost -1 is needed because we will halve the filter values.
+  // Dot product constants:
+  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
+  // right shift by FILTER_BITS - instead of a first rounding right shift by
+  // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
+  // ROUND0_BITS. Halve the total because we will halve the filter values.
const int32x4_t correction = - vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); if (w <= 4) { @@ -465,16 +460,13 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), vmovn_s16(x_filter_s16.val[1])); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Dot product constants. - const int32x4_t correct_tmp = - vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)), - vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7))); const int32x4_t correction = - vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const); + vdupq_n_s32((128 << FILTER_BITS) + horiz_const); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); @@ -621,16 +613,15 @@ static INLINE void convolve_2d_sr_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { const int bd = 8; - // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - // The outermost -1 is needed because we halved the filter values. - const int32_t horiz_const = - ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); // Dot product constants. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const); + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we will halve the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; -- GitLab From 24c1c6cd68a090e5bb83d90d642aaf67d4b772f9 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Tue, 9 Apr 2024 15:37:23 +0100 Subject: [PATCH 049/391] Refactor Arm Neon transpose_concat_*() to not need lookup table Refactor the transpose_concat_*() helper function used in the Arm Neon DotProd and I8MM vertical convolution implementations to not use TBL instructions. Using vzip* to achieve the same outcome (with the same number of instructions) avoids needing/loading the lookup indices and also increases performance on little (in-order) Arm Cortex cores. 
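The layout claim can be sanity-checked with a scalar model. The following
standalone sketch (illustrative only, not part of the patch) mimics the
low-half behaviour of vzipq_u8()/vzipq_u16() with plain loops and asserts
that the two zip stages produce the same column-major 4x4 layout that the
TBL permute produced:

  #include <assert.h>
  #include <stdint.h>
  #include <string.h>

  // Interleave the low 8 bytes of two vectors, as vzipq_u8(a, b).val[0] does.
  static void zip_lo_u8(const uint8_t *a, const uint8_t *b, uint8_t *out) {
    for (int i = 0; i < 8; i++) {
      out[2 * i + 0] = a[i];
      out[2 * i + 1] = b[i];
    }
  }

  // Interleave the low four 16-bit lanes, as vzipq_u16(a, b).val[0] does.
  static void zip_lo_u16(const uint8_t *a, const uint8_t *b, uint8_t *out) {
    for (int i = 0; i < 4; i++) {
      memcpy(&out[4 * i + 0], &a[2 * i], 2);
      memcpy(&out[4 * i + 2], &b[2 * i], 2);
    }
  }

  int main(void) {
    uint8_t a[4][8];  // a[r][c] holds the sample for row r, column c.
    for (int r = 0; r < 4; r++)
      for (int c = 0; c < 8; c++) a[r][c] = (uint8_t)(16 * r + c);

    uint8_t a01[16], a23[16], b[16];
    zip_lo_u8(a[0], a[1], a01);  // 00 10 01 11 02 12 03 13 ...
    zip_lo_u8(a[2], a[3], a23);  // 20 30 21 31 22 32 23 33 ...
    zip_lo_u16(a01, a23, b);     // 00 10 20 30 01 11 21 31 ...

    // Column-major 4x4 block, i.e. the documented transpose-concat output.
    for (int c = 0; c < 4; c++)
      for (int r = 0; r < 4; r++) assert(b[4 * c + r] == a[r][c]);
    return 0;
  }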
Change-Id: Ic118bdef3fce944ea6f02c85ec52ffc9c85908ea --- aom_dsp/arm/aom_convolve8_neon_dotprod.c | 114 +++++++++++------------ aom_dsp/arm/aom_convolve8_neon_i8mm.c | 114 +++++++++++------------ 2 files changed, 108 insertions(+), 120 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c index 9fd94cd21d..4a94626f26 100644 --- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -30,11 +30,6 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { /* Shift left and insert new last column in transposed 4x4 block. */ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, @@ -176,46 +171,54 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); + int8x8_t a3, int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; + + *b = vreinterpretq_s8_s16(a0123); } static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); + int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8x2_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); + + *b0 = vreinterpretq_s8_s16(a0123.val[0]); + *b1 = vreinterpretq_s8_s16(a0123.val[1]); } static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, @@ -277,8 +280,6 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; @@ -296,10 +297,10 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, * (see horizontal case) required before computing the dot product. */ int8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; @@ -311,7 +312,7 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); int8x16_t s4567, s5678, s6789, s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910); /* Merge new data into block from previous iteration. 
*/ samples_LUT.val[0] = s3456; @@ -343,8 +344,6 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - do { int height = h; const uint8_t *s = src; @@ -368,14 +367,10 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, */ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; @@ -388,8 +383,7 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456_lo; diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c index df6e4d2ab5..31b324aeb8 100644 --- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -29,11 +29,6 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { /* Shift left and insert new last column in transposed 4x4 block. */ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, @@ -162,46 +157,54 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; + + *b = vreinterpretq_u8_u16(a0123); } static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8x2_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); + + *b0 = vreinterpretq_u8_u16(a0123.val[0]); + *b1 = vreinterpretq_u8_u16(a0123.val[1]); } static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, @@ -257,8 +260,6 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; @@ -267,17 +268,17 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, * (see horizontal case) required before computing the dot product. 
*/ uint8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s4567, s5678, s6789, s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456; @@ -308,8 +309,6 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - do { int height = h; const uint8_t *s = src; @@ -324,14 +323,10 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, */ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t s7, s8, s9, s10; @@ -339,8 +334,7 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456_lo; -- GitLab From 60653dff7f8ee3e769a0aeec5e210a4fc2687717 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 4 Apr 2024 15:14:08 -0700 Subject: [PATCH 050/391] Avoid integer overflows in align_image_dimension() Impose maximum values on the input parameters so that we can perform arithmetic operations without worrying about overflows. Fix a bug (introduced in commit 7aa2edc) that the ~ operator is applied to (stride_align - 1), which is unsigned int, and then the result is converted to uint64_t. Also change the AomImageTest.AomImgAllocHugeWidth test to write to the first and last samples in the first row of the Y plane, so that the test will crash if there is unsigned integer overflow in the calculation of stride_in_bytes. 
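The promotion bug is easy to reproduce in isolation. A minimal standalone
sketch (illustrative values; the real expression also folds in the border
and rounding terms) shows how the 32-bit complement truncates a 64-bit byte
count before the s > INT_MAX check can reject it:

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const unsigned int stride_align = 1;  // mask should preserve every bit
    const uint64_t s = 0x100001000;       // > 4 GiB, must fail s > INT_MAX

    // Buggy: ~ applies to the 32-bit value, which is only then zero-extended,
    // so the mask is 0x00000000FFFFFFFF and the top bits of s are lost.
    const uint64_t buggy = s & ~(stride_align - 1);
    // Fixed: widen before complementing to get an all-ones 64-bit mask.
    const uint64_t fixed = s & ~((uint64_t)stride_align - 1);

    printf("buggy = 0x%llx\n", (unsigned long long)buggy);  // 0x1000
    printf("fixed = 0x%llx\n", (unsigned long long)fixed);  // 0x100001000
    return 0;
  }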
Bug: chromium:332382766 Change-Id: I634c38c35a296b5bbf3de7ddf10040e7ec5ee9a1 --- aom/aom_image.h | 28 +++++++++++++++++++--------- aom/src/aom_image.c | 19 +++++++++++++++---- test/aom_image_test.cc | 29 +++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 13 deletions(-) diff --git a/aom/aom_image.h b/aom/aom_image.h index d5f0c087e6..bdbb053483 100644 --- a/aom/aom_image.h +++ b/aom/aom_image.h @@ -244,10 +244,13 @@ typedef struct aom_image { * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be @@ -267,10 +270,12 @@ aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of each row in the image - * (stride). + * (stride). Must not exceed 65536. * \param[in] img_data Storage to use for the image * * \return Returns a pointer to the initialized image descriptor. If the img @@ -291,12 +296,17 @@ aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * \param[in] size_align Alignment, in pixels, of the image width and height. + * Must not exceed 65536. * \param[in] border A border that is padded on four sides of the image. + * Must not exceed 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c index b68dc4c8fd..e10b8a9ad1 100644 --- a/aom/src/aom_image.c +++ b/aom/src/aom_image.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> #include <limits.h> #include <stdlib.h> #include <string.h> @@ -36,12 +37,20 @@ static aom_image_t *img_alloc_helper( /* NOTE: In this function, bit_depth is either 8 or 16 (if * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12. 
*/ - unsigned int h, w, xcs, ycs, bps, bit_depth; + unsigned int xcs, ycs, bps, bit_depth; if (img != NULL) memset(img, 0, sizeof(aom_image_t)); if (fmt == AOM_IMG_FMT_NONE) goto fail; + /* Impose maximum values on input parameters so that this function can + * perform arithmetic operations without worrying about overflows. + */ + if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 || + stride_align > 65536 || size_align > 65536 || border > 65536) { + goto fail; + } + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -104,11 +113,13 @@ static aom_image_t *img_alloc_helper( } /* Calculate storage sizes given the chroma subsampling */ - w = align_image_dimension(d_w, xcs, size_align); - h = align_image_dimension(d_h, ycs, size_align); + const unsigned int w = align_image_dimension(d_w, xcs, size_align); + assert(d_w <= w); + const unsigned int h = align_image_dimension(d_h, ycs, size_align); + assert(d_h <= h); uint64_t s = (fmt & AOM_IMG_FMT_PLANAR) ? w : (uint64_t)bps * w / bit_depth; - s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1); + s = (s + 2 * border + stride_align - 1) & ~((uint64_t)stride_align - 1); s = s * bit_depth / 8; if (s > INT_MAX) goto fail; const int stride_in_bytes = (int)s; diff --git a/test/aom_image_test.cc b/test/aom_image_test.cc index 62f3c12747..0dfb912215 100644 --- a/test/aom_image_test.cc +++ b/test/aom_image_test.cc @@ -9,6 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <climits> + #include "aom/aom_image.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" @@ -81,6 +83,20 @@ TEST(AomImageTest, AomImgAllocHugeWidth) { image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x80000000, 1, 1); ASSERT_EQ(image, nullptr); + // The aligned width (UINT_MAX + 1) would overflow unsigned int. + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, UINT_MAX, 1, 1); + ASSERT_EQ(image, nullptr); + + image = aom_img_alloc_with_border(nullptr, AOM_IMG_FMT_I422, 1, INT_MAX, 1, + 0x40000000, 0); + if (image) { + uint16_t *y_plane = + reinterpret_cast<uint16_t *>(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x7ffffffe, 1, 1); if (image) { aom_img_free(image); @@ -101,8 +117,21 @@ TEST(AomImageTest, AomImgAllocHugeWidth) { aom_img_free(image); } + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 65536, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast<uint16_t *>(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 285245883, 2, 1); if (image) { + uint16_t *y_plane = + reinterpret_cast<uint16_t *>(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; aom_img_free(image); } } -- GitLab From a38ab61907e7619c6b100a769284505959def53e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 8 Apr 2024 19:00:12 -0700 Subject: [PATCH 051/391] Apply stride_align to byte count, not pixel count stride_align is documented to be the "alignment, in bytes, of each row in the image (stride)." 
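A standalone before/after sketch (illustrative numbers, not from the patch)
for a 16-bit sample format with 96 samples per row and a 64-byte
stride_align:

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const uint64_t w = 96;            // samples per row, borders included
    const uint64_t bytes_per_px = 2;  // e.g. a 16-bit AOM_IMG_FMT_*16 format
    const uint64_t align = 64;        // stride_align, documented in bytes

    // Old order: align the sample count, then scale to bytes.
    const uint64_t old_stride = ((w + align - 1) & ~(align - 1)) * bytes_per_px;
    // New order: scale to bytes, then align, matching the documentation.
    const uint64_t new_stride = (w * bytes_per_px + align - 1) & ~(align - 1);

    // Prints old = 256, new = 192: the old order silently over-padded rows.
    printf("old = %llu, new = %llu\n", (unsigned long long)old_stride,
           (unsigned long long)new_stride);
    return 0;
  }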
Change-Id: I4663f2fdf264800a0b8441772749920780248fbe --- aom/src/aom_image.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c index e10b8a9ad1..09b6dd408e 100644 --- a/aom/src/aom_image.c +++ b/aom/src/aom_image.c @@ -119,8 +119,9 @@ static aom_image_t *img_alloc_helper( assert(d_h <= h); uint64_t s = (fmt & AOM_IMG_FMT_PLANAR) ? w : (uint64_t)bps * w / bit_depth; - s = (s + 2 * border + stride_align - 1) & ~((uint64_t)stride_align - 1); + s = s + 2 * border; s = s * bit_depth / 8; + s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1); if (s > INT_MAX) goto fail; const int stride_in_bytes = (int)s; -- GitLab From dac747efc95dac8d4f3ee480bac202fcc13d7213 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Fri, 28 Jun 2019 17:01:02 -0700 Subject: [PATCH 052/391] Add borders to w first, before all manipulations. This allows the border samples to be multiplied by bps / bit_depth for non-planar formats. Change-Id: I4c7e2d4fd6e21049963bf21ee9e55ee6683dde30 --- aom/src/aom_image.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c index 09b6dd408e..f8d4c78788 100644 --- a/aom/src/aom_image.c +++ b/aom/src/aom_image.c @@ -118,8 +118,8 @@ static aom_image_t *img_alloc_helper( const unsigned int h = align_image_dimension(d_h, ycs, size_align); assert(d_h <= h); - uint64_t s = (fmt & AOM_IMG_FMT_PLANAR) ? w : (uint64_t)bps * w / bit_depth; - s = s + 2 * border; + uint64_t s = (uint64_t)w + 2 * border; + s = (fmt & AOM_IMG_FMT_PLANAR) ? s : s * bps / bit_depth; s = s * bit_depth / 8; s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1); if (s > INT_MAX) goto fail; -- GitLab From 6b4246129daaec35a19a5854369403a49bfd7840 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Fri, 12 Apr 2024 15:06:40 -0700 Subject: [PATCH 053/391] Simplify conditions in aom_img_plane_width/height Simplify the if conditions in aom_img_plane_width() and aom_img_plane_height(). Change-Id: I81fa7484ca706be33142824365df4dcd65e3bc8b --- aom/src/aom_image.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c index f8d4c78788..1d3b7df245 100644 --- a/aom/src/aom_image.c +++ b/aom/src/aom_image.c @@ -304,15 +304,15 @@ void aom_img_free(aom_image_t *img) { } int aom_img_plane_width(const aom_image_t *img, int plane) { - if (plane > 0 && img->x_chroma_shift > 0) - return (img->d_w + 1) >> img->x_chroma_shift; + if (plane > 0) + return (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift; else return img->d_w; } int aom_img_plane_height(const aom_image_t *img, int plane) { - if (plane > 0 && img->y_chroma_shift > 0) - return (img->d_h + 1) >> img->y_chroma_shift; + if (plane > 0) + return (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift; else return img->d_h; } -- GitLab From a4420e55a8d5da107001bb2bb61fa53263a90041 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Fri, 12 Apr 2024 15:10:47 -0700 Subject: [PATCH 054/391] Fix Doxygen comments for two enum typedefs Fix the Doxygen comments for aom_transfer_characteristics_t and aom_chroma_sample_position_t. Add a Doxygen comment for aom_matrix_coefficients_t. 
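For the aom_img_plane_width()/aom_img_plane_height() simplification in the
preceding patch, the equivalence relies on libaom chroma shifts only ever
being 0 or 1: (d + 0) >> 0 is d, and (d + 1) >> 1 matches the old
rounded-up halving. A quick standalone check (illustrative only):

  #include <assert.h>

  static int old_expr(int d, int shift) {
    return (shift > 0) ? (d + 1) >> shift : d;
  }

  static int new_expr(int d, int shift) { return (d + shift) >> shift; }

  int main(void) {
    for (int shift = 0; shift <= 1; shift++)  // the only shifts AV1 uses
      for (int d = 1; d <= 1 << 16; d++)
        assert(old_expr(d, shift) == new_expr(d, shift));
    return 0;
  }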
Change-Id: Ibcd298e9b5961f574de73052b3e0719842ed59dc --- aom/aom_image.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/aom/aom_image.h b/aom/aom_image.h index bdbb053483..68fb312222 100644 --- a/aom/aom_image.h +++ b/aom/aom_image.h @@ -103,7 +103,8 @@ typedef enum aom_transfer_characteristics { AOM_CICP_TC_SMPTE_428 = 17, /**< SMPTE ST 428 */ AOM_CICP_TC_HLG = 18, /**< BT.2100 HLG, ARIB STD-B67 */ AOM_CICP_TC_RESERVED_19 = 19 /**< For future use (values 19-255) */ -} aom_transfer_characteristics_t; /**< alias for enum aom_transfer_function */ +} aom_transfer_characteristics_t; /**< alias for enum + aom_transfer_characteristics */ /*!\brief List of supported matrix coefficients */ typedef enum aom_matrix_coefficients { @@ -125,7 +126,7 @@ typedef enum aom_matrix_coefficients { AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */ AOM_CICP_MC_ICTCP = 14, /**< BT.2100 ICtCp */ AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */ -} aom_matrix_coefficients_t; +} aom_matrix_coefficients_t; /**< alias for enum aom_matrix_coefficients */ /*!\brief List of supported color range */ typedef enum aom_color_range { @@ -144,7 +145,8 @@ typedef enum aom_chroma_sample_position { /**< sample, between two vertical samples */ AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */ AOM_CSP_RESERVED = 3 /**< Reserved value */ -} aom_chroma_sample_position_t; /**< alias for enum aom_transfer_function */ +} aom_chroma_sample_position_t; /**< alias for enum aom_chroma_sample_position + */ /*!\brief List of insert flags for Metadata * -- GitLab From 9e633f0dcef31d86316c3e739fe2b9dbdfb2880d Mon Sep 17 00:00:00 2001 From: Samuthirika S <samuthirika.s@ittiam.com> Date: Sun, 10 Mar 2024 11:31:57 +0530 Subject: [PATCH 055/391] Introduce av1_resize_plane_to_half() for Global Motion tool Currently, the GM tool invokes av1_resize_plane() with a downsample factor of exactly 2. To facilitate the SIMD for the same, this CL introduces av1_resize_plane_to_half(), which incorporates the necessary conditions from av1_resize_plane(). This is a bit-exact change with no impact on encode time. Change-Id: I87ed23892221472477a209357cddd08919ad8edf --- aom_dsp/pyramid.c | 31 +++++++++++++++++++++---- av1/common/resize.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ av1/common/resize.h | 6 +++++ 3 files changed, 88 insertions(+), 4 deletions(-) diff --git a/aom_dsp/pyramid.c b/aom_dsp/pyramid.c index 5de001dbd5..05ddbb2f5f 100644 --- a/aom_dsp/pyramid.c +++ b/aom_dsp/pyramid.c @@ -305,6 +305,7 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, // Fill in the remaining levels through progressive downsampling for (int level = already_filled_levels; level < n_levels; ++level) { + bool mem_status = false; PyramidLayer *prev_layer = &frame_pyr->layers[level - 1]; uint8_t *prev_buffer = prev_layer->buffer; int prev_stride = prev_layer->stride; @@ -315,6 +316,11 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, int this_height = this_layer->height; int this_stride = this_layer->stride; + // The width and height of the previous layer that needs to be considered to + // derive the current layer frame. + const int input_layer_width = this_width << 1; + const int input_layer_height = this_height << 1; + // Compute the this pyramid level by downsampling the current level. 
// // We downsample by a factor of exactly 2, clipping the rightmost and @@ -329,13 +335,30 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, // 2) Up/downsampling by a factor of 2 can be implemented much more // efficiently than up/downsampling by a generic ratio. // TODO(rachelbarker): Use optimized downsample-by-2 function - if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1, - prev_stride, this_buffer, this_height, this_width, - this_stride)) { - // If we can't allocate memory, we'll have to terminate early + + // SIMD support has been added specifically for cases where the downsample + // factor is exactly 2. In such instances, horizontal and vertical resizing + // is performed utilizing the down2_symeven() function, which considers the + // even dimensions of the input layer. + if (should_resize_by_half(input_layer_height, input_layer_width, + this_height, this_width)) { + assert(input_layer_height % 2 == 0 && input_layer_width % 2 == 0 && + "Input width or height cannot be odd."); + mem_status = av1_resize_plane_to_half( + prev_buffer, input_layer_height, input_layer_width, prev_stride, + this_buffer, this_height, this_width, this_stride); + } else { + mem_status = av1_resize_plane(prev_buffer, input_layer_height, + input_layer_width, prev_stride, this_buffer, + this_height, this_width, this_stride); + } + + // Terminate early in cases of memory allocation failure. + if (!mem_status) { frame_pyr->filled_levels = n_levels; return -1; } + fill_border(this_buffer, this_width, this_height, this_stride); } diff --git a/av1/common/resize.c b/av1/common/resize.c index 441323ab1f..ef35fa2272 100644 --- a/av1/common/resize.c +++ b/av1/common/resize.c @@ -524,6 +524,61 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { } } +static INLINE bool resize_vert_dir(uint8_t *intbuf, uint8_t *output, + int out_stride, int height, int height2, + int width2) { + bool mem_status = true; + uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height); + uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2); + if (arrbuf == NULL || arrbuf2 == NULL) { + mem_status = false; + goto Error; + } + + for (int i = 0; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + down2_symeven(arrbuf, height, arrbuf2); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(arrbuf); + aom_free(arrbuf2); + return mem_status; +} + +static INLINE void resize_horz_dir(const uint8_t *const input, int in_stride, + uint8_t *intbuf, int height, + int filtered_length, int width2) { + for (int i = 0; i < height; ++i) + down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i); +} + +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride) { + uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(*intbuf) * width2 * height); + if (intbuf == NULL) { + return false; + } + + // Resize in the horizontal direction + resize_horz_dir(input, in_stride, intbuf, height, width, width2); + // Resize in the vertical direction + bool mem_status = + resize_vert_dir(intbuf, output, out_stride, height, height2, width2); + aom_free(intbuf); + return mem_status; +} + +// Check if both the output width and height are half of input width and +// height respectively. 
+bool should_resize_by_half(int height, int width, int height2, int width2) { + const bool is_width_by_2 = get_down2_length(width, 1) == width2; + const bool is_height_by_2 = get_down2_length(height, 1) == height2; + return (is_width_by_2 && is_height_by_2); +} + bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { diff --git a/av1/common/resize.h b/av1/common/resize.h index d573a538bf..6e7d46e0de 100644 --- a/av1/common/resize.h +++ b/av1/common/resize.h @@ -93,6 +93,12 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom); void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, bool alloc_pyramid); +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride); + +bool should_resize_by_half(int height, int width, int height2, int width2); + // Returns 1 if a superres upscaled frame is scaled and 0 otherwise. static INLINE int av1_superres_scaled(const AV1_COMMON *cm) { // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling -- GitLab From 60cbd98d3acc7a8d8eec9737917a3a44e9f78e9a Mon Sep 17 00:00:00 2001 From: Samuthirika S <samuthirika.s@ittiam.com> Date: Sat, 13 Apr 2024 15:45:35 +0530 Subject: [PATCH 056/391] Add AVX2 for resize_vert_dir() This CL adds AVX2 implementation for resize_vert_dir() function. Also, unit test for the same is added. Resolution Average Scaling w.r.t C 3840x2160 9.28x 2560x1440 10.35x 1920x1080 9.28x 1280x720 9.02x 640x480 9.16x 640x360 8.91x 256x256 10.82x This is a bit-exact change. Change-Id: Ifb0677d5f99cf03154e6412aca5effd1eb267dc7 --- av1/av1.cmake | 1 + av1/common/av1_rtcd_defs.pl | 3 + av1/common/resize.c | 21 +- av1/common/resize.h | 4 + av1/common/x86/resize_avx2.c | 411 +++++++++++++++++++++++++++++++++++ test/frame_resize_test.cc | 157 +++++++++++++ test/test.cmake | 1 + 7 files changed, 585 insertions(+), 13 deletions(-) create mode 100644 av1/common/x86/resize_avx2.c create mode 100644 test/frame_resize_test.cc diff --git a/av1/av1.cmake b/av1/av1.cmake index c1206e9d68..b6cf974aa7 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -302,6 +302,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c" "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c" + "${AOM_ROOT}/av1/common/x86/resize_avx2.c" "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 8bf4b0709c..e97f39145d 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -554,6 +554,9 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; } +add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; +specialize qw/resize_vert_dir avx2/; + add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; diff --git a/av1/common/resize.c 
b/av1/common/resize.c index ef35fa2272..2b48b9fff4 100644 --- a/av1/common/resize.c +++ b/av1/common/resize.c @@ -18,6 +18,7 @@ #include <string.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" @@ -216,10 +217,6 @@ const int16_t av1_resize_filter_normative[( // Filters for interpolation (full-band) - no filtering for integer pixels #define filteredinterp_filters1000 av1_resize_filter_normative -// Filters for factor of 2 downsampling. -static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; -static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; - static const InterpKernel *choose_interp_filter(int in_length, int out_length) { int out_length16 = out_length * 16; if (out_length16 >= in_length * 16) @@ -524,9 +521,8 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { } } -static INLINE bool resize_vert_dir(uint8_t *intbuf, uint8_t *output, - int out_stride, int height, int height2, - int width2) { +bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int width2, int start_col) { bool mem_status = true; uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height); uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2); @@ -535,7 +531,7 @@ static INLINE bool resize_vert_dir(uint8_t *intbuf, uint8_t *output, goto Error; } - for (int i = 0; i < width2; ++i) { + for (int i = start_col; i < width2; ++i) { fill_col_to_arr(intbuf + i, width2, height, arrbuf); down2_symeven(arrbuf, height, arrbuf2); fill_arr_to_col(output + i, out_stride, height2, arrbuf2); @@ -547,9 +543,8 @@ Error: return mem_status; } -static INLINE void resize_horz_dir(const uint8_t *const input, int in_stride, - uint8_t *intbuf, int height, - int filtered_length, int width2) { +void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf, + int height, int filtered_length, int width2) { for (int i = 0; i < height; ++i) down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i); } @@ -565,8 +560,8 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, // Resize in the horizontal direction resize_horz_dir(input, in_stride, intbuf, height, width, width2); // Resize in the vertical direction - bool mem_status = - resize_vert_dir(intbuf, output, out_stride, height, height2, width2); + bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2, + width2, 0 /*start_col*/); aom_free(intbuf); return mem_status; } diff --git a/av1/common/resize.h b/av1/common/resize.h index 6e7d46e0de..de71f5d539 100644 --- a/av1/common/resize.h +++ b/av1/common/resize.h @@ -20,6 +20,10 @@ extern "C" { #endif +// Filters for factor of 2 downsampling. +static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c new file mode 100644 index 0000000000..c44edb88d9 --- /dev/null +++ b/av1/common/x86/resize_avx2.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> +#include <string.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/resize.h" + +#include "aom_dsp/x86/synonyms.h" + +#define CAST_HI(x) _mm256_castsi128_si256(x) +#define CAST_LOW(x) _mm256_castsi256_si128(x) + +#define PROCESS_RESIZE_Y_WD16 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride)); \ + \ + /* g0... g15 | i0... i15 */ \ + const __m256i s68 = \ + _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20); \ + /* h0... h15 | j0... j15 */ \ + const __m256i s79 = \ + _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20); \ + \ + /* g0h0... g7g7 | i0j0... i7j */ \ + s[3] = _mm256_unpacklo_epi8(s68, s79); \ + /* g8h8... g15g15 | i8j8... i15j15 */ \ + s[8] = _mm256_unpackhi_epi8(s68, s79); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_y_convolve(s, coeffs_y, res_out); \ + \ + /* r00... r07 */ \ + __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ + /* r20... r27 */ \ + __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ + \ + res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ + res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ + \ + __m256i res_out_b[2] = { 0 }; \ + resize_y_convolve(s + 5, coeffs_y, res_out_b); \ + \ + /* r08... r015 */ \ + __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \ + /* r28... r215 */ \ + __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \ + res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits); \ + res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits); \ + \ + /* r00... r03 r20... r23 | r04... r07 r24... r27 */ \ + __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ + /* r08... r012 r28... r212 | r013... r015 r213... r215 */ \ + __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2); \ + /* r00... r07 | r20... r27 */ \ + res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8); \ + /* r08... r015 | r28... r215 */ \ + res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8); \ + /* r00... r015 | r20... r215 */ \ + res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1); \ + res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel); \ + res_8bit0 = _mm256_max_epu8(res_8bit0, zero); + +#define PROCESS_RESIZE_Y_WD8 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride)); \ + \ + /* g0h0... g7h7 */ \ + s67 = _mm_unpacklo_epi8(l6, l7); \ + /* i0j0...i7j7 */ \ + __m128i s89 = _mm_unpacklo_epi8(l8, l9); \ + \ + /* g0h0...g7g7 | i0j0...i7j7 */ \ + s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_y_convolve(s, coeffs_y, res_out); \ + \ + /* r00... 
r07 */ \
+  __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
+  /* r20...r27 */ \
+  __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
+  res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \
+  res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
+  \
+  /* r00...r03 r20...r23 | r04...r07 r24...r27 */ \
+  res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \
+  /* r00...r07 | r20...r27 */ \
+  res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8); \
+  res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1); \
+  res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \
+  res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
+
+static INLINE void resize_y_convolve(const __m256i *const s,
+                                     const __m256i *const coeffs,
+                                     __m256i *res_out) {
+  const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+  const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+  const __m256i dst_0 = _mm256_add_epi16(res_0, res_1);
+  const __m256i dst_1 = _mm256_add_epi16(res_2, res_3);
+  // The sum of the convolve operation can exceed the signed 16-bit range.
+  // Hence, the addition should happen in 32 bits.
+  const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0));
+  const __m256i dst_01 =
+      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1));
+  const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1));
+  const __m256i dst_11 =
+      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1));
+
+  res_out[0] = _mm256_add_epi32(dst_00, dst_10);
+  res_out[1] = _mm256_add_epi32(dst_01, dst_11);
+}
+
+static INLINE void prepare_filter_coeffs(const int16_t *filter,
+                                         __m256i *const coeffs /* [4] */) {
+  // f0 f1 f2 f3 x x x x
+  const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
+  // f0 f1 f2 f3 f0 f1 f2 f3
+  const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44);
+  // f0 f1 f2 f3 f1 f0 f3 f2
+  const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1);
+
+  const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1);
+
+  // f0 f1 f0 f1 ..
+  coeffs[2] = _mm256_broadcastw_epi16(filter_8bit);
+  // f2 f3 f2 f3 ..
+  coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2));
+  // f3 f2 f3 f2 ..
+  coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6));
+  // f1 f0 f1 f0 ..
+  coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
+}
+
+bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
+                          int height, int height2, int stride, int start_col) {
+  assert(start_col <= stride);
+  // For the GM tool, the input layer height or width is assured to be an even
+  // number. Hence the function 'down2_symodd()' is not invoked and SIMD
+  // optimization of the same is not implemented.
+  // When the input height is less than 8 and even, the potential input
+  // heights are limited to 2, 4, or 6. These scenarios require separate
+  // handling due to padding requirements. Invoking the C function here will
+  // eliminate the need for conditional statements within the subsequent SIMD
+  // code to manage these cases.
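+  // For instance, with height == 4 the unconditional load of row 4 at the
+  // top of the main column loop below would read past the end of intbuf.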
+ if (height & 1 || height < 8) { + return resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, start_col); + } + + __m256i s[10], coeffs_y[4]; + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const uint8_t max_pixel = 255; + const __m256i clip_pixel = _mm256_set1_epi8(max_pixel); + const __m256i zero = _mm256_setzero_si256(); + + prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); + + const int num_col16 = stride / 16; + int remain_col = stride % 16; + // The core vertical SIMD processes 4 input rows simultaneously to generate + // output corresponding to 2 rows. To streamline the core loop and eliminate + // the need for conditional checks, the remaining rows (4 or 6) are processed + // separately. + const int remain_row = (height % 4 == 0) ? 4 : 6; + + for (int j = start_col; j < stride - remain_col; j += 16) { + const uint8_t *data = &intbuf[j]; + const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with the last available row at the top. + const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride)); + + // a0...a15 | c0...c15 + const __m256i s02 = + _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20); + // b0...b15 | d0...d15 + const __m256i s13 = + _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20); + // c0...c15 | e0...e15 + const __m256i s24 = + _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20); + // d0...d15 | f0...f15 + const __m256i s35 = + _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20); + // e0...e15 | g0...g15 + const __m256i s46 = + _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20); + // f0...f15 | h0...h15 + const __m256i s57 = + _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_unpacklo_epi8(s02, s13); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_unpacklo_epi8(s24, s35); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_unpacklo_epi8(s46, s57); + + // a8b8...a15b15 | c8d8...c15d15 + s[5] = _mm256_unpackhi_epi8(s02, s13); + // c8d8...c15d15 | e8f8...e15f15 + s[6] = _mm256_unpackhi_epi8(s24, s35); + // e8f8...e15f15 | g8h8...g15h15 + s[7] = _mm256_unpackhi_epi8(s46, s57); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + + _mm_storeu_si128( + (__m128i *)&output[(i / 2) * out_stride + j + out_stride], + _mm256_extracti128_si256(res_8bit0, 1)); + + // Load the required data for processing of next 4 input rows. + const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride)); + + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... 
k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + + // Process the remaining last 4 or 6 rows here. + int i = process_ht; + while (i < height - 1) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + i += 2; + + const int is_store_valid = (i < height - 1); + if (is_store_valid) + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + _mm256_extracti128_si256(res_8bit0, 1)); + i += 2; + + // Check if there is any remaining height to process. If so, perform the + // necessary data loading for processing the next row. + if (i < height - 1) { + l10 = l11 = l9; + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + } + } + + if (remain_col > 7) { + const int processed_wd = num_col16 * 16; + remain_col = stride % 8; + + const uint8_t *data = &intbuf[processed_wd]; + + const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with available top-most row. + const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride)); + + // a0b0...a7b7 + const __m128i s01 = _mm_unpacklo_epi8(l0, l1); + // c0d0...c7d7 + const __m128i s23 = _mm_unpacklo_epi8(l2, l3); + // e0f0...e7f7 + const __m128i s45 = _mm_unpacklo_epi8(l4, l5); + // g0h0...g7h7 + __m128i s67 = _mm_unpacklo_epi8(l10, l11); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD8 + + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], + CAST_LOW(res_a_round_1)); + + _mm_storel_epi64( + (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride], + _mm256_extracti128_si256(res_a_round_1, 1)); + + const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride)); + + // k0l0... k7l7 + const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + } + + // Process the remaining last 4 or 6 rows here. 
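+  // Each pass below still computes two output rows (the low and high halves
+  // of res_8bit0); the second store is guarded so it is skipped when no
+  // further output row remains.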
+  int i = process_ht;
+  while (i < height - 1) {
+    PROCESS_RESIZE_Y_WD8
+
+    _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
+                     CAST_LOW(res_a_round_1));
+
+    i += 2;
+
+    const int is_store_valid = (i < height - 1);
+    if (is_store_valid)
+      _mm_storel_epi64(
+          (__m128i *)&output[(i / 2) * out_stride + processed_wd],
+          _mm256_extracti128_si256(res_a_round_1, 1));
+    i += 2;
+
+    // Check whether any rows remain to be processed. If so, load the data
+    // required for the next iteration.
+    if (i < height - 1) {
+      l10 = l11 = l9;
+      // k0l0... k7l7
+      const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
+      // i0j0... i7j7 | k0l0... k7l7
+      s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
+
+      s[0] = s[2];
+      s[1] = s[3];
+      s[2] = s[4];
+    }
+  }
+  }
+
+  if (remain_col)
+    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                             stride, stride - remain_col);
+
+  return true;
+}
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
new file mode 100644
index 0000000000..8891304192
--- /dev/null
+++ b/test/frame_resize_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/bitops.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+using ::testing::Combine;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+using std::make_tuple;
+using std::tuple;
+
+const int kIters = 1000;
+
+typedef tuple<int, int> FrameDimension;
+
+// Resolutions (width x height) to be tested for resizing.
+const FrameDimension kFrameDim[] = {
+  make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080),
+  make_tuple(1280, 720),  make_tuple(640, 480),   make_tuple(640, 360),
+  make_tuple(256, 256),
+};
+
+// Check that two 8-bit output buffers are identical.
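+// The helper below takes a per-row memcmp fast path and falls back to
+// per-pixel ASSERT_EQ only on a mismatch, so a failure reports the exact
+// (column, row) coordinate without paying the per-element cost on rows
+// that already match.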
+void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width, + int height) { + ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations"; + for (int j = 0; j < height; ++j) { + if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { + p1 += width; + p2 += width; + continue; + } + for (int i = 0; i < width; ++i) { + ASSERT_EQ(p1[i], p2[i]) + << width << "x" << height << " Pixel mismatch at (" << i << ", " << j + << ")"; + } + } +} + +typedef bool (*LowBDResizeFunc)(uint8_t *intbuf, uint8_t *output, + int out_stride, int height, int height2, + int stride, int start_wd); +// Test parameter list: +// <tst_fun, dims> +typedef tuple<LowBDResizeFunc, FrameDimension> ResizeTestParams; + +class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { + public: + void SetUp() { + test_fun_ = GET_PARAM(0); + frame_dim_ = GET_PARAM(1); + width_ = std::get<0>(frame_dim_); + height_ = std::get<1>(frame_dim_); + const int msb = get_msb(AOMMIN(width_, height_)); + n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + + src_ = (uint8_t *)aom_malloc((width_ / 2) * height_ * sizeof(*src_)); + ref_dest_ = + (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*ref_dest_)); + test_dest_ = + (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*test_dest_)); + } + + void RunTest() { + int width2 = width_, height2 = height_; + + for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); + for (int level = 1; level < n_levels_; level++) { + width2 = (width_ >> level); + height2 = (height_ >> level); + resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, width2, + 0); + test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + + AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2); + } + } + + void SpeedTest() { + int width2 = width_, height2 = height_; + + for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); + for (int level = 1; level < n_levels_; level++) { + width2 = (width_ >> level); + height2 = (height_ >> level); + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < kIters; j++) { + resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, + width2, 0); + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (int j = 0; j < kIters; j++) { + test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "level: " << level << " [" << width2 << " x " << height2 + << "] C time = " << ref_time << " , SIMD time = " << tst_time + << " scaling=" << float(1.00) * ref_time / tst_time << "x \n"; + } + } + + void TearDown() { + aom_free(src_); + aom_free(ref_dest_); + aom_free(test_dest_); + } + + private: + LowBDResizeFunc test_fun_; + FrameDimension frame_dim_; + int width_; + int height_; + int n_levels_; + uint8_t *src_; + uint8_t *ref_dest_; + uint8_t *test_dest_; + libaom_test::ACMRandom rng_; +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1ResizeYTest); + +TEST_P(AV1ResizeYTest, RunTest) { RunTest(); } + +TEST_P(AV1ResizeYTest, DISABLED_SpeedTest) { SpeedTest(); } + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1ResizeYTest, + ::testing::Combine(::testing::Values(resize_vert_dir_avx2), + ::testing::ValuesIn(kFrameDim))); +#endif + +} // namespace diff --git a/test/test.cmake b/test/test.cmake index 
e2f5da570d..2631c9fb39 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -209,6 +209,7 @@ if(NOT BUILD_SHARED_LIBS)
 "${AOM_ROOT}/test/fdct4x4_test.cc"
 "${AOM_ROOT}/test/fft_test.cc"
 "${AOM_ROOT}/test/firstpass_test.cc"
+ "${AOM_ROOT}/test/frame_resize_test.cc"
 "${AOM_ROOT}/test/fwht4x4_test.cc"
 "${AOM_ROOT}/test/hadamard_test.cc"
 "${AOM_ROOT}/test/horver_correlation_test.cc"
-- 
GitLab


From 860bdd3a127ec9d26b0dff804e06dcbbc2dc299c Mon Sep 17 00:00:00 2001
From: Salome Thirot <salome.thirot@arm.com>
Date: Tue, 9 Apr 2024 17:01:23 +0100
Subject: [PATCH 057/391] Add intermediate buffers for av1_compute_stats_highbd

The standard bitdepth version of av1_compute_stats makes use of
intermediate buffers dgd_avg and src_avg to precompute the subtraction
of the average. Port this optimization to the highbd version in
preparation for those buffers being used in a subsequent commit.

For now these buffers are never allocated for high bitdepth. Also
remove the allocation for Neon, as the implementation doesn't make use
of them.

Change-Id: I3b646bafaf43013d508f343b318897a83c519410
---
 av1/common/av1_rtcd_defs.pl | 2 +-
 av1/encoder/arm/neon/highbd_pickrst_neon.c | 5 +-
 av1/encoder/pickrst.c | 21 +++++----
 av1/encoder/x86/pickrst_avx2.c | 12 +++--
 av1/encoder/x86/pickrst_sse4.c | 18 ++++---
 test/wiener_test.cc | 55 ++++++++++++++--------
 6 files changed, 74 insertions(+), 39 deletions(-)

diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e97f39145d..6a0043c761 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -469,7 +469,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/;
 add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
 specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/;
- add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
+ add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
 specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/;
 }
 }
diff --git a/av1/encoder/arm/neon/highbd_pickrst_neon.c b/av1/encoder/arm/neon/highbd_pickrst_neon.c
index 47b5f5cfb7..8b0d3bcc7e 100644
--- a/av1/encoder/arm/neon/highbd_pickrst_neon.c
+++ b/av1/encoder/arm/neon/highbd_pickrst_neon.c
@@ -1008,10 +1008,13 @@ static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride,
 }
 
 void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8,
-                                   const uint8_t *src8, int h_start, int h_end,
+                                   const uint8_t *src8, int16_t *dgd_avg,
+                                   int16_t *src_avg, int h_start, int h_end,
                                    int v_start, int v_end, int dgd_stride,
                                    int src_stride, int64_t *M, int64_t *H,
                                    aom_bit_depth_t bit_depth) {
+  (void)dgd_avg;
+  (void)src_avg;
   assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED);
 
   const int wiener_halfwin = wiener_win >> 1;
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index b0d0d0bb78..a431c4dada 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@
-1044,10 +1044,13 @@ void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, #if CONFIG_AV1_HIGHBITDEPTH void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { + (void)dgd_avg; + (void)src_avg; int i, j, k, l; int32_t Y[WIENER_WIN2]; const int wiener_win2 = wiener_win * wiener_win; @@ -1659,9 +1662,10 @@ static AOM_INLINE void search_wiener( // functions. Optimize intrinsics of HBD design similar to LBD (i.e., // pre-calculate d and s buffers and avoid most of the C operations). av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, - rsc->src_buffer, limits->h_start, limits->h_end, - limits->v_start, limits->v_end, rsc->dgd_stride, - rsc->src_stride, M, H, cm->seq_params->bit_depth); + rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, + H, cm->seq_params->bit_depth); } else { av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, limits->h_start, @@ -2081,10 +2085,9 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { // and height aligned to multiple of 16 is considered for intrinsic purpose. rsc.dgd_avg = NULL; rsc.src_avg = NULL; -#if HAVE_AVX2 || HAVE_NEON - // The buffers allocated below are used during Wiener filter processing of low - // bitdepth path. Hence, allocate the same when Wiener filter is enabled in - // low bitdepth path. +#if HAVE_AVX2 + // The buffers allocated below are used during Wiener filter processing. + // Hence, allocate the same when Wiener filter is enabled. 
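+  // The single allocation holds 6 * RESTORATION_UNITSIZE_MAX *
+  // RESTORATION_UNITSIZE_MAX int16_t elements; judging from the setup in
+  // wiener_test.cc below, dgd_avg takes the first half and src_avg starts
+  // 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX elements in.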
if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; @@ -2221,7 +2224,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { best_luma_unit_size); } -#if HAVE_AVX || HAVE_NEON +#if HAVE_AVX2 if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { aom_free(cpi->pick_lr_ctxt.dgd_avg); cpi->pick_lr_ctxt.dgd_avg = NULL; diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c index 6658ed39a8..1f76576c9e 100644 --- a/av1/encoder/x86/pickrst_avx2.c +++ b/av1/encoder/x86/pickrst_avx2.c @@ -345,21 +345,27 @@ static INLINE void compute_stats_highbd_win5_opt_avx2( } void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c index 50db305802..3617d33fef 100644 --- a/av1/encoder/x86/pickrst_sse4.c +++ b/av1/encoder/x86/pickrst_sse4.c @@ -524,21 +524,27 @@ static INLINE void compute_stats_highbd_win5_opt_sse4_1( } void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, - int h_end, int v_start, int v_end, - int dgd_stride, int src_stride, int64_t *M, - int64_t *H, aom_bit_depth_t bit_depth) { + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/test/wiener_test.cc b/test/wiener_test.cc index 2886ed77df..c38e10e3c2 100644 --- a/test/wiener_test.cc +++ b/test/wiener_test.cc @@ -520,25 +520,27 @@ static void compute_stats_highbd_win_opt_c(int wiener_win, const uint8_t *dgd8, } void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int 
v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, - aom_bit_depth_t bit_depth) { + const uint8_t *src, int16_t *d, int16_t *s, + int h_start, int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) { compute_stats_highbd_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd, src, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd, src, d, s, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H, + bit_depth); } } static const int kIterations = 100; typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, + const uint8_t *src, int16_t *d, int16_t *s, + int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, + int64_t *M, int64_t *H, aom_bit_depth_t bit_depth); typedef std::tuple<const compute_stats_Func> WienerTestParam; @@ -552,11 +554,17 @@ class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> { dgd_buf = (uint16_t *)aom_memalign( 32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf)); ASSERT_NE(dgd_buf, nullptr); + const size_t buf_size = + sizeof(*buf) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; + buf = (int16_t *)aom_memalign(32, buf_size); + ASSERT_NE(buf, nullptr); + memset(buf, 0, buf_size); target_func_ = GET_PARAM(0); } void TearDown() override { aom_free(src_buf); aom_free(dgd_buf); + aom_free(buf); } void RunWienerTest(const int32_t wiener_win, int32_t run_times, aom_bit_depth_t bit_depth); @@ -568,6 +576,7 @@ class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> { libaom_test::ACMRandom rng_; uint16_t *src_buf; uint16_t *dgd_buf; + int16_t *buf; }; void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, @@ -595,6 +604,9 @@ void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, const int dgd_stride = h_end; const int src_stride = MAX_DATA_BLOCK; const int iters = run_times == 1 ? 
kIterations : 2; + int16_t *dgd_avg = buf; + int16_t *src_avg = + buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX); for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { dgd_buf[i] = rng_.Rand16() % (1 << bit_depth); @@ -607,16 +619,17 @@ void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, aom_usec_timer timer; aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, - v_start, v_end, dgd_stride, src_stride, M_ref, - H_ref, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M_ref, H_ref, bit_depth); } aom_usec_timer_mark(&timer); const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end, - dgd_stride, src_stride, M_test, H_test, bit_depth); + target_func_(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M_test, H_test, + bit_depth); } aom_usec_timer_mark(&timer); const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); @@ -663,6 +676,9 @@ void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win, const int dgd_stride = h_end; const int src_stride = MAX_DATA_BLOCK; const int iters = 1; + int16_t *dgd_avg = buf; + int16_t *src_avg = + buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX); for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { // Fill with alternating extreme values to maximize difference with // the average. @@ -674,12 +690,13 @@ void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win, dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin); const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf); - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M_ref, H_ref, - bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M_ref, H_ref, bit_depth); - target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end, - dgd_stride, src_stride, M_test, H_test, bit_depth); + target_func_(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M_test, H_test, + bit_depth); int failed = 0; for (int i = 0; i < wiener_win2; ++i) { -- GitLab From 20c535db99c2d9945440d22895fa8394f13b6f3c Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Wed, 10 Apr 2024 16:47:55 +0100 Subject: [PATCH 058/391] Cosmetic: cleanup arm aom_convolve8 functions Remove forward declarations and change comment style to align with the rest of the files in this directory. Also use the same style as libvpx for helper functions where applicable. 
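For illustration, the helper-function style being adopted initializes
vector temporaries at the point of declaration instead of declaring them
first and assigning afterwards, as in the convolve8_4 hunk below:

    // Old style: separate declaration and assignment.
    int16x4_t sum;
    sum = vmul_lane_s16(s0, filter_lo, 0);

    // New style: initialize at declaration, matching the libvpx helpers.
    int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);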
Change-Id: I3b85f943b2aaedde76ba5cd54dbf9f5cfd94b34e --- aom_dsp/arm/aom_convolve8_neon.c | 254 +++++++++--------- aom_dsp/arm/aom_convolve8_neon_dotprod.c | 312 +++++++++++------------ aom_dsp/arm/aom_convolve8_neon_i8mm.c | 220 ++++++++-------- 3 files changed, 376 insertions(+), 410 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index 7441108b01..142aaf82d8 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c @@ -31,14 +31,14 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); - int16x4_t sum; - sum = vmul_lane_s16(s0, filter_lo, 0); + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); sum = vmla_lane_s16(sum, s1, filter_lo, 1); sum = vmla_lane_s16(sum, s2, filter_lo, 2); sum = vmla_lane_s16(sum, s5, filter_hi, 1); sum = vmla_lane_s16(sum, s6, filter_hi, 2); sum = vmla_lane_s16(sum, s7, filter_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); return sum; @@ -51,14 +51,14 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); - int16x8_t sum; - sum = vmulq_lane_s16(s0, filter_lo, 0); + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); return vqrshrun_n_s16(sum, FILTER_BITS); @@ -69,8 +69,6 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - const int16x8_t filter = vld1q_s16(filter_x); - assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -80,36 +78,38 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); - if (h == 4) { - uint8x8_t t0, t1, t2, t3, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + const int16x8_t filter = vld1q_s16(filter_x); + if (h == 4) { + uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); src += 7; do { load_u8_8x4(src, 
src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); transpose_elems_inplace_u8_4x4(&d01, &d23); @@ -123,39 +123,40 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4; dst += 4; w -= 4; } while (w != 0); } else { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - if (w == 4) { do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = 
convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3); @@ -169,48 +170,49 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 8; } while (h > 0); } else { - uint8x8_t d4, d5, d6, d7; - int16x8_t s11, s12, s13, s14; - int width; - const uint8_t *s; - uint8_t *d; - do { - load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - width = w; - s = src + 7; - d = dst; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7; do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, 
&d4, &d5, &d6, &d7); @@ -224,6 +226,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s12; s5 = s13; s6 = s14; + s += 8; d += 8; width -= 8; @@ -253,33 +256,33 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; if (w == 4) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); - s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); - s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); src += 7 * src_stride; do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -291,42 +294,40 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int height; - const uint8_t *s; - uint8_t *d; - do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = 
vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - height = h; - s = src + 7 * src_stride; - d = dst; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int height = h; + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; do { load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -337,6 +338,7 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; d += 4 * dst_stride; height -= 4; diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c index 4a94626f26..120c479798 100644 --- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -24,76 +24,72 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +// Filter values always sum to 128. +#define FILTER_WEIGHT 128 + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, - const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[2]; - int32x4_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1); - - /* Further narrowing and packing is performed by the caller. */ +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); + + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, - const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0); - sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. 
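+  // Subtracting 128 from every sample removes exactly 128 * FILTER_WEIGHT
+  // from each convolution sum, since the filter taps sum to FILTER_WEIGHT:
+  //   sum_i f[i] * x[i] == sum_i f[i] * (x[i] - 128) + 128 * FILTER_WEIGHT.
+  // Seeding the accumulator with 128 * FILTER_WEIGHT below restores the
+  // unsigned result.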
+ int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -103,9 +99,6 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - const int32x4_t correction = vdupq_n_s32(128 << FILTER_BITS); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -117,19 +110,17 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -139,23 +130,20 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { - width = w; - s = src; - d = dst; + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, 
filter, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -221,41 +209,38 @@ static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, *b1 = vreinterpretq_s8_s16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filter) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum; +static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, samples_lo, filter, 0); - sum = vdotq_lane_s32(sum, samples_hi, filter, 1); + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filter) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0); - sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0); - sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. 
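+  // vqmovn_s32 narrows each 32-bit accumulator to 16 bits with saturation;
+  // the vqrshrun_n_s16 in the return then applies the rounding right shift
+  // by FILTER_BITS while narrowing to unsigned 8-bit pixels.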
+ int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -265,9 +250,7 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); - const int32x4_t correction = vdupq_n_s32(128 << FILTER_BITS); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); int8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -284,18 +267,17 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. int8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123); transpose_concat_4x4(s1, s2, s3, s4, &s1234); @@ -306,34 +288,33 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8x8_t t7, t8, t9, t10; load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. 
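+      // Pairing the oldest transposed block (s3456) with the newest (s78910)
+      // lets vqtbl2q_s8 with kDotProdMergeBlockTbl rebuild s4567, s5678 and
+      // s6789 by shifting in one, two or three new columns, so only four new
+      // rows are loaded per iteration.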
samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - int16x4_t d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); - int16x4_t d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); - int16x4_t d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); - int16x4_t d3 = - convolve8_4_sdot_partial(s3456, s78910, correction, filter); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -353,18 +334,17 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
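+      // In this eight-wide path each transposed window covers 8-byte rows,
+      // so every concatenated block is produced as a lo/hi pair of 16-byte
+      // vectors, and the dot products below consume the two halves
+      // separately.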
int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); @@ -376,16 +356,16 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8x8_t t7, t8, t9, t10; load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); @@ -398,19 +378,19 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - uint8x8_t d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, - s4567_hi, correction, filter); - uint8x8_t d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, - s5678_hi, correction, filter); - uint8x8_t d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, - s6789_hi, correction, filter); - uint8x8_t d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, - s78910_hi, correction, filter); + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c index 31b324aeb8..68e031461d 100644 --- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -23,64 +23,60 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. 
1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, - const int8x8_t filter, - const uint8x16x2_t permute_tbl) { - uint8x16_t permuted_samples[2]; - int32x4_t sum; +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); - sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1); - - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, - const int8x8_t filter, - const uint8x16x3_t permute_tbl) { - uint8x16_t permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); - - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); - sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0); - sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; + + // First 4 output values. 
+ int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -90,7 +86,6 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -102,19 +97,17 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filter, perm_tbl); - t1 = convolve8_4_usdot(s1, filter, perm_tbl); - t2 = convolve8_4_usdot(s2, filter, perm_tbl); - t3 = convolve8_4_usdot(s3, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -124,23 +117,20 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { - width = w; - s = src; - d = dst; + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - d3 = convolve8_8_usdot(s3, filter, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -207,37 +197,33 @@ static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, *b1 = vreinterpretq_u8_u16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, - const uint8x16_t samples_hi, - const int8x8_t filter) { - /* Sample permutation is performed by the caller. 
*/ - int32x4_t sum; - - sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0); - sum = vusdotq_lane_s32(sum, samples_hi, filter, 1); +static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, - const uint8x16_t samples0_hi, - const uint8x16_t samples1_lo, - const uint8x16_t samples1_hi, - const int8x8_t filter) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0); - sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0); - sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + + // First 4 output values. + int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -247,7 +233,7 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); uint8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -264,9 +250,8 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. uint8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123); transpose_concat_4x4(s1, s2, s3, s4, &s1234); @@ -280,25 +265,25 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. 
samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - int16x4_t d0 = convolve8_4_usdot_partial(s0123, s4567, filter); - int16x4_t d1 = convolve8_4_usdot_partial(s1234, s5678, filter); - int16x4_t d2 = convolve8_4_usdot_partial(s2345, s6789, filter); - int16x4_t d3 = convolve8_4_usdot_partial(s3456, s78910, filter); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -318,9 +303,8 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); @@ -336,7 +320,7 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); @@ -349,19 +333,19 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - uint8x8_t d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, - s4567_hi, filter); - uint8x8_t d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, - s5678_hi, filter); - uint8x8_t d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, - s6789_hi, filter); - uint8x8_t d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, - s78910_hi, filter); + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; -- GitLab From 23c94347d84241c322f3b40daf120047ff4f8d56 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Mon, 15 Apr 2024 11:30:56 +0100 Subject: [PATCH 059/391] Add 4-tap specialisation to aom_convolve8_horiz_neon Add specialised path for 4-tap filters in aom_convolve8_horiz_neon. This gives between 40% and 50% uplift compared to using the 8-tap path. Change-Id: If48815ef9d1d8fe6882f3a1290d8a7cbffc57e9e --- aom_dsp/arm/aom_convolve8_neon.c | 151 ++++++++++++++++++++++++++++--- 1 file changed, 137 insertions(+), 14 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index 142aaf82d8..6a177b2e6b 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c @@ -20,6 +20,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" @@ -64,20 +65,11 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)x_step_q4; - (void)filter_y; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1); - +static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { const int16x8_t filter = vld1q_s16(filter_x); if (h == 4) { @@ -239,6 +231,137 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filter) { + int16x4_t sum = vmul_lane_s16(s0, filter, 0); + sum = vmla_lane_s16(sum, s1, filter, 1); + sum = vmla_lane_s16(sum, s2, filter, 2); + sum = vmla_lane_s16(sum, s3, filter, 3); + + return sum; +} + +static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filter) { + int16x8_t sum = vmulq_lane_s16(s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { + // All filter values are even, halve to reduce intermediate precision + // requirements. 
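+  // For example (values here are hypothetical), halving the even 4-tap
+  // kernel { -12, 76, 76, -12 } gives { -6, 38, 38, -6 }; the final
+  // rounding shift is then FILTER_BITS - 1 instead of FILTER_BITS, so no
+  // precision is lost.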
+ const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1); + + if (w == 4) { + do { + int16x8_t t0 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 0 * src_stride))); + int16x8_t t1 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 1 * src_stride))); + + int16x4_t s0[4], s1[4]; + s0[0] = vget_low_s16(t0); + s0[1] = vget_low_s16(vextq_s16(t0, t0, 1)); + s0[2] = vget_low_s16(vextq_s16(t0, t0, 2)); + s0[3] = vget_low_s16(vextq_s16(t0, t0, 3)); + + s1[0] = vget_low_s16(t1); + s1[1] = vget_low_s16(vextq_s16(t1, t1, 1)); + s1[2] = vget_low_s16(vextq_s16(t1, t1, 2)); + s1[3] = vget_low_s16(vextq_s16(t1, t1, 3)); + + int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter); + int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + int16x8_t t0 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride))); + int16x8_t t1 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride))); + + s += 8; + do { + int16x8_t t2 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride))); + int16x8_t t3 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride))); + + int16x8_t s0[4], s1[4]; + s0[0] = t0; + s0[1] = vextq_s16(t0, t2, 1); + s0[2] = vextq_s16(t0, t2, 2); + s0[3] = vextq_s16(t0, t2, 3); + + s1[0] = t1; + s1[1] = vextq_s16(t1, t3, 1); + s1[2] = vextq_s16(t1, t3, 2); + s1[3] = vextq_s16(t1, t3, 3); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); + + store_u8_8x2(d, dst_stride, d0, d1); + + t0 = t2; + t1 = t3; + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} + +void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)x_step_q4; + (void)filter_y; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1); + + if (get_filter_taps_convolve8(filter_x) <= 4) { + convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w, + h); + } else { + convolve8_horiz_8tap_neon(src, src_stride, dst, dst_stride, filter_x, w, h); + } +} + void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, -- GitLab From 06b5b2f032d71529f675bb030fba90a82e510728 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Tue, 9 Apr 2024 17:48:50 +0100 Subject: [PATCH 060/391] Add SVE implementation of av1_compute_stats_highbd Add SVE implementation of av1_compute_stats_highbd as well as the corresponding tests. This gives around 45% uplift over the Neon implementation. Some helper functions are common with the standard bitdepth, so move them to a separate header file. 
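For background, the statistics both the Neon and SVE kernels accumulate
can be modelled with the scalar sketch below. The sketch is illustrative
only: the helper name is hypothetical, the index layout is simplified
(the real code fills a transposed M_trn and only the upper triangle of
H), and the bit-depth divider applied to M and H is omitted.

  #include <stdint.h>

  // For each pixel, gather the wiener_win * wiener_win window w of
  // (dgd - avg) samples around it, then accumulate the cross-correlation
  // M += src_pixel * w and the auto-covariance H += w * w^T.
  static void stats_model_sketch(const int16_t *dgd_avg, int dgd_stride,
                                 const int16_t *src_avg, int src_stride,
                                 int width, int height, int wiener_win,
                                 int64_t *M, int64_t *H) {
    const int win2 = wiener_win * wiener_win;
    for (int i = 0; i < height; i++) {
      for (int j = 0; j < width; j++) {
        int16_t w[49];  // Window of (dgd - avg) samples, row-major.
        for (int r = 0; r < wiener_win; r++) {
          for (int c = 0; c < wiener_win; c++) {
            w[r * wiener_win + c] = dgd_avg[(i + r) * dgd_stride + j + c];
          }
        }
        const int64_t s = src_avg[i * src_stride + j];
        for (int a = 0; a < win2; a++) {
          M[a] += s * w[a];  // Cross-correlation.
          for (int b = 0; b < win2; b++) {
            H[a * win2 + b] += (int64_t)w[a] * w[b];  // Auto-covariance.
          }
        }
      }
    }
  }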
Change-Id: I612bffef53bd28b989fc3979b119d23ab49011f1 --- av1/av1.cmake | 3 + av1/common/av1_rtcd_defs.pl | 2 +- av1/encoder/arm/neon/highbd_pickrst_sve.c | 441 ++++++++++++++++++++++ av1/encoder/arm/neon/pickrst_sve.c | 132 +------ av1/encoder/arm/neon/pickrst_sve.h | 151 ++++++++ av1/encoder/pickrst.c | 20 +- test/wiener_test.cc | 5 + 7 files changed, 618 insertions(+), 136 deletions(-) create mode 100644 av1/encoder/arm/neon/highbd_pickrst_sve.c create mode 100644 av1/encoder/arm/neon/pickrst_sve.h diff --git a/av1/av1.cmake b/av1/av1.cmake index b6cf974aa7..f156a1926f 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -502,6 +502,9 @@ if(CONFIG_AV1_HIGHBITDEPTH) "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c") + + list(APPEND AOM_AV1_ENCODER_INTRIN_SVE + "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_sve.c") endif() if(CONFIG_ACCOUNTING) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 6a0043c761..7d917eb8b1 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -470,7 +470,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/; add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; - specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/; + specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon sve/; } } diff --git a/av1/encoder/arm/neon/highbd_pickrst_sve.c b/av1/encoder/arm/neon/highbd_pickrst_sve.c new file mode 100644 index 0000000000..3ffd6749dc --- /dev/null +++ b/av1/encoder/arm/neon/highbd_pickrst_sve.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <arm_sve.h> +#include <string.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" +#include "av1/encoder/arm/neon/pickrst_sve.h" + +static INLINE uint16_t find_average_sve(const uint16_t *src, int src_stride, + int width, int height) { + uint64x2_t avg_u64 = vdupq_n_u64(0); + uint16x8_t ones = vdupq_n_u16(1); + + // Use a predicate to compute the last columns. + svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 
8 : width % 8); + + int h = height; + do { + int j = width; + const uint16_t *src_ptr = src; + while (j > 8) { + uint16x8_t s = vld1q_u16(src_ptr); + avg_u64 = aom_udotq_u16(avg_u64, s, ones); + + j -= 8; + src_ptr += 8; + } + uint16x8_t s_end = svget_neonq_u16(svld1_u16(pattern, src_ptr)); + avg_u64 = aom_udotq_u16(avg_u64, s_end, ones); + + src += src_stride; + } while (--h != 0); + return (uint16_t)(vaddvq_u64(avg_u64) / (width * height)); +} + +static INLINE void compute_sub_avg(const uint16_t *buf, int buf_stride, + int16_t avg, int16_t *buf_avg, + int buf_avg_stride, int width, int height) { + uint16x8_t avg_u16 = vdupq_n_u16(avg); + + // Use a predicate to compute the last columns. + svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8); + + uint16x8_t avg_end = svget_neonq_u16(svdup_n_u16_z(pattern, avg)); + + do { + int j = width; + const uint16_t *buf_ptr = buf; + int16_t *buf_avg_ptr = buf_avg; + while (j > 8) { + uint16x8_t d = vld1q_u16(buf_ptr); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d, avg_u16))); + + j -= 8; + buf_ptr += 8; + buf_avg_ptr += 8; + } + uint16x8_t d_end = svget_neonq_u16(svld1_u16(pattern, buf_ptr)); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d_end, avg_end))); + + buf += buf_stride; + buf_avg += buf_avg_stride; + } while (--height > 0); +} + +static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp, + const int wiener_win2, + const int divider) { + for (int i = 0; i < wiener_win2 - 2; i = i + 2) { + // Transpose the first 2x2 square. It needs a special case as the element + // of the bottom left is on the diagonal. + int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1); + int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1); + + int64x2_t tr_row = aom_vtrn2q_s64(row0, row1); + + vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0)); + vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row); + + // Transpose and store all the remaining 2x2 squares of the line. + for (int j = i + 3; j < wiener_win2; j = j + 2) { + row0 = vld1q_s64(H_tmp + i * wiener_win2 + j); + row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j); + + int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1); + int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1); + + vst1q_s64(H_tmp + (j + 0) * wiener_win2 + i, tr_row0); + vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1); + } + } + for (int i = 0; i < wiener_win2 * wiener_win2; i++) { + H[i] += H_tmp[i] / divider; + } +} + +// Transpose the matrix that has just been computed and accumulate it in M. +static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn, + const int wiener_win, const int divider) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *M++ += (int64_t)(M_trn[tr_idx] / divider); + } + } +} + +// This function computes two matrices: the cross-correlation between the src +// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). +// +// M is of size 7 * 7. It needs to be filled such that multiplying one element +// from src with each element of a row of the wiener window will fill one +// column of M. However this is not very convenient in terms of memory +// accesses, as it means we do contiguous loads of dgd but strided stores to M. +// As a result, we use an intermediate matrix M_trn which is instead filled +// such that one row of the wiener window gives one row of M_trn. Once fully +// computed, M_trn is then transposed to return M. 
+//
+// H is of size 49 * 49. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void highbd_compute_stats_win7_sve(
+    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
+    int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
+  const int wiener_win = 7;
+  const int wiener_win2 = wiener_win * wiener_win;
+
+  // Use a predicate to compute the last columns of the block for H.
+  svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
+
+  // Use intermediate matrices for H and M to perform the computation, they
+  // will be accumulated into the original H and M at the end.
+  int64_t M_trn[49];
+  memset(M_trn, 0, sizeof(M_trn));
+
+  int64_t H_tmp[49 * 49];
+  memset(H_tmp, 0, sizeof(H_tmp));
+
+  do {
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int j = 0;
+      while (j < width) {
+        int16x8_t dgd[7];
+        load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
+                     &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]);
+        int16x8_t s = vld1q_s16(src_avg + j);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win7(s, dgd, M_trn, row);
+
+        j += 8;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j < width - 8) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        int16x8_t dgd0[7];
+        load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
+                     &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]);
+
+        // Perform computation of the first column with itself (28 elements).
+        // For the first column this will fill the upper triangle of the 7x7
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 7x7 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column.
+          int16x8_t dgd1[7];
+          load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+                       &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
+
+          // Compute all elements from the combination of both columns (49
+          // elements).
+          compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
+        }
+      }
+      j += 8;
+    }
+
+    // Process remaining columns using a predicate to discard excess elements.
+    for (int col0 = 0; col0 < wiener_win; col0++) {
+      // Load first column.
+      int16x8_t dgd0[7];
+      dgd0[0] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
+      dgd0[1] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
+      dgd0[2] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
+      dgd0[3] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
+      dgd0[4] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
+      dgd0[5] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0));
+      dgd0[6] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0));
+
+      // Perform computation of the first column with itself (28 elements).
+      // For the first column this will fill the upper triangle of the 7x7
+      // matrix at the top left of the H matrix. For the next columns this
+      // will fill the upper triangle of the other 7x7 matrices around H's
+      // diagonal.
+      compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+      // All computation next to the matrix diagonal has already been done.
+      for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+        // Load second column.
+        int16x8_t dgd1[7];
+        load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+                     &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
+
+        // Compute all elements from the combination of both columns (49
+        // elements).
+        compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
+      }
+    }
+    dgd_avg += dgd_avg_stride;
+    src_avg += src_avg_stride;
+  } while (--height != 0);
+
+  // Transpose M_trn.
+  acc_transpose_M(M, M_trn, 7, bit_depth_divider);
+
+  // Copy the upper triangle of H into the lower one.
+  copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 5 * 5. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 25 * 25. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void highbd_compute_stats_win5_sve(
+    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
+    int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
+  const int wiener_win = 5;
+  const int wiener_win2 = wiener_win * wiener_win;
+
+  // Use a predicate to compute the last columns of the block for H.
+  svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
+
+  // Use intermediate matrices for H and M to perform the computation, they
+  // will be accumulated into the original H and M at the end.
+  int64_t M_trn[25];
+  memset(M_trn, 0, sizeof(M_trn));
+
+  int64_t H_tmp[25 * 25];
+  memset(H_tmp, 0, sizeof(H_tmp));
+
+  do {
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int j = 0;
+      while (j < width) {
+        int16x8_t dgd[5];
+        load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
+                     &dgd[2], &dgd[3], &dgd[4]);
+        int16x8_t s = vld1q_s16(src_avg + j);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win5(s, dgd, M_trn, row);
+
+        j += 8;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j < width - 8) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[5];
+        load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
+                     &dgd0[2], &dgd0[3], &dgd0[4]);
+
+        // Perform computation of the first column with itself (15 elements).
+        // For the first column this will fill the upper triangle of the 5x5
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 5x5 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column.
+          int16x8_t dgd1[5];
+          load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+                       &dgd1[2], &dgd1[3], &dgd1[4]);
+
+          // Compute all elements from the combination of both columns (25
+          // elements).
+          compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
+        }
+      }
+      j += 8;
+    }
+
+    // Process remaining columns using a predicate to discard excess elements.
+    for (int col0 = 0; col0 < wiener_win; col0++) {
+      int16x8_t dgd0[5];
+      dgd0[0] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
+      dgd0[1] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
+      dgd0[2] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
+      dgd0[3] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
+      dgd0[4] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
+
+      // Perform computation of the first column with itself (15 elements).
+      // For the first column this will fill the upper triangle of the 5x5
+      // matrix at the top left of the H matrix. For the next columns this
+      // will fill the upper triangle of the other 5x5 matrices around H's
+      // diagonal.
+      compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+      // All computation next to the matrix diagonal has already been done.
+      for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+        // Load second column.
+        int16x8_t dgd1[5];
+        load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+                     &dgd1[2], &dgd1[3], &dgd1[4]);
+
+        // Compute all elements from the combination of both columns (25
+        // elements).
+        compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
+      }
+    }
+    dgd_avg += dgd_avg_stride;
+    src_avg += src_avg_stride;
+  } while (--height != 0);
+
+  // Transpose M_trn.
+  acc_transpose_M(M, M_trn, 5, bit_depth_divider);
+
+  // Copy the upper triangle of H into the lower one.
+  copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
+}
+
+void av1_compute_stats_highbd_sve(int wiener_win, const uint8_t *dgd8,
+                                  const uint8_t *src8, int16_t *dgd_avg,
+                                  int16_t *src_avg, int h_start, int h_end,
+                                  int v_start, int v_end, int dgd_stride,
+                                  int src_stride, int64_t *M, int64_t *H,
+                                  aom_bit_depth_t bit_depth) {
+  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = wiener_win >> 1;
+  const int32_t width = h_end - h_start;
+  const int32_t height = v_end - v_start;
+
+  uint8_t bit_depth_divider = 1;
+  if (bit_depth == AOM_BITS_12)
+    bit_depth_divider = 16;
+  else if (bit_depth == AOM_BITS_10)
+    bit_depth_divider = 4;
+
+  const uint16_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+  memset(M, 0, sizeof(*M) * wiener_win * wiener_win);
+
+  const uint16_t avg = find_average_sve(dgd_start, dgd_stride, width, height);
+
+  // dgd_avg and src_avg have been memset to zero before calling this function
+  // so round up the stride to the next multiple of 8 so that we don't have to
+  // worry about a tail loop when computing M.
+  const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
+  const int src_avg_stride = (width & ~7) + 8;
+
+  // Compute (dgd - avg) and store it in dgd_avg.
+  // The wiener window will slide along the dgd frame, centered on each pixel.
+  // For the top left pixel and all the pixels on the side of the frame this
+  // means half of the window will be outside of the frame. As such the actual
+  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
+  const int vert_offset = v_start - wiener_halfwin;
+  const int horiz_offset = h_start - wiener_halfwin;
+  const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
+                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin);
+
+  // Compute (src - avg) and store it in src_avg.
+  const uint16_t *src_start = src + h_start + v_start * src_stride;
+  compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width,
+                  height);
+
+  if (wiener_win == WIENER_WIN) {
+    highbd_compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg,
+                                  src_avg_stride, width, height, M, H,
+                                  bit_depth_divider);
+  } else {
+    highbd_compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg,
+                                  src_avg_stride, width, height, M, H,
+                                  bit_depth_divider);
+  }
+}
diff --git a/av1/encoder/arm/neon/pickrst_sve.c b/av1/encoder/arm/neon/pickrst_sve.c
index a519ecc5f5..88aa135e25 100644
--- a/av1/encoder/arm/neon/pickrst_sve.c
+++ b/av1/encoder/arm/neon/pickrst_sve.c
@@ -22,6 +22,7 @@
 #include "aom_dsp/arm/transpose_neon.h"
 #include "av1/common/restoration.h"
 #include "av1/encoder/pickrst.h"
+#include "av1/encoder/arm/neon/pickrst_sve.h"
 
 static INLINE uint8_t find_average_sve(const uint8_t *src, int src_stride,
                                        int width, int height) {
@@ -123,137 +124,6 @@ static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn,
   }
 }
 
-// Swap each half of the dgd vectors so that we can accumulate the result of
-// the dot-products directly in the destination matrix.
-static INLINE int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { - int16x8_t dgd_trn0 = vreinterpretq_s16_s64( - vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); - int16x8_t dgd_trn1 = vreinterpretq_s16_s64( - vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); - - return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 }; -} - -static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], - int64_t *M, int row) { - const int wiener_win = 5; - - int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); - int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); - - int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); - cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); - vst1q_s64(M + row * wiener_win + 0, cross_corr01); - - int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); - int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); - - int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); - cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); - vst1q_s64(M + row * wiener_win + 2, cross_corr23); - - int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]); - M[row * wiener_win + 4] += vaddvq_s64(m4); -} - -static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], - int64_t *M, int row) { - const int wiener_win = 7; - - int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); - int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); - - int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); - cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); - vst1q_s64(M + row * wiener_win + 0, cross_corr01); - - int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); - int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); - - int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); - cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); - vst1q_s64(M + row * wiener_win + 2, cross_corr23); - - int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4); - int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]); - - int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0); - cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1); - vst1q_s64(M + row * wiener_win + 4, cross_corr45); - - int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]); - M[row * wiener_win + 6] += vaddvq_s64(m6); -} - -static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, - const int wiener_win, - const int wiener_win2) { - for (int row0 = 0; row0 < wiener_win; row0++) { - for (int row1 = row0; row1 < wiener_win; row1++) { - int auto_cov_idx = - (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; - - int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]); - H[auto_cov_idx] += vaddvq_s64(auto_cov); - } - } -} - -static INLINE void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, - int row0, int row1, int64_t *H) { - for (int col0 = 0; col0 < 5; col0++) { - int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5); - - int64x2_t h01 = vld1q_s64(H + auto_cov_idx); - int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); - - int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); - auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); - vst1q_s64(H + auto_cov_idx, auto_cov01); - - int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); - int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); - - int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, 
dgd23.val[0], dgd0[col0], 0); - auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); - vst1q_s64(H + auto_cov_idx + 2, auto_cov23); - - int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]); - H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4); - } -} - -static INLINE void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, - int row0, int row1, int64_t *H) { - for (int col0 = 0; col0 < 7; col0++) { - int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7); - - int64x2_t h01 = vld1q_s64(H + auto_cov_idx); - int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); - - int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); - auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); - vst1q_s64(H + auto_cov_idx, auto_cov01); - - int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); - int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); - - int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); - auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); - vst1q_s64(H + auto_cov_idx + 2, auto_cov23); - - int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4); - int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]); - - int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0); - auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1); - vst1q_s64(H + auto_cov_idx + 4, auto_cov45); - - int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]); - H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6); - } -} - // This function computes two matrices: the cross-correlation between the src // buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). // diff --git a/av1/encoder/arm/neon/pickrst_sve.h b/av1/encoder/arm/neon/pickrst_sve.h new file mode 100644 index 0000000000..ffa737611e --- /dev/null +++ b/av1/encoder/arm/neon/pickrst_sve.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_SVE_H_ +#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_SVE_H_ + +#include <arm_neon.h> +#include <arm_sve.h> + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" + +// Swap each half of the dgd vectors so that we can accumulate the result of +// the dot-products directly in the destination matrix. 
+static INLINE int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { + int16x8_t dgd_trn0 = vreinterpretq_s16_s64( + vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + int16x8_t dgd_trn1 = vreinterpretq_s16_s64( + vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + + return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 }; +} + +static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], + int64_t *M, int row) { + const int wiener_win = 5; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, cross_corr23); + + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]); + M[row * wiener_win + 4] += vaddvq_s64(m4); +} + +static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], + int64_t *M, int row) { + const int wiener_win = 7; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, cross_corr23); + + int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]); + + int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0); + cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 4, cross_corr45); + + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]); + M[row * wiener_win + 6] += vaddvq_s64(m6); +} + +static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, + const int wiener_win, + const int wiener_win2) { + for (int row0 = 0; row0 < wiener_win; row0++) { + for (int row1 = row0; row1 < wiener_win; row1++) { + int auto_cov_idx = + (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; + + int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]); + H[auto_cov_idx] += vaddvq_s64(auto_cov); + } + } +} + +static INLINE void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 5; col0++) { + int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5); + + int64x2_t h01 = vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, 
dgd23.val[0], dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]); + H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4); + } +} + +static INLINE void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 7; col0++) { + int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7); + + int64x2_t h01 = vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]); + + int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0); + auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 4, auto_cov45); + + int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]); + H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6); + } +} + +#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_SVE_H_ diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c index a431c4dada..7c30e3a9d9 100644 --- a/av1/encoder/pickrst.c +++ b/av1/encoder/pickrst.c @@ -2085,10 +2085,17 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { // and height aligned to multiple of 16 is considered for intrinsic purpose. rsc.dgd_avg = NULL; rsc.src_avg = NULL; -#if HAVE_AVX2 +#if HAVE_AVX2 || HAVE_SVE // The buffers allocated below are used during Wiener filter processing. - // Hence, allocate the same when Wiener filter is enabled. - if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { + // Hence, allocate the same when Wiener filter is enabled. Make sure to + // allocate these buffers only for the SIMD extensions that make use of them + // (i.e. AVX2 for low bitdepth and SVE for low and high bitdepth). 
+#if HAVE_AVX2 + bool allocate_buffers = !cpi->sf.lpf_sf.disable_wiener_filter && !highbd; +#elif HAVE_SVE + bool allocate_buffers = !cpi->sf.lpf_sf.disable_wiener_filter; +#endif + if (allocate_buffers) { const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg, @@ -2224,8 +2231,13 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { best_luma_unit_size); } +#if HAVE_AVX2 || HAVE_SVE #if HAVE_AVX2 - if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { + bool free_buffers = !cpi->sf.lpf_sf.disable_wiener_filter && !highbd; +#elif HAVE_SVE + bool free_buffers = !cpi->sf.lpf_sf.disable_wiener_filter; +#endif + if (free_buffers) { aom_free(cpi->pick_lr_ctxt.dgd_avg); cpi->pick_lr_ctxt.dgd_avg = NULL; } diff --git a/test/wiener_test.cc b/test/wiener_test.cc index c38e10e3c2..3f9ce9ff41 100644 --- a/test/wiener_test.cc +++ b/test/wiener_test.cc @@ -766,6 +766,11 @@ INSTANTIATE_TEST_SUITE_P(NEON, WienerTestHighbd, ::testing::Values(av1_compute_stats_highbd_neon)); #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P(SVE, WienerTestHighbd, + ::testing::Values(av1_compute_stats_highbd_sve)); +#endif // HAVE_SVE + // A test that reproduces b/274668506: signed integer overflow in // update_a_sep_sym(). TEST(SearchWienerTest, 10bitSignedIntegerOverflowInUpdateASepSym) { -- GitLab From 9a455ac036d9ef6482796b70bf27fd69cd0b58a8 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 12 Apr 2024 15:34:32 +0100 Subject: [PATCH 061/391] Add 4-tap specialisation to aom_convolve8_horiz_neon_dotprod Add specialised path for 4-tap filters in aom_convolve8_horiz_neon_dotprod. This gives between 30% and 50% uplift compared to using the 8-tap path. This is a port from libvpx, using hash 9d8d71b41bd5b5be78c765bb099cbde84b99f25c. 
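For reference, the signed dot-product path relies on the identity
checked by the scalar sketch below (tap and sample values are
hypothetical): subtracting 128 from each unsigned sample and then
pre-loading the accumulator with 128 * (sum of the filter taps) gives
the same sum as filtering the unsigned samples directly. The 4-tap
kernels additionally halve the filter taps, which simply halves the
pre-loaded term as well.

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const int16_t f[4] = { -12, 76, 76, -12 };  // Hypothetical even taps.
    const uint8_t u[4] = { 3, 200, 150, 90 };   // Arbitrary samples.
    int32_t ref = 0, xfm = 0, tap_sum = 0;
    for (int k = 0; k < 4; k++) {
      ref += f[k] * u[k];                   // Unsigned formulation.
      xfm += f[k] * ((int32_t)u[k] - 128);  // What sdot computes.
      tap_sum += f[k];
    }
    xfm += 128 * tap_sum;            // The pre-loaded accumulator term.
    printf("%d == %d\n", ref, xfm);  // Prints two identical sums.
    return 0;
  }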
Change-Id: I48fb2aa557cb0d8b5a70a4545905e63d9ed2c86f --- aom_dsp/arm/aom_convolve8_neon_dotprod.c | 148 ++++++++++++++++++++--- 1 file changed, 134 insertions(+), 14 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c index 120c479798..576db8e4fc 100644 --- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -20,6 +20,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" @@ -93,22 +94,11 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { +static INLINE void convolve8_horiz_8tap_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)x_step_q4; - (void)filter_y; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1); - if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { @@ -158,6 +148,136 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } } +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + // (Divide by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); + + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + // (Divide by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. 
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE void convolve8_horiz_4tap_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
+  const int16x4_t x_filter = vld1_s16(filter_x + 2);
+  // All 4-tap and bilinear filter values are even, so halve them to reduce
+  // intermediate precision requirements.
+  const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+  if (width == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+      int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+      int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+      int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+
+      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      height -= 4;
+    } while (height > 0);
+  } else {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int w = width;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        w -= 8;
+      } while (w != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      height -= 4;
+    } while (height > 0);
+  }
+}
+
+void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)x_step_q4;
+  (void)filter_y;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1);
+
+  if (get_filter_taps_convolve8(filter_x) <= 4) {
+    convolve8_horiz_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride,
+                                      filter_x, w, h);
+  } else {
+    convolve8_horiz_8tap_neon_dotprod(src, src_stride, dst, dst_stride,
+                                      filter_x, w, h);
+  }
+}
+
 static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                         int8x8_t a3, int8x16_t *b) {
   // Transpose 8-bit elements and concatenate result rows as follows:
--
GitLab


From e7dcb10a44ee19b5a9d0654aad9358f49e8ec962 Mon Sep 17 00:00:00 2001
From: Salome Thirot <salome.thirot@arm.com>
Date: Fri, 12 Apr 2024 16:36:54 +0100
Subject: [PATCH 062/391] Add 4-tap specialisation to aom_convolve8_horiz_neon_i8mm

Add specialised path for 4-tap filters in aom_convolve8_horiz_neon_i8mm.
This gives between 30% and 50% uplift compared to using the 8-tap path.

This is a port from libvpx, using hash
9d8d71b41bd5b5be78c765bb099cbde84b99f25c.
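
Editorial note: the i8mm version differs from the dotprod one in a single, instructive way. USDOT multiplies unsigned samples by signed filter values directly, so the -128 range transform and the compensating accumulator bias both disappear. The side-by-side sketch below is illustrative only: it omits the shared permute step, assumes a toolchain targeting the dotprod and i8mm extensions, and takes FILTER_WEIGHT to be 128, the sum of the unhalved taps.

#include <arm_neon.h>

#define FILTER_WEIGHT 128

// Armv8.4-A DotProd: only signed-by-signed SDOT exists, so the samples are
// shifted into [-128, 127] beforehand and the accumulator is pre-biased by
// 128 * (sum of the halved taps) to cancel that shift.
static inline int32x4_t accumulate_sdot(int8x16_t perm_samples_128,
                                        int8x8_t halved_filter) {
  int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
  return vdotq_lane_s32(acc, perm_samples_128, halved_filter, 0);
}

// Armv8.6-A I8MM: USDOT takes u8 samples and s8 filters directly, so the
// accumulator starts at zero and no range transform is needed.
static inline int32x4_t accumulate_usdot(uint8x16_t perm_samples,
                                         int8x8_t halved_filter) {
  return vusdotq_lane_s32(vdupq_n_s32(0), perm_samples, halved_filter, 0);
}
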
Change-Id: I7913edd7c5d8546bc5ec0800dac2a1866c9f3739 --- aom_dsp/arm/aom_convolve8_neon_i8mm.c | 135 +++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 14 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c index 68e031461d..da0210ac3c 100644 --- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -19,6 +19,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" @@ -80,22 +81,11 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { +static INLINE void convolve8_horiz_8tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)x_step_q4; - (void)filter_y; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1); - if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { @@ -145,6 +135,123 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); + + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve8_horiz_4tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) { + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. 
+ const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + if (width == 4) { + const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve4_4_h(s0, filter, perm_tbl); + int16x4_t t1 = convolve4_4_h(s1, filter, perm_tbl); + int16x4_t t2 = convolve4_4_h(s2, filter, perm_tbl); + int16x4_t t3 = convolve4_4_h(s3, filter, perm_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + int w = width; + const uint8_t *s = src; + uint8_t *d = dst; + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve4_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve4_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve4_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve4_8_h(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)x_step_q4; + (void)filter_y; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1); + + if (get_filter_taps_convolve8(filter_x) <= 4) { + convolve8_horiz_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride, + filter_x, w, h); + } else { + convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x, + w, h); + } +} + static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x16_t *b) { -- GitLab From 0384c6ba0f5e979021807e52c7197b511336b4a3 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 12 Apr 2024 11:51:17 +0100 Subject: [PATCH 063/391] Test aom_convolve8 with bilinear filters The unit-tests for aom_convolve8 are only covering 4-tap and 8-tap filters. Add coverage for bilinear (2-tap) filters as well. 
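
Editorial note: the bilinear kernels exercised by these tests sit in the same 8-entry SUBPEL_TAPS layout as every other bank, with only the two centre taps non-zero, which is what lets a tap-counting helper steer them onto cheaper paths. The sketch below is illustrative; the real helper is get_filter_taps_convolve8() in aom_dsp/arm/aom_filter.h and may differ in detail.

#include <stdint.h>

// Half-pel bilinear kernel in the 8-tap layout: only taps 3 and 4 are set.
static const int16_t kBilinearHalfPel[8] = { 0, 0, 0, 64, 64, 0, 0, 0 };

// Classify a kernel by its outermost non-zero taps.
static int filter_taps(const int16_t *filter) {
  if (filter[0] != 0 || filter[7] != 0) return 8;
  if (filter[1] != 0 || filter[6] != 0) return 6;
  if (filter[2] != 0 || filter[5] != 0) return 4;
  return 2;
}
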
Change-Id: I1bf4ab4000db66ff192fab28b08f8c43af450660 --- test/convolve_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/convolve_test.cc b/test/convolve_test.cc index cab590927b..41e838ac6a 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -474,7 +474,7 @@ class ConvolveTestBase : public ::testing::TestWithParam<ConvolveParam> { ref = CONVERT_TO_BYTEPTR(ref16_); } int subpel_search; - for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + for (subpel_search = USE_2_TAPS; subpel_search <= USE_8_TAPS; ++subpel_search) { for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { const InterpFilter filter = (InterpFilter)filter_bank; @@ -555,7 +555,7 @@ class ConvolveTestBase : public ::testing::TestWithParam<ConvolveParam> { } if (axis) seed_val += 8; int subpel_search; - for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + for (subpel_search = USE_2_TAPS; subpel_search <= USE_8_TAPS; ++subpel_search) { for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { @@ -687,7 +687,7 @@ TEST_P(LowbdConvolveTest, GuardBlocks) { CheckGuardBlocks(); } void FiltersWontSaturateWhenAddedPairwise() { int subpel_search; - for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + for (subpel_search = USE_2_TAPS; subpel_search <= USE_8_TAPS; ++subpel_search) { for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { const InterpFilter filter = (InterpFilter)filter_bank; -- GitLab From 42e3156b5343e725aa79cda40a464a891e6c8735 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Tue, 16 Apr 2024 11:39:22 +0100 Subject: [PATCH 064/391] Add 2-tap path for aom_convolve8_horiz_neon Add a specialized Neon implementation for 2-tap filters and use it instead of the 4-tap implementation in all 3 Neon versions of aom_convolve8_horiz. This provides between 20% and 60% uplift depending on the architecture extension. Change-Id: I48da9553cd391dd801affdc1c62995d1f1a48f15 --- aom_dsp/arm/aom_convolve8_neon.c | 8 +- aom_dsp/arm/aom_convolve8_neon.h | 106 +++++++++++++++++++++++ aom_dsp/arm/aom_convolve8_neon_dotprod.c | 8 +- aom_dsp/arm/aom_convolve8_neon_i8mm.c | 8 +- 4 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 aom_dsp/arm/aom_convolve8_neon.h diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index 6a177b2e6b..5665b5e125 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c @@ -20,6 +20,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" @@ -354,7 +355,12 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); - if (get_filter_taps_convolve8(filter_x) <= 4) { + int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, + h); + } else if (filter_taps == 4) { convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w, h); } else { diff --git a/aom_dsp/arm/aom_convolve8_neon.h b/aom_dsp/arm/aom_convolve8_neon.h new file mode 100644 index 0000000000..0aebc6d12e --- /dev/null +++ b/aom_dsp/arm/aom_convolve8_neon.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ +#define AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "aom_dsp/arm/mem_neon.h" + +static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { + // Bilinear filter values are all positive. + const uint8x8_t f0 = vdup_n_u8((uint8_t)filter_x[3]); + const uint8x8_t f1 = vdup_n_u8((uint8_t)filter_x[4]); + + if (w == 4) { + do { + uint8x8_t s0 = + load_unaligned_u8(src + 0 * src_stride + 0, (int)src_stride); + uint8x8_t s1 = + load_unaligned_u8(src + 0 * src_stride + 1, (int)src_stride); + uint8x8_t s2 = + load_unaligned_u8(src + 2 * src_stride + 0, (int)src_stride); + uint8x8_t s3 = + load_unaligned_u8(src + 2 * src_stride + 1, (int)src_stride); + + uint16x8_t sum0 = vmull_u8(s0, f0); + sum0 = vmlal_u8(sum0, s1, f1); + uint16x8_t sum1 = vmull_u8(s2, f0); + sum1 = vmlal_u8(sum1, s3, f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else if (w == 8) { + do { + uint8x8_t s0 = vld1_u8(src + 0 * src_stride + 0); + uint8x8_t s1 = vld1_u8(src + 0 * src_stride + 1); + uint8x8_t s2 = vld1_u8(src + 1 * src_stride + 0); + uint8x8_t s3 = vld1_u8(src + 1 * src_stride + 1); + + uint16x8_t sum0 = vmull_u8(s0, f0); + sum0 = vmlal_u8(sum0, s1, f1); + uint16x8_t sum1 = vmull_u8(s2, f0); + sum1 = vmlal_u8(sum1, s3, f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + vst1_u8(dst + 0 * dst_stride, d0); + vst1_u8(dst + 1 * dst_stride, d1); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + do { + uint8x16_t s0 = vld1q_u8(s + 0); + uint8x16_t s1 = vld1q_u8(s + 1); + + uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0); + sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1); + uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0); + sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h > 0); + } +} + +#endif // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c index 576db8e4fc..f49d33ff32 100644 --- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -20,6 +20,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" @@ -269,7 +270,12 @@ void 
aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); - if (get_filter_taps_convolve8(filter_x) <= 4) { + int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, + h); + } else if (filter_taps == 4) { convolve8_horiz_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride, filter_x, w, h); } else { diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c index da0210ac3c..9727639990 100644 --- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -19,6 +19,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" @@ -243,7 +244,12 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); - if (get_filter_taps_convolve8(filter_x) <= 4) { + int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, + h); + } else if (filter_taps == 4) { convolve8_horiz_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride, filter_x, w, h); } else { -- GitLab From 32f8079ce4d9a5b9a7553100782ab124dfc0be64 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Tue, 16 Apr 2024 15:25:39 +0100 Subject: [PATCH 065/391] Add 4-tap specialisation to aom_convolve8_vert_neon Add a specialized Neon path for 4-tap filters in aom_convolve8_vert_neon, and use it for the neon_dotprod and neon_i8mm versions as well. This gives between 40% and 50% uplift compared to using the 8-tap path. Change-Id: I93e48a62d851af5ff0c6f8015cb91f687c970802 --- aom_dsp/arm/aom_convolve8_neon.c | 53 ++++++------- aom_dsp/arm/aom_convolve8_neon.h | 98 ++++++++++++++++++++++++ aom_dsp/arm/aom_convolve8_neon_dotprod.c | 40 ++++++---- aom_dsp/arm/aom_convolve8_neon_i8mm.c | 40 ++++++---- aom_dsp/arm/mem_neon.h | 10 +++ 5 files changed, 187 insertions(+), 54 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index 5665b5e125..43aef5428d 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c @@ -243,18 +243,6 @@ static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, return sum; } -static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x4_t filter) { - int16x8_t sum = vmulq_lane_s16(s0, filter, 0); - sum = vmlaq_lane_s16(sum, s1, filter, 1); - sum = vmlaq_lane_s16(sum, s2, filter, 2); - sum = vmlaq_lane_s16(sum, s3, filter, 3); - - // We halved the filter values so -1 from right shift. 
- return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -368,22 +356,13 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { +static INLINE void convolve8_vert_8tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_y, int w, + int h) { const int16x8_t filter = vld1q_s16(filter_y); - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)filter_x; - (void)x_step_q4; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; - if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); @@ -478,3 +457,25 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } + +void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)filter_x; + (void)x_step_q4; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + + if (get_filter_taps_convolve8(filter_y) <= 4) { + convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else { + convolve8_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, w, h); + } +} diff --git a/aom_dsp/arm/aom_convolve8_neon.h b/aom_dsp/arm/aom_convolve8_neon.h index 0aebc6d12e..83fbd0afc9 100644 --- a/aom_dsp/arm/aom_convolve8_neon.h +++ b/aom_dsp/arm/aom_convolve8_neon.h @@ -103,4 +103,102 @@ static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src, } } +static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filter) { + int16x8_t sum = vmulq_lane_s16(s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve8_vert_4tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_y, int w, + int h) { + // All filter values are even, halve to reduce intermediate precision + // requirements. 
+ const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1); + + if (w == 4) { + uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride); + uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride); + + int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + + src += 2 * src_stride; + + do { + uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride); + uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride); + uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride); + uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride); + + int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23)); + int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34)); + int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45)); + int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56)); + + uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter); + uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + s01 = s45; + s12 = s56; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + uint8x8_t t0, t1, t2; + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + + int height = h; + const uint8_t *s = src + 3 * src_stride; + uint8_t *d = dst; + + do { + uint8x8_t t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter); + uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter); + uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter); + uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + #endif // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c index f49d33ff32..4d47d86ef6 100644 --- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -370,24 +370,13 @@ static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { +static INLINE void convolve8_vert_8tap_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); int8x16x2_t samples_LUT; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)filter_x; - (void)x_step_q4; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; - if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6; 
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); @@ -536,3 +525,26 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } + +void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)filter_x; + (void)x_step_q4; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + + if (get_filter_taps_convolve8(filter_y) <= 4) { + convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else { + convolve8_vert_8tap_neon_dotprod(src, src_stride, dst, dst_stride, filter_y, + w, h); + } +} diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c index 9727639990..21a4551a3b 100644 --- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -340,24 +340,13 @@ static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { +static INLINE void convolve8_vert_8tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); uint8x16x2_t samples_LUT; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)filter_x; - (void)x_step_q4; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; - if (w == 4) { uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); @@ -478,3 +467,26 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } + +void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)filter_x; + (void)x_step_q4; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + + if (get_filter_taps_convolve8(filter_y) <= 4) { + convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else { + convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w, + h); + } +} diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index 32a462a186..ba187007c1 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h @@ -174,6 +174,16 @@ static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, *s3 = vld1_u8(s); } +static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); +} + static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3) { -- GitLab From 5eb58adeeeeb9ec735d091d31a6d51c8a3c52f2c Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> 
Date: Wed, 17 Apr 2024 11:06:58 +0100 Subject: [PATCH 066/391] Add 2-tap path for aom_convolve8_vert_neon Add a specialized Neon implementation for 2-tap filters and use it instead of the 4-tap implementation in all 3 Neon versions of aom_convolve8_vert. This provides around 50% uplift over using the 4-tap implementation. Change-Id: I411b23905d5f93ea1a8f2cab7d5ece8006b94032 --- aom_dsp/arm/aom_convolve8_neon.c | 7 +- aom_dsp/arm/aom_convolve8_neon.h | 81 ++++++++++++++++++++++++ aom_dsp/arm/aom_convolve8_neon_dotprod.c | 7 +- aom_dsp/arm/aom_convolve8_neon_i8mm.c | 7 +- 4 files changed, 99 insertions(+), 3 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index 43aef5428d..9a3ff8079c 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c @@ -472,7 +472,12 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; - if (get_filter_taps_convolve8(filter_y) <= 4) { + int filter_taps = get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else if (filter_taps == 4) { convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, w, h); } else { diff --git a/aom_dsp/arm/aom_convolve8_neon.h b/aom_dsp/arm/aom_convolve8_neon.h index 83fbd0afc9..b523c41bc3 100644 --- a/aom_dsp/arm/aom_convolve8_neon.h +++ b/aom_dsp/arm/aom_convolve8_neon.h @@ -201,4 +201,85 @@ static INLINE void convolve8_vert_4tap_neon(const uint8_t *src, } } +static INLINE void convolve8_vert_2tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_y, int w, + int h) { + // Bilinear filter values are all positive. 
+ uint8x8_t f0 = vdup_n_u8((uint8_t)filter_y[3]); + uint8x8_t f1 = vdup_n_u8((uint8_t)filter_y[4]); + + if (w == 4) { + do { + uint8x8_t s0 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride); + uint8x8_t s1 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride); + uint8x8_t s2 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride); + uint8x8_t s3 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride); + + uint16x8_t sum0 = vmull_u8(s0, f0); + sum0 = vmlal_u8(sum0, s1, f1); + uint16x8_t sum1 = vmull_u8(s2, f0); + sum1 = vmlal_u8(sum1, s3, f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else if (w == 8) { + do { + uint8x8_t s0, s1, s2; + load_u8_8x3(src, src_stride, &s0, &s1, &s2); + + uint16x8_t sum0 = vmull_u8(s0, f0); + sum0 = vmlal_u8(sum0, s1, f1); + uint16x8_t sum1 = vmull_u8(s1, f0); + sum1 = vmlal_u8(sum1, s2, f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + vst1_u8(dst + 0 * dst_stride, d0); + vst1_u8(dst + 1 * dst_stride, d1); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + do { + uint8x16_t s0 = vld1q_u8(s + 0 * src_stride); + uint8x16_t s1 = vld1q_u8(s + 1 * src_stride); + + uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0); + sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1); + uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0); + sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h > 0); + } +} + #endif // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c index 4d47d86ef6..7219570860 100644 --- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -540,7 +540,12 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; - if (get_filter_taps_convolve8(filter_y) <= 4) { + int filter_taps = get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else if (filter_taps == 4) { convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, w, h); } else { diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c index 21a4551a3b..34bfe01663 100644 --- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -482,7 +482,12 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; - if (get_filter_taps_convolve8(filter_y) <= 4) { + int filter_taps = get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else if (filter_taps == 4) { convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, w, h); } 
else { -- GitLab From 04e3b1ce551c6daf8421828d853b5dad2b052d5b Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Wed, 17 Apr 2024 17:34:59 +0100 Subject: [PATCH 067/391] Refactor HBD Neon convolve8 functions Refactor aom_highbd_convolve8_horiz_neon and aom_highbd_convolve8_vert_neon to use the same implementation as the other high bitdepth convolutions (convolve_sr_neon) and cleanup the helper functions. This change is performance neutral for convolve8_vert but gives a small 5-10% uplift for convolve8_horiz. Change-Id: I6b637edad68c019f2863410097934c9b4ae165e2 --- aom_dsp/arm/highbd_convolve8_neon.c | 304 +++++++++++----------------- 1 file changed, 116 insertions(+), 188 deletions(-) diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c index e25438c9b4..f84b17f170 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.c +++ b/aom_dsp/arm/highbd_convolve8_neon.c @@ -23,195 +23,133 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -static INLINE int32x4_t highbd_convolve8_4_s32( - const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { - const int16x4_t y_filter_lo = vget_low_s16(y_filter); - const int16x4_t y_filter_hi = vget_high_s16(y_filter); - - int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0); - sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); - sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); - sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); - sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); - sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); - sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); - sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); - - return sum; +static INLINE uint16x4_t +highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter, const uint16x4_t max) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int32x4_t sum = vmull_lane_s16(s0, filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, filter_hi, 3); + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + + return vmin_u16(res, max); } -static INLINE uint16x4_t highbd_convolve8_4_s32_s16( - const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { - int32x4_t sum = - highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); - - return vqrshrun_n_s32(sum, FILTER_BITS); -} - -static INLINE int32x4_t highbd_convolve8_horiz4_s32( - const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) { - const int16x8_t s2 = vextq_s16(s0, s1, 1); - const int16x8_t s3 = vextq_s16(s0, s1, 2); - const int16x8_t s4 = vextq_s16(s0, s1, 3); - const int16x4_t s0_lo = vget_low_s16(s0); - const int16x4_t s1_lo = vget_low_s16(s2); - const int16x4_t s2_lo = vget_low_s16(s3); - const int16x4_t s3_lo = vget_low_s16(s4); - const int16x4_t s4_lo = vget_high_s16(s0); - const int16x4_t s5_lo = vget_high_s16(s2); - const int16x4_t 
s6_lo = vget_high_s16(s3); - const int16x4_t s7_lo = vget_high_s16(s4); - - return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo, - s7_lo, x_filter_0_7); -} - -static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16( - const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) { - int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7); - - return vqrshrun_n_s32(sum, FILTER_BITS); -} - -static INLINE void highbd_convolve8_8_s32( - const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, - int32x4_t *sum0, int32x4_t *sum1) { - const int16x4_t y_filter_lo = vget_low_s16(y_filter); - const int16x4_t y_filter_hi = vget_high_s16(y_filter); - - *sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3); - - *sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3); -} - -static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0, - const int16x8_t s0_hi, - const int16x8_t x_filter_0_7, - int32x4_t *sum0, - int32x4_t *sum1) { - const int16x8_t s1 = vextq_s16(s0, s0_hi, 1); - const int16x8_t s2 = vextq_s16(s0, s0_hi, 2); - const int16x8_t s3 = vextq_s16(s0, s0_hi, 3); - const int16x8_t s4 = vextq_s16(s0, s0_hi, 4); - const int16x8_t s5 = vextq_s16(s0, s0_hi, 5); - const int16x8_t s6 = vextq_s16(s0, s0_hi, 6); - const int16x8_t s7 = vextq_s16(s0, s0_hi, 7); - - highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0, - sum1); -} - -static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16( - const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) { - int32x4_t sum0, sum1; - highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1); - - return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), - vqrshrun_n_s32(sum1, FILTER_BITS)); -} - -static INLINE uint16x8_t highbd_convolve8_8_s32_s16( - const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) { - int32x4_t sum0; - int32x4_t sum1; - highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0, - &sum1); - - return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), - vqrshrun_n_s32(sum1, FILTER_BITS)); +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, const uint16x8_t max) { + const 
int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_hi, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_hi, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + + return vminq_u16(res, max); } static void highbd_convolve_horiz_neon(const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, - const int16_t *x_filter_ptr, - int x_step_q4, int w, int h, int bd) { + const int16_t *x_filter_ptr, int w, + int h, int bd) { assert(w >= 4 && h >= 4); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { - int16x8_t s0, s1, s2, s3; - load_s16_8x2(s, src_stride, &s0, &s2); - load_s16_8x2(s + 8, src_stride, &s1, &s3); + int16x4_t s0[8], s1[8], s2[8], s3[8]; + load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], x_filter, max); + uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], x_filter, max); + uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], x_filter, max); + uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], x_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); - uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter); - uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter); - - uint16x8_t d01 = vcombine_u16(d0, d1); - d01 = vminq_u16(d01, max); - - vst1_u16(d + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(d + 1 * dst_stride, vget_high_u16(d01)); - - s += 2 * src_stride; - d += 2 * dst_stride; - h -= 2; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; } while (h > 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; - int x_q4 = 0; - - const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS]; - int16x8_t 
s0, s2, s4, s6; - load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6); - src_x += 8; do { - int16x8_t s1, s3, s5, s7; - load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7); - - uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter); - uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter); - uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter); - uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], x_filter, max); + uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], x_filter, max); + uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], x_filter, max); + uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], x_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); - s0 = s1; - s2 = s3; - s4 = s5; - s6 = s7; - src_x += 8; + s += 8; d += 8; width -= 8; - x_q4 += 8 * x_step_q4; } while (width > 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; @@ -236,8 +174,8 @@ void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; - highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, w, h, bd); + highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x, w, h, + bd); } } @@ -248,9 +186,9 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr, int bd) { assert(w >= 4 && h >= 4); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; @@ -263,24 +201,15 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr, load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = - highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); uint16x4_t d1 = - highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); uint16x4_t d2 = - highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); uint16x4_t d3 = - highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); - - uint16x8_t d01 = vcombine_u16(d0, d1); - uint16x8_t d23 = vcombine_u16(d2, d3); + highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max); - d01 = vminq_u16(d01, max); - d23 = vminq_u16(d23, max); - - vst1_u16(d + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(d + 1 * dst_stride, vget_high_u16(d01)); - vst1_u16(d + 2 * dst_stride, vget_low_u16(d23)); - vst1_u16(d + 3 * dst_stride, vget_high_u16(d23)); + store_u16_4x4(d, 
dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -289,11 +218,14 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr, s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { int height = h; const int16_t *s = (const int16_t *)src_ptr; @@ -307,19 +239,14 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr, int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); - uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, - s7, y_filter); - uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, - s8, y_filter); - uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8, - s9, y_filter); - uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9, - s10, y_filter); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + uint16x8_t d0 = + highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); + uint16x8_t d1 = + highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); + uint16x8_t d2 = + highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); + uint16x8_t d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -330,6 +257,7 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr, s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; d += 4 * dst_stride; height -= 4; -- GitLab From 68a56cc8679dddd79b1902be40c90dcb64197196 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 18 Apr 2024 15:06:37 +0100 Subject: [PATCH 068/391] Add 4-tap specialisation to aom_highbd_convolve8_horiz_neon Add specialised path for 4-tap filters in aom_highbd_convolve8_horiz_neon. This gives between 30% and 50% uplift compared to using the 8-tap path. 
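
Editorial note: per output pixel the specialised path issues four widening multiply-accumulates instead of eight, and the saturating narrow plus vmin implements the clip to the bit-depth range. As a scalar reference for what one lane of the new highbd_convolve4_4() computes (a sketch for illustration, not code from the patch):

#include <stdint.h>

#define FILTER_BITS 7

static uint16_t convolve4_pixel(const uint16_t *src, const int16_t *filter,
                                int bd) {
  int32_t sum = 0;
  for (int k = 0; k < 4; k++) sum += (int32_t)src[k] * filter[k];

  // Round and shift (vqrshrun_n_s32 saturates negatives to zero), then
  // clamp to [0, (1 << bd) - 1] as vmin_u16 does against the max vector.
  int32_t res = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  if (res < 0) res = 0;
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(res > max ? max : res);
}
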
Change-Id: I721498e71ba7f2dbeebfa68a78b08b0b5bca5a88 --- aom_dsp/arm/highbd_convolve8_neon.c | 122 ++++++++++++++++++++++++++-- 1 file changed, 115 insertions(+), 7 deletions(-) diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c index f84b17f170..6d8ce29610 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.c +++ b/aom_dsp/arm/highbd_convolve8_neon.c @@ -19,6 +19,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" @@ -77,11 +78,9 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, return vminq_u16(res, max); } -static void highbd_convolve_horiz_neon(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, - const int16_t *x_filter_ptr, int w, - int h, int bd) { +static void highbd_convolve_horiz_8tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { assert(w >= 4 && h >= 4); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); @@ -158,6 +157,109 @@ static void highbd_convolve_horiz_neon(const uint16_t *src_ptr, } } +static INLINE uint16x4_t highbd_convolve4_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filter, const uint16x4_t max) { + int32x4_t sum = vmull_lane_s16(s0, filter, 0); + sum = vmlal_lane_s16(sum, s1, filter, 1); + sum = vmlal_lane_s16(sum, s2, filter, 2); + sum = vmlal_lane_s16(sum, s3, filter, 3); + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filter, const uint16x8_t max) { + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static void highbd_convolve_horiz_4tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { + assert(w >= 4 && h >= 4); + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = + highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], x_filter, max); + uint16x4_t d1 = + highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], x_filter, max); + uint16x4_t d2 = + highbd_convolve4_4(s2[0], s2[1], 
s2[2], s2[3], x_filter, max);
+      uint16x4_t d3 =
+          highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], x_filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+    int height = h;
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        int16x8_t s0[4], s1[4], s2[4], s3[4];
+        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+        uint16x8_t d0 =
+            highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], x_filter, max);
+        uint16x8_t d1 =
+            highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], x_filter, max);
+        uint16x8_t d2 =
+            highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], x_filter, max);
+        uint16x8_t d3 =
+            highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], x_filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 0);
+  }
+}
+
 void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
@@ -174,8 +276,14 @@ void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
 
   src -= SUBPEL_TAPS / 2 - 1;
-  highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x, w, h,
-                             bd);
+
+  if (get_filter_taps_convolve8(filter_x) <= 4) {
+    highbd_convolve_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride,
+                                    filter_x, w, h, bd);
+  } else {
+    highbd_convolve_horiz_8tap_neon(src, src_stride, dst, dst_stride,
+                                    filter_x, w, h, bd);
+  }
 }
--
GitLab


From 276f8f8011388d70a41bfaac12a2725d660ad69a Mon Sep 17 00:00:00 2001
From: Salome Thirot <salome.thirot@arm.com>
Date: Thu, 18 Apr 2024 16:00:10 +0100
Subject: [PATCH 069/391] Add 2-tap path for aom_highbd_convolve8_horiz_neon

Add a specialized Neon implementation for 2-tap filters and use it
instead of the 4-tap implementation in both the Neon and SVE versions
of aom_highbd_convolve8_horiz.

This provides between 40% and 80% uplift over the 4-tap implementation.
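
Editorial note: the precision argument behind the divide-by-8 trick in this patch deserves one worked example. Bilinear taps are non-negative multiples of 8 that sum to 128, so after dividing by 8 the two taps sum to 16 and even a worst-case 12-bit sample keeps every intermediate inside 16 bits, which is what permits the non-widening vmulq_u16/vmlaq_u16 pair; the rounding shift shrinks by 3 to match. A scalar check (sketch; {112, 16} is the 2/16-pel entry of the bilinear bank):

#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7

int main(void) {
  const uint16_t f0 = 112, f1 = 16;     // bilinear taps, multiples of 8
  const uint16_t s0 = 4095, s1 = 4095;  // worst case at 12-bit depth

  // Full-precision reference.
  uint32_t sum = (uint32_t)s0 * f0 + (uint32_t)s1 * f1;
  uint32_t ref = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;

  // Taps divided by 8: the products and their sum fit in 16 bits
  // (4095 * 16 = 65520), so u16 arithmetic suffices throughout.
  uint16_t sum8 = (uint16_t)(s0 * (f0 / 8) + s1 * (f1 / 8));
  uint16_t res =
      (uint16_t)((sum8 + (1 << (FILTER_BITS - 4))) >> (FILTER_BITS - 3));

  assert(res == ref);
  return 0;
}
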
Change-Id: Ie24189770a066e1155d0239ed6e80a9e8a7938ce --- aom_dsp/arm/highbd_convolve8_neon.c | 8 ++- aom_dsp/arm/highbd_convolve8_neon.h | 98 +++++++++++++++++++++++++++++ aom_dsp/arm/highbd_convolve8_sve.c | 8 ++- 3 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 aom_dsp/arm/highbd_convolve8_neon.h diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c index 6d8ce29610..75d1cd4138 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.c +++ b/aom_dsp/arm/highbd_convolve8_neon.c @@ -20,6 +20,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/aom_filter.h" +#include "aom_dsp/arm/highbd_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" @@ -277,7 +278,12 @@ void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, src -= SUBPEL_TAPS / 2 - 1; - if (get_filter_taps_convolve8(filter_x) <= 4) { + const int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, + filter_x, w, h, bd); + } else if (filter_taps == 4) { highbd_convolve_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w, h, bd); } else { diff --git a/aom_dsp/arm/highbd_convolve8_neon.h b/aom_dsp/arm/highbd_convolve8_neon.h new file mode 100644 index 0000000000..05cff79a95 --- /dev/null +++ b/aom_dsp/arm/highbd_convolve8_neon.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ +#define AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "aom_dsp/arm/mem_neon.h" + +static INLINE void highbd_convolve8_horiz_2tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { + // Bilinear filter values are all positive and multiples of 8. Divide by 8 to + // reduce intermediate precision requirements and allow the use of non + // widening multiply. + const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8); + const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8); + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + uint16x8_t s0 = + load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 0, (int)src_stride); + uint16x8_t s1 = + load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 1, (int)src_stride); + uint16x8_t s2 = + load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 0, (int)src_stride); + uint16x8_t s3 = + load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 1, (int)src_stride); + + uint16x8_t sum01 = vmulq_u16(s0, f0); + sum01 = vmlaq_u16(sum01, s1, f1); + uint16x8_t sum23 = vmulq_u16(s2, f0); + sum23 = vmlaq_u16(sum23, s3, f1); + + // We divided filter taps by 8 so subtract 3 from right shift. 
+ sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); + sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); + + sum01 = vminq_u16(sum01, max); + sum23 = vminq_u16(sum23, max); + + store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01); + store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + do { + int width = w; + const uint16_t *s = src_ptr; + uint16_t *d = dst_ptr; + + do { + uint16x8_t s0 = vld1q_u16(s + 0 * src_stride + 0); + uint16x8_t s1 = vld1q_u16(s + 0 * src_stride + 1); + uint16x8_t s2 = vld1q_u16(s + 1 * src_stride + 0); + uint16x8_t s3 = vld1q_u16(s + 1 * src_stride + 1); + + uint16x8_t sum01 = vmulq_u16(s0, f0); + sum01 = vmlaq_u16(sum01, s1, f1); + uint16x8_t sum23 = vmulq_u16(s2, f0); + sum23 = vmlaq_u16(sum23, s3, f1); + + // We divided filter taps by 8 so subtract 3 from right shift. + sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); + sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); + + sum01 = vminq_u16(sum01, max); + sum23 = vminq_u16(sum23, max); + + vst1q_u16(d + 0 * dst_stride, sum01); + vst1q_u16(d + 1 * dst_stride, sum23); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 2 * src_stride; + dst_ptr += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} + +#endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c index e57c41a0b0..ef977181b7 100644 --- a/aom_dsp/arm/highbd_convolve8_sve.c +++ b/aom_dsp/arm/highbd_convolve8_sve.c @@ -18,6 +18,7 @@ #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/aom_filter.h" +#include "aom_dsp/arm/highbd_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter, @@ -252,7 +253,12 @@ void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride, src -= SUBPEL_TAPS / 2 - 1; - if (get_filter_taps_convolve8(filter_x) <= 4) { + const int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, + filter_x, width, height, bd); + } else if (filter_taps == 4) { highbd_convolve8_horiz_4tap_sve(src + 2, src_stride, dst, dst_stride, filter_x, width, height, bd); } else { -- GitLab From d6d79d5bcd77156a21b3c0593eefd8fb5dcb3a56 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 19 Apr 2024 16:35:25 +0100 Subject: [PATCH 070/391] Fix asserts in SVE convolution functions These asserts break when libaom is built in debug mode, fix them. 
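For context, a minimal sketch of why the stale asserts only surfaced in debug builds (illustrative, not libaom code): assert() expands to nothing when NDEBUG is defined, so release builds never compiled the dangling w/h references.

  #include <assert.h>
  // With NDEBUG defined (release), the assert body is discarded and even an
  // undeclared identifier inside it goes unnoticed; without NDEBUG (debug),
  // the expression is compiled and the build breaks.
  static int checked_area(int width, int height) {
    assert(width >= 4 && height >= 4);  // Must reference real parameters.
    return width * height;
  }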
Change-Id: I57276cebbdb0062159d94e67dc2ca145a89925ba --- aom_dsp/arm/highbd_convolve8_sve.c | 2 +- av1/common/arm/highbd_convolve_sve2.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c index ef977181b7..fac65c9b65 100644 --- a/aom_dsp/arm/highbd_convolve8_sve.c +++ b/aom_dsp/arm/highbd_convolve8_sve.c @@ -667,7 +667,7 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int width, int height, int bd) { assert(y_step_q4 == 16); - assert(w >= 4 && h >= 4); + assert(width >= 4 && height >= 4); (void)filter_x; (void)y_step_q4; (void)x_step_q4; diff --git a/av1/common/arm/highbd_convolve_sve2.c b/av1/common/arm/highbd_convolve_sve2.c index 82eb12fcea..e6e27719b4 100644 --- a/av1/common/arm/highbd_convolve_sve2.c +++ b/av1/common/arm/highbd_convolve_sve2.c @@ -566,7 +566,7 @@ void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, int bd) { - assert(w >= 4 && h >= 4); + assert(width >= 4 && height >= 4); const int16x8_t y_filter = vld1q_s16(filter_y); @@ -735,7 +735,7 @@ void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, int bd) { - assert(w >= 4 && h >= 4); + assert(width >= 4 && height >= 4); const int16x8_t y_filter = vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); @@ -1352,7 +1352,7 @@ void highbd_convolve_2d_sr_vert_8tap_sve2(const uint16_t *src, int height, const int16_t *filter_y, ConvolveParams *conv_params, int bd, const int y_offset) { - assert(w >= 4 && h >= 4); + assert(width >= 4 && height >= 4); const int64x2_t offset = vdupq_n_s64(y_offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); const int16x8_t y_filter = vld1q_s16(filter_y); @@ -1542,7 +1542,7 @@ void highbd_convolve_2d_sr_vert_4tap_sve2(const uint16_t *src, int height, const int16_t *filter_y, ConvolveParams *conv_params, int bd, const int y_offset) { - assert(w >= 4 && h >= 4); + assert(width >= 4 && height >= 4); const int64x2_t offset = vdupq_n_s64(y_offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); -- GitLab From 8e161f9cb1bfa330b34268b2bc2f2c000a09ec3b Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 19 Apr 2024 11:43:42 +0100 Subject: [PATCH 071/391] Add 4-tap specialisation to aom_highbd_convolve8_vert_neon Add specialised path for 4-tap filters in aom_highbd_convolve8_vert_neon. This gives between 30% and 50% uplift compared to using the 8-tap path. Delete the 4-tap SVE path and use this new implementation instead, as it is now faster.
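The dispatch works by classifying the 8-tap kernel by its zero outer taps; a hedged sketch of that idea follows (the real helper is get_filter_taps_convolve8() and its exact implementation may differ):

  // Illustrative only: an 8-tap kernel whose outer taps are zero can be
  // evaluated as a 4-tap kernel centred on taps [2..5] -- hence the
  // filter_y + 2 load and the src + 2 * src_stride re-centring in the
  // diff below.
  static int effective_taps(const int16_t *filter /* [8] */) {
    if (filter[0] | filter[1] | filter[6] | filter[7]) return 8;
    if (filter[2] | filter[5]) return 4;
    return 2;
  }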
Change-Id: Icf91eeb51fa227781872e907970974c2e9132df5 --- aom_dsp/arm/highbd_convolve8_neon.c | 50 +++-------- aom_dsp/arm/highbd_convolve8_neon.h | 104 +++++++++++++++++++++++ aom_dsp/arm/highbd_convolve8_sve.c | 125 +--------------------------- 3 files changed, 117 insertions(+), 162 deletions(-) diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c index 75d1cd4138..a433b95f78 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.c +++ b/aom_dsp/arm/highbd_convolve8_neon.c @@ -158,38 +158,6 @@ static void highbd_convolve_horiz_8tap_neon( } } -static INLINE uint16x4_t highbd_convolve4_4( - const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t filter, const uint16x4_t max) { - int32x4_t sum = vmull_lane_s16(s0, filter, 0); - sum = vmlal_lane_s16(sum, s1, filter, 1); - sum = vmlal_lane_s16(sum, s2, filter, 2); - sum = vmlal_lane_s16(sum, s3, filter, 3); - - uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); - - return vmin_u16(res, max); -} - -static INLINE uint16x8_t highbd_convolve4_8( - const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x4_t filter, const uint16x8_t max) { - int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter, 0); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); - - int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter, 0); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); - - uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), - vqrshrun_n_s32(sum1, FILTER_BITS)); - - return vminq_u16(res, max); -} - static void highbd_convolve_horiz_4tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { @@ -293,11 +261,9 @@ void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_vert_neon(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, - const int16_t *y_filter_ptr, int w, int h, - int bd) { +static void highbd_convolve_vert_8tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) { assert(w >= 4 && h >= 4); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); @@ -399,7 +365,13 @@ void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= (SUBPEL_TAPS / 2 - 1) * src_stride; - highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h, - bd); + + if (get_filter_taps_convolve8(filter_y) <= 4) { + highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, + dst_stride, filter_y, w, h, bd); + } else { + highbd_convolve_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, + w, h, bd); + } } } diff --git a/aom_dsp/arm/highbd_convolve8_neon.h b/aom_dsp/arm/highbd_convolve8_neon.h index 05cff79a95..0777378f05 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.h +++ b/aom_dsp/arm/highbd_convolve8_neon.h @@ -95,4 +95,108 @@ static INLINE void highbd_convolve8_horiz_2tap_neon( } } +static INLINE uint16x4_t highbd_convolve4_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const 
int16x4_t filter, const uint16x4_t max) { + int32x4_t sum = vmull_lane_s16(s0, filter, 0); + sum = vmlal_lane_s16(sum, s1, filter, 1); + sum = vmlal_lane_s16(sum, s2, filter, 2); + sum = vmlal_lane_s16(sum, s3, filter, 3); + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filter, const uint16x8_t max) { + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_vert_4tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) { + assert(w >= 4 && h >= 4); + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, max); + uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, max); + uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, max); + uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, y_filter, max); + uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, y_filter, max); + uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, y_filter, max); + uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } +} + #endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c index fac65c9b65..789b38aa6a 100644 --- a/aom_dsp/arm/highbd_convolve8_sve.c +++ b/aom_dsp/arm/highbd_convolve8_sve.c @@ -540,127 +540,6 @@ static INLINE void highbd_convolve8_vert_8tap_sve( } } -static INLINE uint16x4_t highbd_convolve4_4_v(int16x8_t s[2], int16x8_t filter, - uint16x4_t max) { - int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); - 
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); - - int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); - uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); - - return vmin_u16(res, max); -} - -static INLINE uint16x8_t highbd_convolve4_8_v(int16x8_t s[4], int16x8_t filter, - uint16x8_t max) { - int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); - int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); - int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0); - int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0); - - int32x4_t s0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); - int32x4_t s4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); - - uint16x8_t res = vcombine_u16(vqrshrun_n_s32(s0123, FILTER_BITS), - vqrshrun_n_s32(s4567, FILTER_BITS)); - - return vminq_u16(res, max); -} - -static INLINE void highbd_convolve8_vert_4tap_sve( - const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height, - int bd) { - const int16x8_t y_filter = - vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); - - uint8x16_t merge_block_tbl[3]; - merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl); - merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16); - merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32); - - if (width == 4) { - const uint16x4_t max = vdup_n_u16((1 << bd) - 1); - int16_t *s = (int16_t *)src; - - int16x4_t s0, s1, s2; - load_s16_4x3(s, src_stride, &s0, &s1, &s2); - s += 3 * src_stride; - - do { - int16x4_t s3, s4, s5, s6; - load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); - - // This operation combines a conventional transpose and the sample permute - // required before computing the dot product. - int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; - transpose_concat_4x4(s0, s1, s2, s3, s0123); - transpose_concat_4x4(s1, s2, s3, s4, s1234); - transpose_concat_4x4(s2, s3, s4, s5, s2345); - transpose_concat_4x4(s3, s4, s5, s6, s3456); - - uint16x4_t d0 = highbd_convolve4_4_v(s0123, y_filter, max); - uint16x4_t d1 = highbd_convolve4_4_v(s1234, y_filter, max); - uint16x4_t d2 = highbd_convolve4_4_v(s2345, y_filter, max); - uint16x4_t d3 = highbd_convolve4_4_v(s3456, y_filter, max); - - store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); - - // Shuffle everything up four rows. - s0 = s4; - s1 = s5; - s2 = s6; - - s += 4 * src_stride; - dst += 4 * dst_stride; - height -= 4; - } while (height != 0); - } else { - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - do { - int h = height; - int16_t *s = (int16_t *)src; - uint16_t *d = dst; - - int16x8_t s0, s1, s2; - load_s16_8x3(s, src_stride, &s0, &s1, &s2); - s += 3 * src_stride; - - do { - int16x8_t s3, s4, s5, s6; - load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); - - // This operation combines a conventional transpose and the sample - // permute required before computing the dot product. 
- int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; - transpose_concat_8x4(s0, s1, s2, s3, s0123); - transpose_concat_8x4(s1, s2, s3, s4, s1234); - transpose_concat_8x4(s2, s3, s4, s5, s2345); - transpose_concat_8x4(s3, s4, s5, s6, s3456); - - uint16x8_t d0 = highbd_convolve4_8_v(s0123, y_filter, max); - uint16x8_t d1 = highbd_convolve4_8_v(s1234, y_filter, max); - uint16x8_t d2 = highbd_convolve4_8_v(s2345, y_filter, max); - uint16x8_t d3 = highbd_convolve4_8_v(s3456, y_filter, max); - - store_u16_8x4(d, dst_stride, d0, d1, d2, d3); - - // Shuffle everything up four rows. - s0 = s4; - s1 = s5; - s2 = s6; - - s += 4 * src_stride; - d += 4 * dst_stride; - h -= 4; - } while (h != 0); - src += 8; - dst += 8; - width -= 8; - } while (width != 0); - } -} - void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -678,8 +557,8 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, src -= (SUBPEL_TAPS / 2 - 1) * src_stride; if (get_filter_taps_convolve8(filter_y) <= 4) { - highbd_convolve8_vert_4tap_sve(src + 2 * src_stride, src_stride, dst, - dst_stride, filter_y, width, height, bd); + highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, + dst_stride, filter_y, width, height, bd); } else { highbd_convolve8_vert_8tap_sve(src, src_stride, dst, dst_stride, filter_y, width, height, bd); -- GitLab From 6e20f87defacf025c68ee81c2c825ebc7e3936c0 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 19 Apr 2024 14:16:39 +0100 Subject: [PATCH 072/391] Add 2-tap path for aom_highbd_convolve8_vert_neon Add a specialized Neon implementation for 2-tap filters and use it instead of the 4-tap implementation in both Neon and SVE Neon versions of aom_highbd_convolve8_vert. This provides between 40% and 70% uplift over the 4-tap implementation. Change-Id: I0526e13599d8519f06c322e4317aeb943ebfd795 --- aom_dsp/arm/highbd_convolve8_neon.c | 7 ++- aom_dsp/arm/highbd_convolve8_neon.h | 77 +++++++++++++++++++++++++++++ aom_dsp/arm/highbd_convolve8_sve.c | 7 ++- aom_dsp/arm/mem_neon.h | 10 ++++ 4 files changed, 99 insertions(+), 2 deletions(-) diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c index a433b95f78..99ad0ba601 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.c +++ b/aom_dsp/arm/highbd_convolve8_neon.c @@ -366,7 +366,12 @@ void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, src -= (SUBPEL_TAPS / 2 - 1) * src_stride; - if (get_filter_taps_convolve8(filter_y) <= 4) { + const int filter_taps = get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, + dst_stride, filter_y, w, h, bd); + } else if (filter_taps == 4) { highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, w, h, bd); } else { diff --git a/aom_dsp/arm/highbd_convolve8_neon.h b/aom_dsp/arm/highbd_convolve8_neon.h index 0777378f05..b87b4bad84 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.h +++ b/aom_dsp/arm/highbd_convolve8_neon.h @@ -199,4 +199,81 @@ static INLINE void highbd_convolve8_vert_4tap_neon( } } +static INLINE void highbd_convolve8_vert_2tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { + // Bilinear filter values are all positive and multiples of 8. 
Divide by 8 to + // reduce intermediate precision requirements and allow the use of non + // widening multiply. + const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8); + const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8); + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + uint16x8_t s0 = + load_unaligned_u16_4x2(src_ptr + 0 * src_stride, (int)src_stride); + uint16x8_t s1 = + load_unaligned_u16_4x2(src_ptr + 1 * src_stride, (int)src_stride); + uint16x8_t s2 = + load_unaligned_u16_4x2(src_ptr + 2 * src_stride, (int)src_stride); + uint16x8_t s3 = + load_unaligned_u16_4x2(src_ptr + 3 * src_stride, (int)src_stride); + + uint16x8_t sum01 = vmulq_u16(s0, f0); + sum01 = vmlaq_u16(sum01, s1, f1); + uint16x8_t sum23 = vmulq_u16(s2, f0); + sum23 = vmlaq_u16(sum23, s3, f1); + + // We divided filter taps by 8 so subtract 3 from right shift. + sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); + sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); + + sum01 = vminq_u16(sum01, max); + sum23 = vminq_u16(sum23, max); + + store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01); + store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + do { + int width = w; + const uint16_t *s = src_ptr; + uint16_t *d = dst_ptr; + + do { + uint16x8_t s0, s1, s2; + load_u16_8x3(s, src_stride, &s0, &s1, &s2); + + uint16x8_t sum01 = vmulq_u16(s0, f0); + sum01 = vmlaq_u16(sum01, s1, f1); + uint16x8_t sum23 = vmulq_u16(s1, f0); + sum23 = vmlaq_u16(sum23, s2, f1); + + // We divided filter taps by 8 so subtract 3 from right shift. + sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); + sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); + + sum01 = vminq_u16(sum01, max); + sum23 = vminq_u16(sum23, max); + + vst1q_u16(d + 0 * dst_stride, sum01); + vst1q_u16(d + 1 * dst_stride, sum23); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 2 * src_stride; + dst_ptr += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} + #endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c index 789b38aa6a..f519395e81 100644 --- a/aom_dsp/arm/highbd_convolve8_sve.c +++ b/aom_dsp/arm/highbd_convolve8_sve.c @@ -556,7 +556,12 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, src -= (SUBPEL_TAPS / 2 - 1) * src_stride; - if (get_filter_taps_convolve8(filter_y) <= 4) { + const int filter_taps = get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, + dst_stride, filter_y, width, height, bd); + } else if (filter_taps == 4) { highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, width, height, bd); } else { diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index ba187007c1..b1f6ebeb14 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h @@ -231,6 +231,16 @@ static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p, *s1 = vld1q_u16(s); } +static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); +} + static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1, uint16x8_t *const s2, 
uint16x8_t *const s3) { -- GitLab From 1ad5a42843c655af40156a09ae09845e33a60151 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Wed, 10 Apr 2024 14:12:49 +0100 Subject: [PATCH 073/391] Remove crc32/ and neon/ directories in av1/encoder/arm The neon/ directory contains code for neon, neon_dotprod, neon_i8mm and sve extensions, so it doesn't make sense anymore to have it. Move all files under arm/ directly. Change-Id: I56b6603d6f86a0053822553055a6efb06764f0c4 --- av1/av1.cmake | 54 +++++++++---------- av1/encoder/arm/{neon => }/av1_error_neon.c | 0 av1/encoder/arm/{neon => }/av1_error_sve.c | 0 .../arm/{neon => }/av1_fwd_txfm2d_neon.c | 0 .../arm/{neon => }/av1_highbd_quantize_neon.c | 0 av1/encoder/arm/{neon => }/av1_k_means_neon.c | 0 .../{neon => }/av1_temporal_denoiser_neon.c | 0 av1/encoder/arm/{neon => }/cnn_neon.c | 0 av1/encoder/arm/{neon => }/encodetxb_neon.c | 0 av1/encoder/arm/{crc32 => }/hash_arm_crc32.c | 0 .../arm/{neon => }/highbd_fwd_txfm_neon.c | 0 .../arm/{neon => }/highbd_pickrst_neon.c | 2 +- .../arm/{neon => }/highbd_pickrst_sve.c | 2 +- .../arm/{neon => }/highbd_rdopt_neon.c | 0 .../{neon => }/highbd_temporal_filter_neon.c | 0 .../arm/{neon => }/hybrid_fwd_txfm_neon.c | 0 av1/encoder/arm/{neon => }/ml_neon.c | 0 av1/encoder/arm/{neon => }/pickrst_neon.c | 2 +- av1/encoder/arm/{neon => }/pickrst_neon.h | 6 +-- av1/encoder/arm/{neon => }/pickrst_sve.c | 2 +- av1/encoder/arm/{neon => }/pickrst_sve.h | 6 +-- av1/encoder/arm/{neon => }/quantize_neon.c | 0 av1/encoder/arm/{neon => }/rdopt_neon.c | 0 .../arm/{neon => }/reconinter_enc_neon.c | 0 av1/encoder/arm/{neon => }/shift_neon.h | 6 +-- .../arm/{neon => }/temporal_filter_neon.c | 0 .../{neon => }/temporal_filter_neon_dotprod.c | 0 av1/encoder/arm/{neon => }/txfm_neon.h | 6 +-- av1/encoder/arm/{neon => }/wedge_utils_neon.c | 0 av1/encoder/arm/{neon => }/wedge_utils_sve.c | 0 30 files changed, 43 insertions(+), 43 deletions(-) rename av1/encoder/arm/{neon => }/av1_error_neon.c (100%) rename av1/encoder/arm/{neon => }/av1_error_sve.c (100%) rename av1/encoder/arm/{neon => }/av1_fwd_txfm2d_neon.c (100%) rename av1/encoder/arm/{neon => }/av1_highbd_quantize_neon.c (100%) rename av1/encoder/arm/{neon => }/av1_k_means_neon.c (100%) rename av1/encoder/arm/{neon => }/av1_temporal_denoiser_neon.c (100%) rename av1/encoder/arm/{neon => }/cnn_neon.c (100%) rename av1/encoder/arm/{neon => }/encodetxb_neon.c (100%) rename av1/encoder/arm/{crc32 => }/hash_arm_crc32.c (100%) rename av1/encoder/arm/{neon => }/highbd_fwd_txfm_neon.c (100%) rename av1/encoder/arm/{neon => }/highbd_pickrst_neon.c (99%) rename av1/encoder/arm/{neon => }/highbd_pickrst_sve.c (99%) rename av1/encoder/arm/{neon => }/highbd_rdopt_neon.c (100%) rename av1/encoder/arm/{neon => }/highbd_temporal_filter_neon.c (100%) rename av1/encoder/arm/{neon => }/hybrid_fwd_txfm_neon.c (100%) rename av1/encoder/arm/{neon => }/ml_neon.c (100%) rename av1/encoder/arm/{neon => }/pickrst_neon.c (99%) rename av1/encoder/arm/{neon => }/pickrst_neon.h (97%) rename av1/encoder/arm/{neon => }/pickrst_sve.c (99%) rename av1/encoder/arm/{neon => }/pickrst_sve.h (97%) rename av1/encoder/arm/{neon => }/quantize_neon.c (100%) rename av1/encoder/arm/{neon => }/rdopt_neon.c (100%) rename av1/encoder/arm/{neon => }/reconinter_enc_neon.c (100%) rename av1/encoder/arm/{neon => }/shift_neon.h (93%) rename av1/encoder/arm/{neon => }/temporal_filter_neon.c (100%) rename av1/encoder/arm/{neon => }/temporal_filter_neon_dotprod.c (100%) rename av1/encoder/arm/{neon => 
}/txfm_neon.h (86%) rename av1/encoder/arm/{neon => }/wedge_utils_neon.c (100%) rename av1/encoder/arm/{neon => }/wedge_utils_sve.c (100%) diff --git a/av1/av1.cmake b/av1/av1.cmake index f156a1926f..c057856733 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -354,33 +354,33 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/av1/encoder/x86/ml_avx2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON - "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.h" - "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c") + "${AOM_ROOT}/av1/encoder/arm/av1_error_neon.c" + "${AOM_ROOT}/av1/encoder/arm/av1_fwd_txfm2d_neon.c" + "${AOM_ROOT}/av1/encoder/arm/av1_highbd_quantize_neon.c" + "${AOM_ROOT}/av1/encoder/arm/av1_k_means_neon.c" + "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c" + "${AOM_ROOT}/av1/encoder/arm/encodetxb_neon.c" + "${AOM_ROOT}/av1/encoder/arm/highbd_fwd_txfm_neon.c" + "${AOM_ROOT}/av1/encoder/arm/hybrid_fwd_txfm_neon.c" + "${AOM_ROOT}/av1/encoder/arm/ml_neon.c" + "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.c" + "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.h" + "${AOM_ROOT}/av1/encoder/arm/quantize_neon.c" + "${AOM_ROOT}/av1/encoder/arm/rdopt_neon.c" + "${AOM_ROOT}/av1/encoder/arm/reconinter_enc_neon.c" + "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c" + "${AOM_ROOT}/av1/encoder/arm/wedge_utils_neon.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD - "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c") + "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon_dotprod.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SVE - "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c" - "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_sve.c" - "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_sve.c") + "${AOM_ROOT}/av1/encoder/arm/av1_error_sve.c" + "${AOM_ROOT}/av1/encoder/arm/pickrst_sve.c" + "${AOM_ROOT}/av1/encoder/arm/wedge_utils_sve.c") list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 - "${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c") + "${AOM_ROOT}/av1/encoder/arm/hash_arm_crc32.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" @@ -452,7 +452,7 @@ if(CONFIG_AV1_TEMPORAL_DENOISING) "${AOM_ROOT}/av1/encoder/x86/av1_temporal_denoiser_sse2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON - "${AOM_ROOT}/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c") + "${AOM_ROOT}/av1/encoder/arm/av1_temporal_denoiser_neon.c") endif() if(CONFIG_AV1_HIGHBITDEPTH) @@ -499,12 +499,12 @@ if(CONFIG_AV1_HIGHBITDEPTH) "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON - "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c") + 
"${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_neon.c" + "${AOM_ROOT}/av1/encoder/arm/highbd_rdopt_neon.c" + "${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SVE - "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_sve.c") + "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_sve.c") endif() if(CONFIG_ACCOUNTING) @@ -530,7 +530,7 @@ if(CONFIG_REALTIME_ONLY) "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c") list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON - "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c") + "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c") list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/cnn.c" diff --git a/av1/encoder/arm/neon/av1_error_neon.c b/av1/encoder/arm/av1_error_neon.c similarity index 100% rename from av1/encoder/arm/neon/av1_error_neon.c rename to av1/encoder/arm/av1_error_neon.c diff --git a/av1/encoder/arm/neon/av1_error_sve.c b/av1/encoder/arm/av1_error_sve.c similarity index 100% rename from av1/encoder/arm/neon/av1_error_sve.c rename to av1/encoder/arm/av1_error_sve.c diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/av1_fwd_txfm2d_neon.c similarity index 100% rename from av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c rename to av1/encoder/arm/av1_fwd_txfm2d_neon.c diff --git a/av1/encoder/arm/neon/av1_highbd_quantize_neon.c b/av1/encoder/arm/av1_highbd_quantize_neon.c similarity index 100% rename from av1/encoder/arm/neon/av1_highbd_quantize_neon.c rename to av1/encoder/arm/av1_highbd_quantize_neon.c diff --git a/av1/encoder/arm/neon/av1_k_means_neon.c b/av1/encoder/arm/av1_k_means_neon.c similarity index 100% rename from av1/encoder/arm/neon/av1_k_means_neon.c rename to av1/encoder/arm/av1_k_means_neon.c diff --git a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/av1/encoder/arm/av1_temporal_denoiser_neon.c similarity index 100% rename from av1/encoder/arm/neon/av1_temporal_denoiser_neon.c rename to av1/encoder/arm/av1_temporal_denoiser_neon.c diff --git a/av1/encoder/arm/neon/cnn_neon.c b/av1/encoder/arm/cnn_neon.c similarity index 100% rename from av1/encoder/arm/neon/cnn_neon.c rename to av1/encoder/arm/cnn_neon.c diff --git a/av1/encoder/arm/neon/encodetxb_neon.c b/av1/encoder/arm/encodetxb_neon.c similarity index 100% rename from av1/encoder/arm/neon/encodetxb_neon.c rename to av1/encoder/arm/encodetxb_neon.c diff --git a/av1/encoder/arm/crc32/hash_arm_crc32.c b/av1/encoder/arm/hash_arm_crc32.c similarity index 100% rename from av1/encoder/arm/crc32/hash_arm_crc32.c rename to av1/encoder/arm/hash_arm_crc32.c diff --git a/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/av1/encoder/arm/highbd_fwd_txfm_neon.c similarity index 100% rename from av1/encoder/arm/neon/highbd_fwd_txfm_neon.c rename to av1/encoder/arm/highbd_fwd_txfm_neon.c diff --git a/av1/encoder/arm/neon/highbd_pickrst_neon.c b/av1/encoder/arm/highbd_pickrst_neon.c similarity index 99% rename from av1/encoder/arm/neon/highbd_pickrst_neon.c rename to av1/encoder/arm/highbd_pickrst_neon.c index 8b0d3bcc7e..d067a7616a 100644 --- a/av1/encoder/arm/neon/highbd_pickrst_neon.c +++ b/av1/encoder/arm/highbd_pickrst_neon.c @@ -15,7 +15,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" -#include "av1/encoder/arm/neon/pickrst_neon.h" +#include "av1/encoder/arm/pickrst_neon.h" #include "av1/encoder/pickrst.h" static INLINE void highbd_calc_proj_params_r0_r1_neon( diff --git a/av1/encoder/arm/neon/highbd_pickrst_sve.c b/av1/encoder/arm/highbd_pickrst_sve.c similarity index 99% rename from av1/encoder/arm/neon/highbd_pickrst_sve.c 
rename to av1/encoder/arm/highbd_pickrst_sve.c index 3ffd6749dc..4f804c9052 100644 --- a/av1/encoder/arm/neon/highbd_pickrst_sve.c +++ b/av1/encoder/arm/highbd_pickrst_sve.c @@ -22,7 +22,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/restoration.h" #include "av1/encoder/pickrst.h" -#include "av1/encoder/arm/neon/pickrst_sve.h" +#include "av1/encoder/arm/pickrst_sve.h" static INLINE uint16_t find_average_sve(const uint16_t *src, int src_stride, int width, int height) { diff --git a/av1/encoder/arm/neon/highbd_rdopt_neon.c b/av1/encoder/arm/highbd_rdopt_neon.c similarity index 100% rename from av1/encoder/arm/neon/highbd_rdopt_neon.c rename to av1/encoder/arm/highbd_rdopt_neon.c diff --git a/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/av1/encoder/arm/highbd_temporal_filter_neon.c similarity index 100% rename from av1/encoder/arm/neon/highbd_temporal_filter_neon.c rename to av1/encoder/arm/highbd_temporal_filter_neon.c diff --git a/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/av1/encoder/arm/hybrid_fwd_txfm_neon.c similarity index 100% rename from av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c rename to av1/encoder/arm/hybrid_fwd_txfm_neon.c diff --git a/av1/encoder/arm/neon/ml_neon.c b/av1/encoder/arm/ml_neon.c similarity index 100% rename from av1/encoder/arm/neon/ml_neon.c rename to av1/encoder/arm/ml_neon.c diff --git a/av1/encoder/arm/neon/pickrst_neon.c b/av1/encoder/arm/pickrst_neon.c similarity index 99% rename from av1/encoder/arm/neon/pickrst_neon.c rename to av1/encoder/arm/pickrst_neon.c index 2e4761f9a4..85b980c2f0 100644 --- a/av1/encoder/arm/neon/pickrst_neon.c +++ b/av1/encoder/arm/pickrst_neon.c @@ -16,7 +16,7 @@ #include "aom_dsp/arm/sum_neon.h" #include "av1/common/restoration.h" -#include "av1/encoder/arm/neon/pickrst_neon.h" +#include "av1/encoder/arm/pickrst_neon.h" #include "av1/encoder/pickrst.h" int64_t av1_lowbd_pixel_proj_error_neon( diff --git a/av1/encoder/arm/neon/pickrst_neon.h b/av1/encoder/arm/pickrst_neon.h similarity index 97% rename from av1/encoder/arm/neon/pickrst_neon.h rename to av1/encoder/arm/pickrst_neon.h index 7b72dca34d..f9683840e1 100644 --- a/av1/encoder/arm/neon/pickrst_neon.h +++ b/av1/encoder/arm/pickrst_neon.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ -#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ +#ifndef AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_ +#define AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_ #include <arm_neon.h> @@ -185,4 +185,4 @@ static INLINE void accumulate_and_clear(int64_t *dst, int32_t *src, } while (length > 0); } -#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ +#endif // AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_ diff --git a/av1/encoder/arm/neon/pickrst_sve.c b/av1/encoder/arm/pickrst_sve.c similarity index 99% rename from av1/encoder/arm/neon/pickrst_sve.c rename to av1/encoder/arm/pickrst_sve.c index 88aa135e25..e865dadd41 100644 --- a/av1/encoder/arm/neon/pickrst_sve.c +++ b/av1/encoder/arm/pickrst_sve.c @@ -22,7 +22,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/restoration.h" #include "av1/encoder/pickrst.h" -#include "av1/encoder/arm/neon/pickrst_sve.h" +#include "av1/encoder/arm/pickrst_sve.h" static INLINE uint8_t find_average_sve(const uint8_t *src, int src_stride, int width, int height) { diff --git a/av1/encoder/arm/neon/pickrst_sve.h b/av1/encoder/arm/pickrst_sve.h similarity index 97% rename from av1/encoder/arm/neon/pickrst_sve.h rename to av1/encoder/arm/pickrst_sve.h index ffa737611e..97f08fc61e 100644 --- a/av1/encoder/arm/neon/pickrst_sve.h +++ b/av1/encoder/arm/pickrst_sve.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_SVE_H_ -#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_SVE_H_ +#ifndef AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ +#define AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ #include <arm_neon.h> #include <arm_sve.h> @@ -148,4 +148,4 @@ static INLINE void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, } } -#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_SVE_H_ +#endif // AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ diff --git a/av1/encoder/arm/neon/quantize_neon.c b/av1/encoder/arm/quantize_neon.c similarity index 100% rename from av1/encoder/arm/neon/quantize_neon.c rename to av1/encoder/arm/quantize_neon.c diff --git a/av1/encoder/arm/neon/rdopt_neon.c b/av1/encoder/arm/rdopt_neon.c similarity index 100% rename from av1/encoder/arm/neon/rdopt_neon.c rename to av1/encoder/arm/rdopt_neon.c diff --git a/av1/encoder/arm/neon/reconinter_enc_neon.c b/av1/encoder/arm/reconinter_enc_neon.c similarity index 100% rename from av1/encoder/arm/neon/reconinter_enc_neon.c rename to av1/encoder/arm/reconinter_enc_neon.c diff --git a/av1/encoder/arm/neon/shift_neon.h b/av1/encoder/arm/shift_neon.h similarity index 93% rename from av1/encoder/arm/neon/shift_neon.h rename to av1/encoder/arm/shift_neon.h index d73aef2f25..ad9fd9c671 100644 --- a/av1/encoder/arm/neon/shift_neon.h +++ b/av1/encoder/arm/shift_neon.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_ -#define AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_ +#ifndef AOM_AV1_ENCODER_ARM_SHIFT_NEON_H_ +#define AOM_AV1_ENCODER_ARM_SHIFT_NEON_H_ #include <arm_neon.h> @@ -46,4 +46,4 @@ SHIFT_LOOP_HELPER(shift_right_1_round_s32_x4, int32x4_t, vrhaddq_s32, #undef SHIFT_LOOP_HELPER -#endif // AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_ +#endif // AOM_AV1_ENCODER_ARM_SHIFT_NEON_H_ diff --git a/av1/encoder/arm/neon/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c similarity index 100% rename from av1/encoder/arm/neon/temporal_filter_neon.c rename to av1/encoder/arm/temporal_filter_neon.c diff --git a/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c similarity index 100% rename from av1/encoder/arm/neon/temporal_filter_neon_dotprod.c rename to av1/encoder/arm/temporal_filter_neon_dotprod.c diff --git a/av1/encoder/arm/neon/txfm_neon.h b/av1/encoder/arm/txfm_neon.h similarity index 86% rename from av1/encoder/arm/neon/txfm_neon.h rename to av1/encoder/arm/txfm_neon.h index 635364f46a..8b07dfb613 100644 --- a/av1/encoder/arm/neon/txfm_neon.h +++ b/av1/encoder/arm/txfm_neon.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ -#define AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ +#ifndef AOM_AV1_ENCODER_ARM_TXFM_NEON_H_ +#define AOM_AV1_ENCODER_ARM_TXFM_NEON_H_ #include "aom/aom_integer.h" // For AOM_INLINE. @@ -23,4 +23,4 @@ static AOM_INLINE void ud_adjust_input_and_stride(int ud_flip, } } -#endif // AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ +#endif // AOM_AV1_ENCODER_ARM_TXFM_NEON_H_ diff --git a/av1/encoder/arm/neon/wedge_utils_neon.c b/av1/encoder/arm/wedge_utils_neon.c similarity index 100% rename from av1/encoder/arm/neon/wedge_utils_neon.c rename to av1/encoder/arm/wedge_utils_neon.c diff --git a/av1/encoder/arm/neon/wedge_utils_sve.c b/av1/encoder/arm/wedge_utils_sve.c similarity index 100% rename from av1/encoder/arm/neon/wedge_utils_sve.c rename to av1/encoder/arm/wedge_utils_sve.c -- GitLab From 4058a1559205bf3671d888fb8ad7884bb30c811c Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 19 Apr 2024 19:38:34 -0700 Subject: [PATCH 074/391] add av1/common/debugmodes.h This collects the prototypes for the functions defined in debugmodes.c and fixes some missing prototypes warnings. + Comment out some unused/debug-only code in debugmodes.c for the same reason. Bug: aomedia:3416 Change-Id: Ibaaba25338123b7217f6744daa4fbd91e70f3687 --- av1/common/debugmodes.c | 5 +++++ av1/common/debugmodes.h | 24 ++++++++++++++++++++++++ av1/encoder/bitstream.c | 4 +--- av1/encoder/encoder.c | 4 +--- 4 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 av1/common/debugmodes.h diff --git a/av1/common/debugmodes.c b/av1/common/debugmodes.c index 7e6160f9a5..e67cf04a3f 100644 --- a/av1/common/debugmodes.c +++ b/av1/common/debugmodes.c @@ -9,17 +9,21 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#include "av1/common/debugmodes.h" + #include <stdio.h> #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" +#if 0 static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { fprintf(f, "%s", str); fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number, cm->show_frame, cm->quant_params.base_qindex); } + /* This function dereferences a pointer to the mbmi structure * and uses the passed in member offset to print out the value of an integer * for each mbmi member value in the mi structure. @@ -87,6 +91,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { fclose(mvs); } +#endif // 0 void av1_print_uncompressed_frame_header(const uint8_t *data, int size, const char *filename) { diff --git a/av1/common/debugmodes.h b/av1/common/debugmodes.h new file mode 100644 index 0000000000..8f3a91cf46 --- /dev/null +++ b/av1/common/debugmodes.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_DEBUGMODES_H_ +#define AOM_AV1_COMMON_DEBUGMODES_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file); +void av1_print_uncompressed_frame_header(const uint8_t *data, int size, + const char *filename); +void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename); + +#endif // AOM_AV1_COMMON_DEBUGMODES_H_ diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index 9981871147..163b62c77c 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -26,6 +26,7 @@ #include "av1/common/cdef.h" #include "av1/common/cfl.h" +#include "av1/common/debugmodes.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" @@ -3521,9 +3522,6 @@ static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile, return size; } -extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size, - const char *filename); - typedef struct { uint32_t tg_hdr_size; uint32_t frame_header_size; diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 1ddbfda08b..85d980bae9 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -41,6 +41,7 @@ #endif // CONFIG_BITSTREAM_DEBUG #include "av1/common/alloccommon.h" +#include "av1/common/debugmodes.h" #include "av1/common/filter.h" #include "av1/common/idct.h" #include "av1/common/reconinter.h" @@ -3553,9 +3554,6 @@ static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) { } #endif -extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, - const char *filename); - /*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack * the bitstream * -- GitLab From 113ff4025b758ded9ebe2adda558d71b38eb786f Mon Sep 17 00:00:00 2001 From: Tristan Matthews <tmatth@videolan.org> Date: Wed, 13 Sep 2023 13:17:23 -0400 Subject: [PATCH 075/391] cmake: win: fix asm flag appending These were evaluating to e.g.: 
aarch64-w64-mingw32-gcc;-c;-mimplicit-it=always, which broke the detection of the actual asm executable's presence. Change-Id: I6ce4411a3ea81e6928c02399d84ecbc3f58db6df --- build/cmake/aom_configure.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake index 304d90d1e1..ac3e1325b3 100644 --- a/build/cmake/aom_configure.cmake +++ b/build/cmake/aom_configure.cmake @@ -190,7 +190,7 @@ elseif(AOM_TARGET_CPU MATCHES "arm") set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT}) elseif(AOM_TARGET_SYSTEM STREQUAL "Windows") if(NOT CMAKE_ASM_COMPILER) - set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER} -c -mimplicit-it=always) + set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER} "-c -mimplicit-it=always") endif() else() if(NOT CMAKE_ASM_COMPILER) -- GitLab From ac251466f576a695a10d468043eb663fd9b44d6b Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Wed, 24 Apr 2024 12:09:36 -0700 Subject: [PATCH 076/391] Check for division by zero in update_temporal_layer_framerate If the lc->framerate_factor values are not set as expected for temporal layers, a divide by zero can happen when computing lc->avg_frame_size. Check for this and reset the value. Change-Id: I65b383624dca4cb93ebf77f83030bcbfda5d2da1 --- av1/encoder/svc_layercontext.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c index 33da3afbd3..dbab1d54c9 100644 --- a/av1/encoder/svc_layercontext.c +++ b/av1/encoder/svc_layercontext.c @@ -197,9 +197,13 @@ void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) { const double prev_layer_framerate = cpi->framerate / lcprev->framerate_factor; const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate; - lc->avg_frame_size = - (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / - (lc->framerate - prev_layer_framerate)); + if (lc->framerate > prev_layer_framerate) { + lc->avg_frame_size = + (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } else { + lc->avg_frame_size = (int)round(lc->target_bandwidth / lc->framerate); + } } } -- GitLab From 940a43649989921e6af6e080fdf9f250a9fc9673 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Tue, 23 Apr 2024 14:43:56 +0100 Subject: [PATCH 077/391] Add 4-tap specialization to av1_convolve_x_sr_neon Add specialised path for 4-tap filters in av1_convolve_x_sr_neon. This gives between 30% and 50% uplift compared to using the 8-tap path.
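A worked example of the rounding shim used by this path (assuming FILTER_BITS == 7 and ROUND0_BITS == 3, their values for this codepath):

  // Two-stage rounding, for an accumulator sum = 1000:
  //   tmp = (1000 + (1 << 2)) >> 3;               // 125
  //   res = (125  + (1 << 3)) >> 4;               // 8
  // Single-stage with the shim folded in:
  //   res = (1000 + (1 << 2) + (1 << 6)) >> 7;    // 8
  // Halving the filter taps halves the accumulator, the shim and the final
  // rounding constant, hence the extra -1 everywhere:
  //   res = (500 + (1 << 1) + (1 << 5)) >> 6;     // 8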
Change-Id: Icebf4df5a184bb6290b32c21c15ea91809c3763c --- av1/common/arm/convolve_neon.c | 337 +++++++++++++++++++-------------- 1 file changed, 198 insertions(+), 139 deletions(-) diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c index 10442f9bf9..c86215e64d 100644 --- a/av1/common/arm/convolve_neon.c +++ b/av1/common/arm/convolve_neon.c @@ -188,18 +188,95 @@ static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr, #endif // AOM_ARCH_AARCH64 } -static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, +static INLINE uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, - const int16x4_t horiz_const) { - int16x4_t sum = horiz_const; - sum = vmla_lane_s16(sum, s0, filter, 0); - sum = vmla_lane_s16(sum, s1, filter, 1); - sum = vmla_lane_s16(sum, s2, filter, 2); - sum = vmla_lane_s16(sum, s3, filter, 3); + int16x8_t horiz_const) { + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} - // We halved the convolution filter values so - 1 from the right shift. - return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1); +static INLINE void convolve_x_sr_4tap_neon(const uint8_t *src_ptr, + int src_stride, uint8_t *dst_ptr, + const int dst_stride, int w, int h, + const int16_t *x_filter_ptr) { + // All filter values are even, halve to reduce intermediate precision + // requirements. + const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single + // rounding right shift by FILTER_BITS - instead of a first rounding right + // shift by ROUND0_BITS, followed by second rounding right shift by + // FILTER_BITS - ROUND0_BITS. + // The outermost -1 is needed because we will halve the filter values. 
+ const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1)); + + if (w == 4) { + do { + uint8x8_t t01[4]; + t01[0] = load_unaligned_u8(src_ptr + 0, src_stride); + t01[1] = load_unaligned_u8(src_ptr + 1, src_stride); + t01[2] = load_unaligned_u8(src_ptr + 2, src_stride); + t01[3] = load_unaligned_u8(src_ptr + 3, src_stride); + + int16x8_t s01[4]; + s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0])); + s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1])); + s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2])); + s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3])); + + uint8x8_t d01 = + convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + do { + int width = w; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + + do { + uint8x8_t t0[4], t1[4]; + load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]); + load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]); + + int16x8_t s0[4], s1[4]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); + s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); + s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); + s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); + + s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0])); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1])); + s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2])); + s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3])); + + uint8x8_t d0 = + convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const); + uint8x8_t d1 = + convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const); + + store_u8_8x2(d, dst_stride, d0, d1); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 2 * src_stride; + dst_ptr += 2 * dst_stride; + h -= 2; + } while (h != 0); + } } static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, @@ -242,12 +319,20 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); - if (filter_params_x->taps > 8) { + int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_taps > 8) { convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } + if (filter_taps <= 4) { + convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single // rounding right shift by FILTER_BITS - instead of a first rounding right // shift by ROUND0_BITS, followed by second rounding right shift by @@ -255,149 +340,123 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, // The outermost -1 is needed because we will halve the filter values. const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1)); - if (w <= 4) { - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); - - src += 2; - - do { - uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 - int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - - int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 - int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 - int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 - - uint8x8_t d0 = - convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const)); - - store_u8_4x1(dst, d0); - - src += src_stride; - dst += dst_stride; - } while (--h != 0); - } else { - // Filter values are even so halve to reduce precision requirements. - const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + // Filter values are even so halve to reduce precision requirements. + const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); #if AOM_ARCH_AARCH64 - while (h >= 8) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; - load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - - transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + while (h >= 8) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - int width = w; - const uint8_t *s = src + 7; - uint8_t *d = dst; - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(d + 4 * dst_stride); - __builtin_prefetch(d + 5 * dst_stride); - __builtin_prefetch(d + 6 * dst_stride); - __builtin_prefetch(d + 7 * dst_stride); - - do { - uint8x8_t t8, t9, t10, t11, t12, t13, t14; - load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + int width = w; + const uint8_t *s = src + 7; + uint8_t *d = dst; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); - transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, - &t14); - int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); - int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); - int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); - int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); - int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); - int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); - int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); - int16x8_t s14 = 
vreinterpretq_s16_u16(vmovl_u8(t14)); + do { + uint8x8_t t8, t9, t10, t11, t12, t13, t14; + load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); - uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, - horiz_const); - uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, - horiz_const); - uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, - horiz_const); - uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, - horiz_const); - uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, - horiz_const); - uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, - x_filter, horiz_const); - uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, - x_filter, horiz_const); - uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, - x_filter, horiz_const); - - transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - - store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, + &t14); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 8 * src_stride; - dst += 8 * dst_stride; - h -= 8; - } + uint8x8_t d0 = + convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); + uint8x8_t d1 = + convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const); + uint8x8_t d2 = + convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const); + uint8x8_t d3 = + convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const); + uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, + horiz_const); + uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, + horiz_const); + uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, + horiz_const); + uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); + + transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } #endif // AOM_ARCH_AARCH64 - while (h-- != 0) { - uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 - int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + while (h-- != 0) { + uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - int width = w; - const uint8_t *s = src + 8; - uint8_t *d = dst; + int width = w; + const uint8_t *s = src + 8; + uint8_t *d = dst; - __builtin_prefetch(d); + __builtin_prefetch(d); - do { - uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 - int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + do { + uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = 
vreinterpretq_s16_u16(vmovl_u8(t8)); - int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 - int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 - int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 - int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 - int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 - int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 - int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 - uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, - horiz_const); + uint8x8_t d0 = + convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); - vst1_u8(d, d0); + vst1_u8(d, d0); - s0 = s8; - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += src_stride; - dst += dst_stride; - } + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; } } -- GitLab From 130150255a874f306c9f5850937d0062c384e62c Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Tue, 23 Apr 2024 16:38:24 +0100 Subject: [PATCH 078/391] Add 4-tap specialisation to av1_convolve_x_sr_neon_dotprod Add specialised path for 4-tap filters in av1_convolve_x_sr_neon_dotprod. This gives between 20% and 30% uplift compared to using the 8-tap path. Change-Id: If1fcfa73261ce3cc8d863e732f507b586105a883 --- av1/common/arm/convolve_neon_dotprod.c | 229 ++++++++++++++++--------- 1 file changed, 150 insertions(+), 79 deletions(-) diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index 132da2442b..d670657f84 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c @@ -21,7 +21,7 @@ #include "av1/common/convolve.h" #include "av1/common/filter.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 @@ -109,7 +109,7 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + (1 << (ROUND0_BITS - 1))); const uint8x16_t range_limit = vdupq_n_u8(128); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Special case the following no-op filter as 128 won't fit into the // 8-bit signed dot-product instruction: @@ -197,25 +197,123 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( } } -static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, +static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, + const int8x8_t filters, const uint8x16_t permute_tbl) { - // Clamp sample range to [-128, 127] for 8-bit signed dot product. 
- int8x16_t clamped_samples = - vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); - // Accumulate dot product into 'correction' to account for range clamp. - int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filter, 0); + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we will halve the filter values. + int32x4_t acc = + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); - // Packing is performed by the caller. + // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); } +static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we will halve the filter values. + int32x4_t acc = + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_x_sr_4tap_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) { + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. 
+ const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + if (width == 4) { + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve4_4_x(s0, filter, permute_tbl); + int16x4_t t1 = convolve4_4_x(s1, filter, permute_tbl); + int16x4_t t2 = convolve4_4_x(s2, filter, permute_tbl); + int16x4_t t3 = convolve4_4_x(s3, filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int w = width; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve4_8_x(s0, filter, permute_tbl); + uint8x8_t d1 = convolve4_8_x(s1, filter, permute_tbl); + uint8x8_t d2 = convolve4_8_x(s2, filter, permute_tbl); + uint8x8_t d3 = convolve4_8_x(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, const int32x4_t correction, const uint8x16_t range_limit, @@ -265,12 +363,20 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); - if (filter_params_x->taps > 8) { + int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_taps > 8) { convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } + if (filter_taps <= 4) { + convolve_x_sr_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); // Dot product constants: // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding @@ -281,73 +387,38 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); - if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); - src += 2; + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = - convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); - int16x4_t d1 = - convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); - int16x4_t d2 = - convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); - int16x4_t d3 = - convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); - - // We halved the convolution filter values so - 1 from the right shift. - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); - - do { - int width = w; - const uint8_t *s = src; - uint8_t *d = dst; - - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - uint8x8_t d0 = - convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); - uint8x8_t d1 = - convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); - uint8x8_t d2 = - convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); - uint8x8_t d3 = - convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = + convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d1 = + convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d2 = + convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d3 = + convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, @@ -468,7 +539,7 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const); const uint8x16_t range_limit = vdupq_n_u8(128); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); if (w <= 4) { do { @@ -630,7 +701,7 @@ static INLINE void convolve_2d_sr_horiz_neon_dotprod( int height = im_h; if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. 
const int8x8_t x_filter = @@ -668,7 +739,7 @@ static INLINE void convolve_2d_sr_horiz_neon_dotprod( dst_ptr += dst_stride; } while (--height != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); -- GitLab From b2251797c38cb0235483605439efb36e12250f01 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Tue, 23 Apr 2024 16:52:21 +0100 Subject: [PATCH 079/391] Add 4-tap specialisation to av1_convolve_x_sr_neon_i8mm Add specialised path for 4-tap filters in av1_convolve_x_sr_neon_i8mm. This gives between 20% and 30% uplift compared to using the 8-tap path. Change-Id: Iabea03a65cb91963b5aca9bdd519473412cfa355 --- av1/common/arm/convolve_neon_i8mm.c | 202 ++++++++++++++++++---------- 1 file changed, 134 insertions(+), 68 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index bbcd6f201a..3fe2c98d6b 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -21,7 +21,7 @@ #include "av1/common/convolve.h" #include "av1/common/filter.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 @@ -120,7 +120,7 @@ static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src, dst += dst_stride; } while (--h != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - @@ -177,20 +177,110 @@ static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src, } } -static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter, - const uint8x16_t permute_tbl, - const int32x4_t horiz_const) { +static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); - // First 4 output values. - int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filter, 0); + // Dot product constants: + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we will halve the filter values. + int32x4_t acc = vdupq_n_s32((1 << (ROUND0_BITS - 1)) / 2); + int32x4_t sum = vusdotq_lane_s32(acc, permuted_samples, filters, 0); - // Packing is performed by the caller. + // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); } +static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // Dot product constants: + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we will halve the filter values. + int32x4_t acc = vdupq_n_s32((1 << (ROUND0_BITS - 1)) / 2); + + // First 4 output values. + int32x4_t sum0 = vusdotq_lane_s32(acc, permuted_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = vusdotq_lane_s32(acc, permuted_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_x_sr_4tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) { + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + if (width == 4) { + const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve4_4_x(s0, filter, perm_tbl); + int16x4_t t1 = convolve4_4_x(s1, filter, perm_tbl); + int16x4_t t2 = convolve4_4_x(s2, filter, perm_tbl); + int16x4_t t3 = convolve4_4_x(s3, filter, perm_tbl); + // We halved the filter values so -1 from right shift. 
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + int w = width; + const uint8_t *s = src; + uint8_t *d = dst; + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve4_8_x(s0, filter, perm_tbl); + uint8x8_t d1 = convolve4_8_x(s1, filter, perm_tbl); + uint8x8_t d2 = convolve4_8_x(s2, filter, perm_tbl); + uint8x8_t d3 = convolve4_8_x(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { @@ -234,12 +324,20 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); - if (filter_params_x->taps > 8) { + int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_taps > 8) { convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } + if (filter_taps <= 4) { + convolve_x_sr_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single // rounding right shift by FILTER_BITS - instead of a first rounding right // shift by ROUND0_BITS, followed by second rounding right shift by @@ -247,66 +345,34 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, // The outermost -1 is needed because we will halve the filter values. const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1)); - if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - src += 2; + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = convolve4_4_x(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve4_4_x(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve4_4_x(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve4_4_x(s3, x_filter, permute_tbl, horiz_const); - - // We halved the convolution filter values so - 1 from the right shift. 
- uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - - do { - const uint8_t *s = src; - uint8_t *d = dst; - int width = w; - - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); - uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); - uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); - uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, @@ -411,7 +477,7 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( // - which are generally faster than rounding shifts on modern CPUs. const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); if (w <= 4) { do { @@ -559,7 +625,7 @@ static INLINE void convolve_2d_sr_horiz_neon_i8mm( int height = im_h; if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = @@ -592,7 +658,7 @@ static INLINE void convolve_2d_sr_horiz_neon_i8mm( dst_ptr += dst_stride; } while (--height != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); -- GitLab From 90572597b4e21ef8f7cc23a360770fdbcb09e150 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Wed, 24 Apr 2024 14:13:47 +0100 Subject: [PATCH 080/391] Add 4-tap specialisation to av1_convolve_y_sr_neon Add specialised path for 4-tap filters in av1_convolve_y_sr_neon. This gives between 20% and 40% uplift compared to using the 6-tap path. 
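
For reference, a scalar sketch of what the specialised path computes is
shown below (illustrative only, not code from this patch; src, dst and
the loop are schematic, while clip_pixel() and ROUND_POWER_OF_TWO() are
the helpers from aom_dsp/aom_dsp_common.h):

  // One output pixel of the 4-tap vertical filter, using the middle
  // four taps of the 8-tap kernel array (filter_y + 2). The NEON path
  // computes this for 8 pixels per instruction, with the taps
  // pre-halved (they are all even) so one saturating rounding shift by
  // FILTER_BITS - 1 produces the final value.
  int sum = 0;
  for (int k = 0; k < 4; ++k) sum += filter_y[2 + k] * src[k * src_stride];
  dst[0] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
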
Change-Id: I2769e58db2ab1bbfd4e69c03b74d75bd3b920ee7 --- av1/common/arm/convolve_neon.c | 104 ++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c index c86215e64d..bd11b7cf29 100644 --- a/av1/common/arm/convolve_neon.c +++ b/av1/common/arm/convolve_neon.c @@ -460,6 +460,103 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } } +static INLINE uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filter) { + int16x8_t sum = vmulq_lane_s16(s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_y_sr_4tap_neon(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, int w, int h, + const int16_t *filter_y) { + // All filter values are even, halve to reduce intermediate precision + // requirements. + const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1); + + if (w == 4) { + uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride); + + int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + + src += 2 * src_stride; + + do { + uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride); + uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride); + uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride); + + int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23)); + int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34)); + int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45)); + int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56)); + + uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter); + uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + s01 = s45; + s12 = s56; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + uint8x8_t t0, t1, t2; + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + + int height = h; + const uint8_t *s = src + 3 * src_stride; + uint8_t *d = dst; + + do { + uint8x8_t t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter); + uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter); + uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter); + uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + static INLINE int16x4_t convolve6_4_y(const 
int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, @@ -1033,7 +1130,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); - const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; const int vert_offset = clamped_y_taps / 2 - 1; src -= vert_offset * src_stride; @@ -1050,7 +1147,10 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, // Filter values are even so halve to reduce precision requirements. const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1); - if (y_filter_taps < 8) { + if (y_filter_taps <= 4) { + convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); + } else if (y_filter_taps == 6) { convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); } else { convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); -- GitLab From ae37acecb4cdc8ddcedf0204e1fda4c0204a4f9a Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 18 Apr 2024 14:04:00 +0100 Subject: [PATCH 081/391] Further optimization of aom_convolve8_neon 4-tap Use the full vector bandwidth for 4xh blocks rather than half of it and load more vectors instead of loading a few and using VEXT. This gives up to 10% uplift. Change-Id: Icdad0952229f647d9dd6f6baafd0852d1a15ccc4 --- aom_dsp/arm/aom_convolve8_neon.c | 77 +++++++++++--------------------- 1 file changed, 25 insertions(+), 52 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index 9a3ff8079c..193844d06c 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c @@ -232,17 +232,6 @@ static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, } } -static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t filter) { - int16x4_t sum = vmul_lane_s16(s0, filter, 0); - sum = vmla_lane_s16(sum, s1, filter, 1); - sum = vmla_lane_s16(sum, s2, filter, 2); - sum = vmla_lane_s16(sum, s3, filter, 3); - - return sum; -} - static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -254,26 +243,20 @@ static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src, if (w == 4) { do { - int16x8_t t0 = - vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 0 * src_stride))); - int16x8_t t1 = - vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 1 * src_stride))); - - int16x4_t s0[4], s1[4]; - s0[0] = vget_low_s16(t0); - s0[1] = vget_low_s16(vextq_s16(t0, t0, 1)); - s0[2] = vget_low_s16(vextq_s16(t0, t0, 2)); - s0[3] = vget_low_s16(vextq_s16(t0, t0, 3)); - - s1[0] = vget_low_s16(t1); - s1[1] = vget_low_s16(vextq_s16(t1, t1, 1)); - s1[2] = vget_low_s16(vextq_s16(t1, t1, 2)); - s1[3] = vget_low_s16(vextq_s16(t1, t1, 3)); - - int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter); - int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter); - // We halved the filter values so -1 from right shift. 
- uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t t01[4]; + + t01[0] = load_unaligned_u8(src + 0, (int)src_stride); + t01[1] = load_unaligned_u8(src + 1, (int)src_stride); + t01[2] = load_unaligned_u8(src + 2, (int)src_stride); + t01[3] = load_unaligned_u8(src + 3, (int)src_stride); + + int16x8_t s01[4]; + s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0])); + s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1])); + s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2])); + s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3])); + + uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); @@ -287,37 +270,27 @@ static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src, const uint8_t *s = src; uint8_t *d = dst; - int16x8_t t0 = - vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride))); - int16x8_t t1 = - vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride))); - - s += 8; do { - int16x8_t t2 = - vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride))); - int16x8_t t3 = - vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride))); + uint8x8_t t0[4], t1[4]; + load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]); + load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]); int16x8_t s0[4], s1[4]; - s0[0] = t0; - s0[1] = vextq_s16(t0, t2, 1); - s0[2] = vextq_s16(t0, t2, 2); - s0[3] = vextq_s16(t0, t2, 3); + s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); + s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); + s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); + s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); - s1[0] = t1; - s1[1] = vextq_s16(t1, t3, 1); - s1[2] = vextq_s16(t1, t3, 2); - s1[3] = vextq_s16(t1, t3, 3); + s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0])); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1])); + s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2])); + s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3])); uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); store_u8_8x2(d, dst_stride, d0, d1); - t0 = t2; - t1 = t3; - s += 8; d += 8; width -= 8; -- GitLab From f5201a1266e3faebd6ac0e3db14820cb4be7b0c3 Mon Sep 17 00:00:00 2001 From: Mudassir Galaganath <mudassir.galaganath@ittiam.com> Date: Wed, 24 Apr 2024 11:41:18 +0530 Subject: [PATCH 082/391] Add SSE2 for resize_vert_dir() This CL adds SSE2 implementation for resize_vert_dir() function. Also, unit test for the same is added. Resolution Average Scaling w.r.t C 3840x2160 4.47x 2560x1440 5.16x 1920x1080 5.27x 1280x720 5.83x 640x480 6.16x 640x360 6.55x 256x256 7.69x This is a bit-exact change. 
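
For reference, the scalar operation being vectorised is roughly the
following (an illustrative sketch, not code from this patch; row[0..7]
denotes the eight input rows of the filter window and out/x are
schematic):

  // Output row i is a symmetric 8-tap filter of input rows
  // 2*i-3 .. 2*i+4 (edges padded). Symmetry lets rows equidistant from
  // the centre be added before multiplying, which the SSE2 version
  // exploits via _mm_madd_epi16 on interleaved pair sums.
  int sum = (1 << FILTER_BITS) >> 1;  // rounding offset
  for (int k = 0; k < 4; ++k)
    sum += av1_down2_symeven_half_filter[k] * (row[3 - k][x] + row[4 + k][x]);
  out[x] = clip_pixel(sum >> FILTER_BITS);
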
Change-Id: I23ade35421ff0aff63d2f0be2fafbad5b6f699c3 --- av1/av1.cmake | 1 + av1/common/av1_rtcd_defs.pl | 2 +- av1/common/x86/resize_sse2.c | 165 +++++++++++++++++++++++++++++++++++ test/frame_resize_test.cc | 7 ++ 4 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 av1/common/x86/resize_sse2.c diff --git a/av1/av1.cmake b/av1/av1.cmake index c057856733..dcc19b70d4 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -266,6 +266,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SSE2 "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c" "${AOM_ROOT}/av1/common/x86/convolve_sse2.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/resize_sse2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c") list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 7d917eb8b1..3973d919bd 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -555,7 +555,7 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { } add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; -specialize qw/resize_vert_dir avx2/; +specialize qw/resize_vert_dir sse2 avx2/; add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c new file mode 100644 index 0000000000..9714ecf776 --- /dev/null +++ b/av1/common/x86/resize_sse2.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/resize.h" + +#include "aom_dsp/x86/synonyms.h" + +#define PROCESS_RESIZE_Y_WD8 \ + /* ah0 ah1 ... ah7 */ \ + const __m128i AH = _mm_add_epi16(l0, l7); \ + /* bg0 bg1 ... bh7 */ \ + const __m128i BG = _mm_add_epi16(l1, l6); \ + /* cf0 cf1 ... cf7 */ \ + const __m128i CF = _mm_add_epi16(l2, l5); \ + /* de0 de1 ... de7 */ \ + const __m128i DE = _mm_add_epi16(l3, l4); \ + \ + /* ah0 bg0 ... ah3 bg3 */ \ + const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG); \ + /*cf0 de0 ... cf2 de2 */ \ + const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE); \ + \ + /* ah4 bg4... ah7 bg7 */ \ + const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG); \ + /* cf4 de4... 
cf7 de7 */                                                                  \
+  const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE);                        \
+                                                                             \
+  /* r00 r01 r02 r03 */                                                      \
+  const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]);                 \
+  const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]);                 \
+  __m128i r0 = _mm_add_epi32(r00, r01);                                      \
+  /* r04 r05 r06 r07 */                                                      \
+  const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]);                  \
+  const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]);                  \
+  __m128i r1 = _mm_add_epi32(r10, r11);                                      \
+                                                                             \
+  r0 = _mm_add_epi32(r0, round_const_bits);                                  \
+  r1 = _mm_add_epi32(r1, round_const_bits);                                  \
+  r0 = _mm_sra_epi32(r0, round_shift_bits);                                  \
+  r1 = _mm_sra_epi32(r1, round_shift_bits);                                  \
+                                                                             \
+  /* r00 ... r07 (8 values of each 16bit) */                                 \
+  const __m128i res_16b = _mm_packs_epi32(r0, r1);                           \
+  /* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */                   \
+  const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b);                \
+                                                                             \
+  __m128i res = _mm_min_epu8(res_8b0, clip_pixel);                           \
+  res = _mm_max_epu8(res, zero);                                             \
+  _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res);       \
+                                                                             \
+  l0 = l2;                                                                   \
+  l1 = l3;                                                                   \
+  l2 = l4;                                                                   \
+  l3 = l5;                                                                   \
+  l4 = l6;                                                                   \
+  l5 = l7;                                                                   \
+  data += 2 * stride;
+
+static INLINE void prepare_filter_coeffs(const int16_t *filter,
+                                         __m128i *const coeffs /* [2] */) {
+  // f0 f1 f2 f3 x x x x
+  const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
+
+  // f1 f0 f3 f2 x x x x
+  const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1);
+
+  // f3 f2 f3 f2 ...
+  coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55);
+  // f1 f0 f1 f0 ...
+  coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00);
+}
+
+bool resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
+                          int height, int height2, int stride, int start_col) {
+  // For the GM tool, the input layer height or width is guaranteed to be an
+  // even number. Hence the function 'down2_symodd()' is not invoked and SIMD
+  // optimization of the same is not implemented.
+  // When the input height is less than 8 and even, the potential input
+  // heights are limited to 2, 4, or 6. These scenarios require separate
+  // handling due to padding requirements. Invoking the C function here will
+  // eliminate the need for conditional statements within the subsequent SIMD
+  // code to manage these cases.
+  if (height & 1 || height < 8) {
+    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                             stride, start_col);
+  }
+
+  __m128i coeffs_y[2];
+  const int bits = FILTER_BITS;
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const uint8_t max_pixel = 255;
+  const __m128i clip_pixel = _mm_set1_epi8(max_pixel);
+  const __m128i zero = _mm_setzero_si128();
+  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
+
+  const int remain_col = stride % 8;
+
+  for (int j = start_col; j < stride - remain_col; j += 8) {
+    uint8_t *data = &intbuf[j];
+    // d0 ... d7
+    const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
+    // Pad the top 3 rows by repeating the first available input row.
+    // a0 ... a7
+    const __m128i l8_0 = l8_3;
+    // b0 ... b7
+    const __m128i l8_1 = l8_3;
+    // c0 ... c7
+    const __m128i l8_2 = l8_3;
+    // e0 ... e7
+    const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
+    // f0 ... f7
+    const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
+
+    // Convert to 16 bit as the addition of two source pixels crosses 8 bits.
+    __m128i l0 = _mm_unpacklo_epi8(l8_0, zero);  // A(128bit) = a0 - a7(16 bit)
+    __m128i l1 = _mm_unpacklo_epi8(l8_1, zero);  // B(128bit) = b0 - b7(16 bit)
+    __m128i l2 = _mm_unpacklo_epi8(l8_2, zero);  // C(128bit) = c0 - c7(16 bit)
+    __m128i l3 = _mm_unpacklo_epi8(l8_3, zero);  // D(128bit) = d0 - d7(16 bit)
+    __m128i l4 = _mm_unpacklo_epi8(l8_4, zero);  // E(128bit) = e0 - e7(16 bit)
+    __m128i l5 = _mm_unpacklo_epi8(l8_5, zero);  // F(128bit) = f0 - f7(16 bit)
+
+    // Increment the pointer such that the loading starts from row G.
+    data = data + 3 * stride;
+    // The core vertical SIMD processes 2 input rows simultaneously to generate
+    // output corresponding to 1 row. To streamline the core loop and eliminate
+    // the need for conditional checks, the remaining 4 rows are processed
+    // separately.
+    for (int i = 0; i < height - 4; i += 2) {
+      // g0 ... g7
+      __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
+      // h0 ... h7
+      __m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
+      __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);  // G(128bit):g0-g7(16b)
+      __m128i l7 = _mm_unpacklo_epi8(l8_7, zero);  // H(128bit):h0-h7(16b)
+
+      PROCESS_RESIZE_Y_WD8
+    }
+
+    __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
+    __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);
+    // Process the last 4 input rows here.
+    for (int i = height - 4; i < height; i += 2) {
+      __m128i l7 = l6;
+      PROCESS_RESIZE_Y_WD8
+    }
+  }
+
+  if (remain_col)
+    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                             stride, stride - remain_col);
+
+  return true;
+}
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
index 8891304192..cab6fe354d 100644
--- a/test/frame_resize_test.cc
+++ b/test/frame_resize_test.cc
@@ -154,4 +154,11 @@ INSTANTIATE_TEST_SUITE_P(
                            ::testing::ValuesIn(kFrameDim)));
 #endif
 
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1ResizeYTest,
+    ::testing::Combine(::testing::Values(resize_vert_dir_sse2),
+                       ::testing::ValuesIn(kFrameDim)));
+#endif
+
 }  // namespace
-- 
GitLab


From 4073590b26a546add973fb0f455b3c46d165c5d7 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 26 Apr 2024 11:30:14 -0700
Subject: [PATCH 083/391] resize.c: make resize_horz_dir static

fixes -Wmissing-prototypes warning

Bug: aomedia:3416
Change-Id: I46a1ac07200fb1df214ab1ed71d0e4fcbf267e8e
---
 av1/common/resize.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/av1/common/resize.c b/av1/common/resize.c
index 2b48b9fff4..727f84fdbb 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -543,8 +543,9 @@ Error:
   return mem_status;
 }
 
-void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf,
-                     int height, int filtered_length, int width2) {
+static INLINE void resize_horz_dir(const uint8_t *const input, int in_stride,
+                                   uint8_t *intbuf, int height,
+                                   int filtered_length, int width2) {
   for (int i = 0; i < height; ++i)
     down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i);
 }
-- 
GitLab


From ad5fd34ad9058384a55196f66e2001cc8c2c523f Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Mon, 29 Apr 2024 13:29:14 -0700
Subject: [PATCH 084/391] Ensure thread stack size is at least 256 KB

Fixes cases like musl where the default is lower:
https://wiki.musl-libc.org/functional-differences-from-glibc.html#Thread-stack-size

Bug: aomedia:2754, aomedia:3567
Change-Id: Ia6e211f9b87bc2efe376e7b9f4adb11741850b18
---
 aom_util/aom_thread.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/aom_util/aom_thread.c
b/aom_util/aom_thread.c index bdf2b7dfa6..783ffac32f 100644 --- a/aom_util/aom_thread.c +++ b/aom_util/aom_thread.c @@ -156,16 +156,18 @@ static int reset(AVxWorker *const worker) { // See: https://crbug.com/aomedia/3379 #if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \ !defined(NDEBUG) + const size_t kMinStackSize = 1024 * 1024; +#else + const size_t kMinStackSize = 256 * 1024; +#endif size_t stacksize; if (!pthread_attr_getstacksize(&attr, &stacksize)) { - const size_t kMinStackSize = 1 << 20; // 1 MiB if (stacksize < kMinStackSize && pthread_attr_setstacksize(&attr, kMinStackSize)) { pthread_attr_destroy(&attr); goto Error2; } } -#endif pthread_mutex_lock(&worker->impl_->mutex_); ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker); if (ok) worker->status_ = AVX_WORKER_STATUS_OK; -- GitLab From 2a3f8ed51db1e5885acd83d0f17d431e6641f4d2 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 26 Apr 2024 11:31:59 -0700 Subject: [PATCH 085/391] cfl: add missing prototypes fixes -Wmissing-prototypes warning Bug: aomedia:3416 Change-Id: I7661fce3f4c2ccaaef59f144030079dd5c126a13 --- av1/common/cfl.c | 5 +++-- av1/common/cfl.h | 27 +++++++++++++++------------ av1/common/ppc/cfl_ppc.c | 4 ++++ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/av1/common/cfl.c b/av1/common/cfl.c index 0e37d45980..bd11c4a6a0 100644 --- a/av1/common/cfl.c +++ b/av1/common/cfl.c @@ -159,8 +159,9 @@ static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, CFL_PREDICT_FN(c, lbd) #if CONFIG_AV1_HIGHBITDEPTH -void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, - int alpha_q3, int bit_depth, int width, int height) { +static INLINE void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, + int bit_depth, int width, int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { dst[i] = clip_pixel_highbd( diff --git a/av1/common/cfl.h b/av1/common/cfl.h index dcaa87bd48..dbb94d665b 100644 --- a/av1/common/cfl.h +++ b/av1/common/cfl.h @@ -95,6 +95,8 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, // will be constant allowing for loop unrolling and other constant propagated // goodness. #define CFL_SUBSAMPLE(arch, sub, bd, width, height) \ + void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ + const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3); \ void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \ cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \ @@ -170,6 +172,8 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, // will be constant allowing for loop unrolling and other constant propagated // goodness. #define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ + void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ + int16_t *dst); \ void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ int16_t *dst) { \ subtract_average_##arch(src, dst, width, height, round_offset, \ @@ -220,22 +224,21 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, return sub_avg[tx_size % TX_SIZES_ALL]; \ } -// For VSX SIMD optimization, the C versions of width == 4 subtract are -// faster than the VSX. As such, the VSX code calls the C versions. 
-void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
-void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
-void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
-
-#define CFL_PREDICT_lbd(arch, width, height)                              \
-  void cfl_predict_lbd_##width##x##height##_##arch(                       \
-      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride,           \
-      int alpha_q3) {                                                     \
-    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
-                           height);                                       \
+#define CFL_PREDICT_lbd(arch, width, height)                                   \
+  void cfl_predict_lbd_##width##x##height##_##arch(                            \
+      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); \
+  void cfl_predict_lbd_##width##x##height##_##arch(                            \
+      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride,                \
+      int alpha_q3) {                                                          \
+    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
+                           height);                                            \
   }
 
 #if CONFIG_AV1_HIGHBITDEPTH
 #define CFL_PREDICT_hbd(arch, width, height)                                    \
+  void cfl_predict_hbd_##width##x##height##_##arch(                             \
+      const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3,  \
+      int bd);                                                                  \
   void cfl_predict_hbd_##width##x##height##_##arch(                             \
       const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3,  \
       int bd) {                                                                 \
diff --git a/av1/common/ppc/cfl_ppc.c b/av1/common/ppc/cfl_ppc.c
index 6f88768f2f..27a7f07a0d 100644
--- a/av1/common/ppc/cfl_ppc.c
+++ b/av1/common/ppc/cfl_ppc.c
@@ -124,6 +124,10 @@ CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
 
 // Based on observation, for small blocks VSX does not outperform C (no 64bit
 // load and store intrinsics). So we call the C code for block widths 4.
+extern void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+extern void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+extern void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+
 cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
   static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
     cfl_subtract_average_4x4_c, /* 4x4 */
-- 
GitLab


From 8cbb78176c4f9247dd232492f9e29f130ab40a58 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 26 Apr 2024 12:11:47 -0700
Subject: [PATCH 086/391] merge aom_asm_stubs.c and highbd_convolve_sse2.c

This allows the functions in highbd_convolve_sse2.c to be made static.
This fixes some -Wmissing-prototypes warnings.
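
The warning pattern involved, in miniature (an illustrative sketch, not
code from this patch; the names are hypothetical): -Wmissing-prototypes
fires only for definitions with external linkage that lack a previous
prototype, so giving a file-local function internal linkage silences it.

  void helper(void) {}           // warns: no previous prototype
  static void helper2(void) {}   // no warning: internal linkage
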
This change is similar to what was done in libvpx: c67a2e76a subpixel_8t sse2: resolve missing declarations Bug: aomedia:3416 Change-Id: I473da99c88edfec47ca5b3384a74d3f076b565d9 --- aom_dsp/aom_dsp.cmake | 1 - aom_dsp/x86/aom_asm_stubs.c | 61 --------------- aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 90 ++++++++++++----------- aom_dsp/x86/highbd_convolve_sse2.c | 79 ++++++++++++++------ 4 files changed, 103 insertions(+), 128 deletions(-) delete mode 100644 aom_dsp/x86/aom_asm_stubs.c diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index 27099d36b2..6d8e5a961b 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -58,7 +58,6 @@ list(APPEND AOM_DSP_COMMON_ASM_SSE2 list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" "${AOM_ROOT}/aom_dsp/x86/convolve.h" "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c deleted file mode 100644 index 6c7fdd6eb1..0000000000 --- a/aom_dsp/x86/aom_asm_stubs.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve.h" - -#if HAVE_SSE2 -#if CONFIG_AV1_HIGHBITDEPTH -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; - -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2; - -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; - -// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) 
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) -#endif -#endif // HAVE_SSE2 diff --git a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm index d392225906..f84f8fa1f7 100644 --- a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm +++ b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -202,14 +202,15 @@ SECTION .text -;void aom_filter_block1d4_v8_sse2 +;void aom_highbd_filter_block1d4_v8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d4_v8_sse2) sym(aom_highbd_filter_block1d4_v8_sse2): @@ -272,14 +273,15 @@ sym(aom_highbd_filter_block1d4_v8_sse2): pop rbp ret -;void aom_filter_block1d8_v8_sse2 +;void aom_highbd_filter_block1d8_v8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d8_v8_sse2) sym(aom_highbd_filter_block1d8_v8_sse2): @@ -331,14 +333,15 @@ sym(aom_highbd_filter_block1d8_v8_sse2): pop rbp ret -;void aom_filter_block1d16_v8_sse2 +;void aom_highbd_filter_block1d16_v8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d16_v8_sse2) sym(aom_highbd_filter_block1d16_v8_sse2): @@ -394,14 +397,15 @@ sym(aom_highbd_filter_block1d16_v8_sse2): pop rbp ret -;void aom_filter_block1d4_h8_sse2 +;void aom_highbd_filter_block1d4_h8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d4_h8_sse2) sym(aom_highbd_filter_block1d4_h8_sse2): @@ -469,14 +473,15 @@ sym(aom_highbd_filter_block1d4_h8_sse2): pop rbp ret -;void aom_filter_block1d8_h8_sse2 +;void aom_highbd_filter_block1d8_h8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d8_h8_sse2) sym(aom_highbd_filter_block1d8_h8_sse2): @@ -535,14 +540,15 @@ sym(aom_highbd_filter_block1d8_h8_sse2): pop rbp ret -;void aom_filter_block1d16_h8_sse2 +;void aom_highbd_filter_block1d16_h8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter +; 
const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d16_h8_sse2) sym(aom_highbd_filter_block1d16_h8_sse2): diff --git a/aom_dsp/x86/highbd_convolve_sse2.c b/aom_dsp/x86/highbd_convolve_sse2.c index a2bb283222..40201aa193 100644 --- a/aom_dsp/x86/highbd_convolve_sse2.c +++ b/aom_dsp/x86/highbd_convolve_sse2.c @@ -15,10 +15,9 @@ // ----------------------------------------------------------------------------- -void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d4_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23_lo, srcReg34_lo; @@ -101,10 +100,9 @@ void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, } } -void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d4_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i addFilterReg64; __m128i secondFilters, thirdFilters; @@ -153,10 +151,9 @@ void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, } } -void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d8_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; @@ -262,10 +259,9 @@ void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, } } -void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d8_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i addFilterReg64; __m128i secondFilters, thirdFilters; @@ -330,22 +326,57 @@ void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, } } -void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d16_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, height, filter, bd); aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), dst_pitch, height, filter, bd); } -void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const 
int16_t *filter, int bd) { +static void aom_highbd_filter_block1d16_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, height, filter, bd); aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), dst_pitch, height, filter, bd); } + +// From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; + +// From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; + +// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) -- GitLab From b736e96c15b3efe643a82d394b20c6d44fd225b6 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 30 Apr 2024 12:52:42 -0700 Subject: [PATCH 087/391] noise_model_test.cc: fix -Wc++20-extensions warning Add an empty fourth argument to INSTANTIATE_TYPED_TEST_SUITE_P(). Fixes: aom/test/noise_model_test.cc:536:49: warning: passing no argument for the '...' parameter of a variadic macro is a C++20 extension Change-Id: Id1457ad67a101502f6b811eacfaf483dacd27848 --- test/noise_model_test.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc index b3edcc218e..87f607c155 100644 --- a/test/noise_model_test.cc +++ b/test/noise_model_test.cc @@ -532,8 +532,10 @@ typedef ::testing::Types<BitDepthParams<uint8_t, 8, false>, // lowbd BitDepthParams<uint16_t, 10, true>, // highbd data BitDepthParams<uint16_t, 12, true> > AllBitDepthParams; +// Note the empty final argument can be removed if C++20 is made the minimum +// requirement. INSTANTIATE_TYPED_TEST_SUITE_P(FlatBlockInstatiation, FlatBlockEstimatorTest, - AllBitDepthParams); + AllBitDepthParams, ); template <typename T> class NoiseModelUpdateTest : public ::testing::Test, public T { @@ -968,8 +970,10 @@ REGISTER_TYPED_TEST_SUITE_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks, NoiseStrengthChangeSignalsDifferentNoiseType, NoiseCoeffsSignalsDifferentNoiseType); +// Note the empty final argument can be removed if C++20 is made the minimum +// requirement. 
INSTANTIATE_TYPED_TEST_SUITE_P(NoiseModelUpdateTestInstatiation, - NoiseModelUpdateTest, AllBitDepthParams); + NoiseModelUpdateTest, AllBitDepthParams, ); TEST(NoiseModelGetGrainParameters, TestLagSize) { aom_film_grain_t film_grain; @@ -1368,5 +1372,7 @@ TYPED_TEST_P(WienerDenoiseTest, GradientTest) { REGISTER_TYPED_TEST_SUITE_P(WienerDenoiseTest, InvalidBlockSize, InvalidChromaSubsampling, GradientTest); +// Note the empty final argument can be removed if C++20 is made the minimum +// requirement. INSTANTIATE_TYPED_TEST_SUITE_P(WienerDenoiseTestInstatiation, WienerDenoiseTest, - AllBitDepthParams); + AllBitDepthParams, ); -- GitLab From 19d0cc368112563e0020920ac2ee4f8cd8fd598e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 30 Apr 2024 16:11:18 -0700 Subject: [PATCH 088/391] Define pthread_attr_getstacksize/setstacksize Bug: aomedia:2754, aomedia:3567 Change-Id: I0608ffe8f0e6fc2f99f8834408084308e51aaf3e --- aom_util/aom_pthread.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/aom_util/aom_pthread.h b/aom_util/aom_pthread.h index 1a97a0a9db..425a6b00f1 100644 --- a/aom_util/aom_pthread.h +++ b/aom_util/aom_pthread.h @@ -72,6 +72,20 @@ static INLINE int pthread_attr_destroy(pthread_attr_t *attr) { return 0; } +static INLINE int pthread_attr_getstacksize(const pthread_attr_t *attr, + size_t *stacksize) { + (void)attr; + (void)stacksize; + return EINVAL; +} + +static INLINE int pthread_attr_setstacksize(pthread_attr_t *attr, + size_t stacksize) { + (void)attr; + (void)stacksize; + return EINVAL; +} + static INLINE int pthread_create(pthread_t *const thread, const pthread_attr_t *attr, unsigned int(__stdcall *start)(void *), -- GitLab From f4eaf8b55e58102c3f9d2bab7658b9f6063ad400 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 30 Apr 2024 16:20:47 -0700 Subject: [PATCH 089/391] common/tools_common.h: port f{seek,tell}o fix from libvpx https://chromium-review.googlesource.com/c/webm/libvpx/+/5074786 bf0755418 Add the needed Android API level predicates. Bug: aomedia:3561 Change-Id: Ie5c4b3134f3842cd55e5b07e22dffa4ba2584ea8 --- common/tools_common.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/common/tools_common.h b/common/tools_common.h index b31371c670..9d891d1561 100644 --- a/common/tools_common.h +++ b/common/tools_common.h @@ -37,8 +37,13 @@ typedef int64_t FileOffset; #define fseeko fseeko64 #define ftello ftello64 typedef off64_t FileOffset; -#elif CONFIG_OS_SUPPORT -#include <sys/types.h> /* NOLINT*/ +#elif CONFIG_OS_SUPPORT && \ + !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) +/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + * Android API level 24. See + * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */ +#include <sys/types.h> /* NOLINT */ typedef off_t FileOffset; /* Use 32-bit file operations in WebM file format when building ARM * executables (.axf) with RVCT. */ -- GitLab From ae7e8cc435bc055aee3c30934f97cf8f2bd2fa6e Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 1 May 2024 10:08:46 -0700 Subject: [PATCH 090/391] resize_avx2.c: fix integer sanitizer conversion warning add missing cast in call to _mm256_set1_epi8(). 
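As a hedged illustration of the fix (a minimal standalone sketch, not the
actual libaom code): _mm256_set1_epi8() takes a char argument, so passing a
uint8_t holding 255 implicitly converts it to -1 wherever char is signed.
The bit pattern is unchanged, so an explicit cast documents the intent and
silences the sanitizer:

    #include <immintrin.h>
    #include <stdint.h>

    static __m256i broadcast_max_pixel(void) {
      const uint8_t max_pixel = 255;
      // The cast makes the value-changing conversion explicit; the
      // broadcast bit pattern (0xFF per lane) is identical either way.
      return _mm256_set1_epi8((char)max_pixel);
    }

The sanitizer diagnostic being silenced is quoted below.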
fixes: implicit conversion from type 'uint8_t' (aka 'unsigned char') of value 255 (8-bit, unsigned) to type 'char' changed the value to -1 (8-bit, signed) Change-Id: I9bf7a217f650a7255ed35ab7075358cb5e8e14d8 --- av1/common/x86/resize_avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c index c44edb88d9..47f015ae72 100644 --- a/av1/common/x86/resize_avx2.c +++ b/av1/common/x86/resize_avx2.c @@ -174,7 +174,7 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); const uint8_t max_pixel = 255; - const __m256i clip_pixel = _mm256_set1_epi8(max_pixel); + const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel); const __m256i zero = _mm256_setzero_si256(); prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); -- GitLab From 62732c87b889f20bc60b99c5675f7c2cc9ccdef1 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 30 Apr 2024 15:24:49 -0700 Subject: [PATCH 091/391] rtc: Extract some speed features used in screen Extract some features used for screen and add speed features for them: newmv skip for flat blocks, usage of idtx, and the min_thresh for scene detection. This is to allow for better control for content with mixed screen and video; also some of these may be used for non-screen input. No change in behavior. Change-Id: I29307032a283d19f8899d980baa61a7d9b0748a4 --- av1/encoder/nonrd_pickmode.c | 10 ++++++---- av1/encoder/ratectrl.c | 2 +- av1/encoder/speed_features.c | 6 ++++++ av1/encoder/speed_features.h | 16 ++++++++++++---- 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 08ecb8495a..dcaf13f1d9 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -2469,7 +2469,9 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( return true; } // Skip NEWMV search for flat blocks. - if (*this_mode == NEWMV && x->source_variance < 100) return true; + if (rt_sf->skip_newmv_flat_blocks_screen && *this_mode == NEWMV && + x->source_variance < 100) + return true; // Skip non-LAST for color on flat blocks. if (*ref_frame > LAST_FRAME && x->source_variance == 0 && (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || @@ -2960,9 +2962,9 @@ static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd( // TODO(marpan): Only allow for 8 bit-depth for now, re-enable for 10/12 bit // when issue 3359 is fixed. - if (cm->seq_params->bit_depth == 8 && - cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette && - !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk && + if (cm->seq_params->bit_depth == 8 && rt_sf->use_idtx_nonrd && + !skip_idtx_palette && !cpi->oxcf.txfm_cfg.use_inter_dct_only && + !x->force_zeromv_skip_for_blk && is_inter_mode(best_pickmode->best_mode) && best_pickmode->best_pred != NULL && (!rt_sf->prune_idtx_nonrd || diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 7639484df5..9daeb45c89 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -3073,7 +3073,7 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, } int num_zero_temp_sad = 0; uint32_t min_thresh = 10000; - if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { + if (cpi->sf.rt_sf.higher_thresh_scene_detection) { min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 
50000 : 100000;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 9a00042520..b0ab7feb3b 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1586,6 +1586,9 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
       sf->rt_sf.prune_palette_nonrd = 1;
       sf->rt_sf.increase_color_thresh_palette = 0;
     }
+    sf->rt_sf.skip_newmv_flat_blocks_screen = 1;
+    sf->rt_sf.use_idtx_nonrd = 1;
+    sf->rt_sf.higher_thresh_scene_detection = 0;
     sf->rt_sf.use_nonrd_altref_frame = 0;
     sf->rt_sf.use_rtc_tf = 0;
     sf->rt_sf.use_comp_ref_nonrd = 0;
@@ -2270,6 +2273,7 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
   rt_sf->var_part_split_threshold_shift = 7;
   rt_sf->gf_refresh_based_on_qp = 0;
   rt_sf->use_rtc_tf = 0;
+  rt_sf->use_idtx_nonrd = 0;
   rt_sf->prune_idtx_nonrd = 0;
   rt_sf->prune_palette_nonrd = 0;
   rt_sf->dct_only_palette_nonrd = 0;
@@ -2299,6 +2303,8 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
   rt_sf->increase_color_thresh_palette = false;
   rt_sf->selective_cdf_update = 0;
   rt_sf->force_only_last_ref = 0;
+  rt_sf->higher_thresh_scene_detection = 1;
+  rt_sf->skip_newmv_flat_blocks_screen = 0;
 }
 
 static fractional_mv_step_fp
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index d59cb38a71..d6b2949277 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1754,10 +1754,12 @@ typedef struct REAL_TIME_SPEED_FEATURES {
   // Must be off for lossless mode.
   int use_rtc_tf;
 
-  // Prune the use of the identity transform in nonrd_pickmode,
-  // used for screen content mode: only for smaller blocks
-  // and higher spatial variance, and when skip_txfm is not
-  // already set.
+  // Use of the identity transform in nonrd_pickmode.
+  int use_idtx_nonrd;
+
+  // Prune the use of the identity transform in nonrd_pickmode:
+  // only for smaller blocks and higher spatial variance, and when skip_txfm
+  // is not already set.
   int prune_idtx_nonrd;
 
   // Prune the use of paletter mode in nonrd pickmode.
@@ -1902,6 +1904,12 @@ typedef struct REAL_TIME_SPEED_FEATURES {
   // This generally leads to better coding efficiency but with some speed loss.
   // Only used for screen content and for nonrd_pickmode.
   bool increase_color_thresh_palette;
+
+  // Flag to indicate selecting a higher threshold for scene change detection.
+  int higher_thresh_scene_detection;
+
+  // Flag to indicate skipping the testing of NEWMV for flat blocks.
+  int skip_newmv_flat_blocks_screen;
 } REAL_TIME_SPEED_FEATURES;
 
 /*!\endcond */
-- 
GitLab


From 5ccdc66ab6eb8eb300eda854fab4ff250b2c2f92 Mon Sep 17 00:00:00 2001
From: Martin Storsjo <martin@martin.st>
Date: Wed, 1 May 2024 00:45:41 +0300
Subject: [PATCH 092/391] cpu.cmake: Do more elaborate test of whether SVE can
 be compiled

For Windows targets, Clang will successfully compile simpler SVE
functions, but if the function requires backing up and restoring SVE
registers (as part of the AAPCS calling convention), Clang will fail to
generate unwind data for this function, resulting in an error.

This issue is tracked upstream in Clang in
https://github.com/llvm/llvm-project/issues/80009.

Check whether the compiler can compile such a function, and disable SVE
if it is unable to handle that case.
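For reference, the probe added below boils down to a function of this shape
(a minimal sketch mirroring the source embedded in cpu.cmake):

    #include <arm_sve.h>

    void other(void);

    // 'a' is live across the call, so per the AAPCS the compiler must
    // back up and restore SVE register state around it; on Windows
    // targets Clang could not emit unwind data for such a function.
    svfloat32_t func(svfloat32_t a) {
      other();
      return a;
    }

If this snippet fails to compile, the build disables SVE and SVE2 up front
rather than failing on real SVE code later in the build.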
Change-Id: I307d7398cedd1942c39ef034431a51696264ff47 --- build/cmake/cpu.cmake | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake index 489dbcbf44..e16e9ec6a5 100644 --- a/build/cmake/cpu.cmake +++ b/build/cmake/cpu.cmake @@ -56,8 +56,18 @@ if("${AOM_TARGET_CPU}" STREQUAL "arm64") #endif #include <arm_sve.h> #include <arm_neon_sve_bridge.h>" HAVE_SVE_HEADERS) + # Check whether the compiler can compile SVE functions that require + # backup/restore of SVE registers according to AAPCS. Clang for Windows used + # to fail this, see https://github.com/llvm/llvm-project/issues/80009. + aom_check_source_compiles("arm_sve_preserve" " +#include <arm_sve.h> +void other(void); +svfloat32_t func(svfloat32_t a) { + other(); + return a; +}" CAN_COMPILE_SVE) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) - if(HAVE_SVE_HEADERS EQUAL 0) + if(HAVE_SVE_HEADERS EQUAL 0 OR CAN_COMPILE_SVE EQUAL 0) set(ENABLE_SVE 0) set(ENABLE_SVE2 0) endif() -- GitLab From ef3470f53bd4468cc3d6def1b24f88c3d60a41b9 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 26 Apr 2024 11:35:10 -0700 Subject: [PATCH 093/391] add variance_impl_ssse3.h fixes -Wmissing-prototypes warnings Bug: aomedia:3416 Change-Id: I68fc80f50734eddc5da6c1315c84effc2bceeebc --- aom_dsp/x86/jnt_variance_ssse3.c | 11 +---------- aom_dsp/x86/obmc_variance_sse4.c | 11 +---------- aom_dsp/x86/variance_impl_ssse3.c | 1 + aom_dsp/x86/variance_impl_ssse3.h | 27 +++++++++++++++++++++++++++ 4 files changed, 30 insertions(+), 20 deletions(-) create mode 100644 aom_dsp/x86/variance_impl_ssse3.h diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c index dd798ca54a..ed5b580b73 100644 --- a/aom_dsp/x86/jnt_variance_ssse3.c +++ b/aom_dsp/x86/jnt_variance_ssse3.c @@ -17,16 +17,7 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/synonyms.h" - -void aom_var_filter_block2d_bil_first_pass_ssse3( - const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - -void aom_var_filter_block2d_bil_second_pass_ssse3( - const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); +#include "aom_dsp/x86/variance_impl_ssse3.h" static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r, diff --git a/aom_dsp/x86/obmc_variance_sse4.c b/aom_dsp/x86/obmc_variance_sse4.c index 89b050eb20..164d0c28c9 100644 --- a/aom_dsp/x86/obmc_variance_sse4.c +++ b/aom_dsp/x86/obmc_variance_sse4.c @@ -22,21 +22,12 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/obmc_intrinsic_sse4.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/variance_impl_ssse3.h" //////////////////////////////////////////////////////////////////////////////// // 8 bit //////////////////////////////////////////////////////////////////////////////// -void aom_var_filter_block2d_bil_first_pass_ssse3( - const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - -void aom_var_filter_block2d_bil_second_pass_ssse3( - const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - static INLINE void 
obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, diff --git a/aom_dsp/x86/variance_impl_ssse3.c b/aom_dsp/x86/variance_impl_ssse3.c index 699002195b..952cca1aab 100644 --- a/aom_dsp/x86/variance_impl_ssse3.c +++ b/aom_dsp/x86/variance_impl_ssse3.c @@ -15,6 +15,7 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/variance_impl_ssse3.h" void aom_var_filter_block2d_bil_first_pass_ssse3( const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, diff --git a/aom_dsp/x86/variance_impl_ssse3.h b/aom_dsp/x86/variance_impl_ssse3.h new file mode 100644 index 0000000000..725b551c5c --- /dev/null +++ b/aom_dsp/x86/variance_impl_ssse3.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_ +#define AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_ + +#include <stdint.h> + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +#endif // AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_ -- GitLab From b44333201ba1d792c3bcb70603d730925d5f748a Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 3 May 2024 15:54:22 -0700 Subject: [PATCH 094/391] *_neon.c: add missing rtcd includes & CONFIG check fixes some -Wmissing-prototypes warnings. 
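As a hedged illustration of the pattern (file and function names below are
invented, not actual libaom code): a non-static SIMD kernel defined without
a prototype in scope triggers -Wmissing-prototypes, and including the
generated rtcd header that declares it resolves the warning:

    /* foo_neon.c -- hypothetical example */
    #include <stdint.h>

    #include "config/aom_dsp_rtcd.h"  // declares aom_foo_neon()

    // Without the include above, this external definition has no prior
    // declaration and the compiler emits -Wmissing-prototypes.
    void aom_foo_neon(const uint8_t *src, int stride) {
      (void)src;
      (void)stride;
    }

Functions that are only referenced under a given build configuration are
instead wrapped in the matching CONFIG_* check, as done for
aom_fdct8x8_neon() below.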
Bug: aomedia:3416 Change-Id: Ia4f30b2d061b8c03ac0c0dac40d7fc4004705125 --- aom_dsp/arm/fwd_txfm_neon.c | 3 +++ aom_dsp/arm/highbd_quantize_neon.c | 1 + aom_dsp/arm/intrapred_neon.c | 1 + aom_dsp/arm/subtract_neon.c | 1 + av1/common/arm/highbd_reconintra_neon.c | 1 + av1/common/arm/reconintra_neon.c | 1 + av1/common/arm/resize_neon.c | 1 + av1/encoder/arm/av1_error_neon.c | 3 ++- av1/encoder/arm/av1_k_means_neon.c | 2 +- av1/encoder/arm/hash_arm_crc32.c | 1 + av1/encoder/arm/hybrid_fwd_txfm_neon.c | 1 + 11 files changed, 14 insertions(+), 2 deletions(-) diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c index a4d6322f24..c87acfb86f 100644 --- a/aom_dsp/arm/fwd_txfm_neon.c +++ b/aom_dsp/arm/fwd_txfm_neon.c @@ -12,6 +12,7 @@ #include <arm_neon.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/arm/mem_neon.h" @@ -115,6 +116,7 @@ void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output, vst1q_s16(final_output + 1 * 8, out_23); } +#if CONFIG_INTERNAL_STATS void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { // stage 1 int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); @@ -302,3 +304,4 @@ void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { vst1q_s16(&final_output[7 * 8], input_7); } } +#endif // CONFIG_INTERNAL_STATS diff --git a/aom_dsp/arm/highbd_quantize_neon.c b/aom_dsp/arm/highbd_quantize_neon.c index 6149c9f13e..b3514296af 100644 --- a/aom_dsp/arm/highbd_quantize_neon.c +++ b/aom_dsp/arm/highbd_quantize_neon.c @@ -14,6 +14,7 @@ #include <string.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/quantize.h" diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c index c3716b3a78..55d7eb13a7 100644 --- a/aom_dsp/arm/intrapred_neon.c +++ b/aom_dsp/arm/intrapred_neon.c @@ -15,6 +15,7 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" diff --git a/aom_dsp/arm/subtract_neon.c b/aom_dsp/arm/subtract_neon.c index a195c40d19..01ae835be0 100644 --- a/aom_dsp/arm/subtract_neon.c +++ b/aom_dsp/arm/subtract_neon.c @@ -12,6 +12,7 @@ #include <arm_neon.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" diff --git a/av1/common/arm/highbd_reconintra_neon.c b/av1/common/arm/highbd_reconintra_neon.c index 170491b504..8fd4a9941f 100644 --- a/av1/common/arm/highbd_reconintra_neon.c +++ b/av1/common/arm/highbd_reconintra_neon.c @@ -13,6 +13,7 @@ #include <assert.h> #include "aom_dsp/arm/sum_neon.h" +#include "config/av1_rtcd.h" #define MAX_UPSAMPLE_SZ 16 diff --git a/av1/common/arm/reconintra_neon.c b/av1/common/arm/reconintra_neon.c index 3db39987a6..d31c4a9443 100644 --- a/av1/common/arm/reconintra_neon.c +++ b/av1/common/arm/reconintra_neon.c @@ -13,6 +13,7 @@ #include <assert.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c index b00ebd1fc2..a6d4b62964 100644 --- a/av1/common/arm/resize_neon.c +++ b/av1/common/arm/resize_neon.c @@ -16,6 +16,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/resize.h" #include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" static INLINE int16x4_t convolve8_4(const 
int16x4_t s0, const int16x4_t s1, diff --git a/av1/encoder/arm/av1_error_neon.c b/av1/encoder/arm/av1_error_neon.c index 26d06b46fe..1d4299fec9 100644 --- a/av1/encoder/arm/av1_error_neon.c +++ b/av1/encoder/arm/av1_error_neon.c @@ -13,6 +13,7 @@ #include <assert.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" @@ -63,7 +64,7 @@ int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, } int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { + intptr_t block_size) { uint64x2_t err_u64 = vdupq_n_u64(0); assert(block_size >= 16); diff --git a/av1/encoder/arm/av1_k_means_neon.c b/av1/encoder/arm/av1_k_means_neon.c index d13cc65ae0..586376970f 100644 --- a/av1/encoder/arm/av1_k_means_neon.c +++ b/av1/encoder/arm/av1_k_means_neon.c @@ -12,7 +12,7 @@ #include "aom_dsp/arm/sum_neon.h" #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" static int32x4_t k_means_multiply_add_neon(const int16x8_t a) { const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a)); diff --git a/av1/encoder/arm/hash_arm_crc32.c b/av1/encoder/arm/hash_arm_crc32.c index 91fc1e00a5..6417839ede 100644 --- a/av1/encoder/arm/hash_arm_crc32.c +++ b/av1/encoder/arm/hash_arm_crc32.c @@ -19,6 +19,7 @@ #include <stdint.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #define CRC_LOOP(op, crc, type, buf, len) \ while ((len) >= sizeof(type)) { \ diff --git a/av1/encoder/arm/hybrid_fwd_txfm_neon.c b/av1/encoder/arm/hybrid_fwd_txfm_neon.c index 6cf835a243..1d83bec168 100644 --- a/av1/encoder/arm/hybrid_fwd_txfm_neon.c +++ b/av1/encoder/arm/hybrid_fwd_txfm_neon.c @@ -12,6 +12,7 @@ #include <arm_neon.h> #include "aom_dsp/txfm_common.h" +#include "config/av1_rtcd.h" static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) { int32x4x2_t b0 = -- GitLab From ad7b78b65f6f60eb2f193f2c069d0b841ec9dc88 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 3 May 2024 16:32:48 -0700 Subject: [PATCH 095/391] *_neon.c: make some functions static fixes some -Wmissing-prototypes warnings Bug: aomedia:3416 Change-Id: I571e0f5329808a3129a3b66f54d579da72e0eb86 --- aom_dsp/arm/blend_a64_mask_neon.c | 5 +++-- aom_dsp/arm/highbd_intrapred_neon.c | 2 +- av1/common/arm/av1_inv_txfm_neon.c | 12 +++++------- av1/common/arm/selfguided_neon.c | 8 ++++---- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c index 1bc3b80310..48ff683e96 100644 --- a/aom_dsp/arm/blend_a64_mask_neon.c +++ b/aom_dsp/arm/blend_a64_mask_neon.c @@ -20,8 +20,9 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/blend.h" -uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b, - uint16x8_t round_offset) { +static uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, + uint16x8_t b, + uint16x8_t round_offset) { const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a)); diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c index dc47974c68..d5a0044d6c 100644 --- a/aom_dsp/arm/highbd_intrapred_neon.c +++ b/aom_dsp/arm/highbd_intrapred_neon.c @@ -1201,7 +1201,7 @@ HIGHBD_SMOOTH_H_NXM(8, 32) // For width 16 and above. 
#define HIGHBD_SMOOTH_H_PREDICTOR(W) \ - void highbd_smooth_h_##W##xh_neon( \ + static void highbd_smooth_h_##W##xh_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \ const uint16_t *const left_column, const int height) { \ const uint16_t top_right = top_row[(W)-1]; \ diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c index 09e5166b14..4a66b9016a 100644 --- a/av1/common/arm/av1_inv_txfm_neon.c +++ b/av1/common/arm/av1_inv_txfm_neon.c @@ -447,7 +447,7 @@ static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out, out[7] = step1; } -void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { +static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { assert(!(size % 4)); if (!bit) return; const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); @@ -3661,7 +3661,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); } row_txfm(cur_a, cur_a, INV_COS_BIT); - av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); + round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); if (lr_flip == 1) { for (int j = 0; j < buf_size_w_div8; ++j) { flip_buf_ud_neon(&cur_a[j * 8], 8); @@ -3736,8 +3736,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( } for (int j = 0; j < buf_size_w_div8; ++j) { col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); - av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, - -shift[1]); + round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { @@ -4112,7 +4111,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); } row_txfm(cur_a, cur_a, INV_COS_BIT); - av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); + round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); if (lr_flip == 1) { for (int j = 0; j < buf_size_w_div8; ++j) { flip_buf_ud_neon(&cur_a[j * 8], 8); @@ -4130,8 +4129,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( } for (int j = 0; j < buf_size_w_div8; ++j) { col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); - av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, - -shift[1]); + round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c index 1d3a3cc038..08e298f7f3 100644 --- a/av1/common/arm/selfguided_neon.c +++ b/av1/common/arm/selfguided_neon.c @@ -1124,10 +1124,10 @@ static void final_filter_fast_internal(uint16_t *A, int32_t *B, } while (h > 0); } -void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, - int16_t *src, const int src_stride, int32_t *dst, - const int dst_stride, const int width, - const int height) { +static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, + int16_t *src, const int src_stride, + int32_t *dst, const int dst_stride, + const int width, const int height) { int16x8_t s0; int32_t *B_tmp, *dst_ptr; uint16_t *A_tmp; -- GitLab From 01b991665819deb145adf58edbd49b1c3ee7032f Mon Sep 17 00:00:00 2001 From: Samuthirika S <samuthirika.s@ittiam.com> Date: Fri, 3 May 2024 18:22:16 +0530 Subject: [PATCH 096/391] Add AVX2 for av1_resize_horz_dir() This CL adds AVX2 implementation for av1_resize_horz_dir() function. Also, unit test for the same is added. 
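A usage sketch of the new entry point (variable names here are illustrative;
this mirrors how av1_resize_plane_to_half() drives the horizontal pass):

    // Halve the width of an 8-bit plane of size width x height.
    const int width2 = width / 2;
    uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(*intbuf) * width2 * height);
    if (intbuf != NULL) {
      // Each row is low-pass filtered and decimated 2:1; the rtcd
      // dispatcher selects the AVX2 kernel when it is available.
      av1_resize_horz_dir(input, in_stride, intbuf, height, width, width2);
      aom_free(intbuf);
    }

Average speedups of the AVX2 path over the C reference, per resolution: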
Resolution Average Scaling w.r.t C 3840x2160 3.16x 2560x1440 3.25x 1920x1080 3.24x 1280x720 3.42x 640x480 3.80x 640x360 3.85x 256x256 5.53x This is a bit-exact change. Change-Id: I19160f5fe66b3d95abdb53b9ea443500baa71ec6 --- av1/common/av1_rtcd_defs.pl | 7 +- av1/common/resize.c | 31 ++-- av1/common/resize.h | 3 + av1/common/x86/resize_avx2.c | 327 +++++++++++++++++++++++++++++++++-- av1/common/x86/resize_sse2.c | 13 +- test/frame_resize_test.cc | 117 +++++++++++-- 6 files changed, 449 insertions(+), 49 deletions(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 3973d919bd..7035fb3bdd 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -554,8 +554,11 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; } -add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; -specialize qw/resize_vert_dir sse2 avx2/; +add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; +specialize qw/av1_resize_vert_dir sse2 avx2/; + +add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2"; +specialize qw/av1_resize_horz_dir avx2/; add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; diff --git a/av1/common/resize.c b/av1/common/resize.c index 727f84fdbb..505fccd43b 100644 --- a/av1/common/resize.c +++ b/av1/common/resize.c @@ -337,8 +337,8 @@ static int32_t get_upscale_convolve_x0(int in_length, int out_length, return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); } -static void down2_symeven(const uint8_t *const input, int length, - uint8_t *output) { +void down2_symeven(const uint8_t *const input, int length, uint8_t *output, + int start_offset) { // Actual filter len = 2 * filter_len_half. const int16_t *filter = av1_down2_symeven_half_filter; const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; @@ -350,7 +350,7 @@ static void down2_symeven(const uint8_t *const input, int length, l2 += (l2 & 1); if (l1 > l2) { // Short input length. - for (i = 0; i < length; i += 2) { + for (i = start_offset; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += @@ -362,7 +362,7 @@ static void down2_symeven(const uint8_t *const input, int length, } } else { // Initial part. 
- for (i = 0; i < l1; i += 2) { + for (i = start_offset; i < l1; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j]; @@ -492,7 +492,7 @@ static void resize_multistep(const uint8_t *const input, int length, if (filteredlength & 1) down2_symodd(in, filteredlength, out); else - down2_symeven(in, filteredlength, out); + down2_symeven(in, filteredlength, out, 0); filteredlength = proj_filteredlength; } if (filteredlength != olength) { @@ -521,8 +521,8 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { } } -bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, - int height, int height2, int width2, int start_col) { +bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int width2, int start_col) { bool mem_status = true; uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height); uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2); @@ -533,7 +533,7 @@ bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, for (int i = start_col; i < width2; ++i) { fill_col_to_arr(intbuf + i, width2, height, arrbuf); - down2_symeven(arrbuf, height, arrbuf2); + down2_symeven(arrbuf, height, arrbuf2, 0); fill_arr_to_col(output + i, out_stride, height2, arrbuf2); } @@ -543,11 +543,12 @@ Error: return mem_status; } -static INLINE void resize_horz_dir(const uint8_t *const input, int in_stride, - uint8_t *intbuf, int height, - int filtered_length, int width2) { +void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, + uint8_t *intbuf, int height, int filtered_length, + int width2) { for (int i = 0; i < height; ++i) - down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i); + down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i, + 0); } bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, @@ -559,10 +560,10 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, } // Resize in the horizontal direction - resize_horz_dir(input, in_stride, intbuf, height, width, width2); + av1_resize_horz_dir(input, in_stride, intbuf, height, width, width2); // Resize in the vertical direction - bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2, - width2, 0 /*start_col*/); + bool mem_status = av1_resize_vert_dir(intbuf, output, out_stride, height, + height2, width2, 0 /*start_col*/); aom_free(intbuf); return mem_status; } diff --git a/av1/common/resize.h b/av1/common/resize.h index de71f5d539..6b233f8259 100644 --- a/av1/common/resize.h +++ b/av1/common/resize.h @@ -101,6 +101,9 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); +void down2_symeven(const uint8_t *const input, int length, uint8_t *output, + int start_offset); + bool should_resize_by_half(int height, int width, int height2, int width2); // Returns 1 if a superres upscaled frame is scaled and 0 otherwise. diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c index 47f015ae72..f0421703c6 100644 --- a/av1/common/x86/resize_avx2.c +++ b/av1/common/x86/resize_avx2.c @@ -41,7 +41,7 @@ s[8] = _mm256_unpackhi_epi8(s68, s79); \ \ __m256i res_out[2] = { 0 }; \ - resize_y_convolve(s, coeffs_y, res_out); \ + resize_convolve(s, coeffs_y, res_out); \ \ /* r00... 
r07 */ \ __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ @@ -52,7 +52,7 @@ res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ \ __m256i res_out_b[2] = { 0 }; \ - resize_y_convolve(s + 5, coeffs_y, res_out_b); \ + resize_convolve(s + 5, coeffs_y, res_out_b); \ \ /* r08... r015 */ \ __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \ @@ -91,7 +91,7 @@ s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \ \ __m256i res_out[2] = { 0 }; \ - resize_y_convolve(s, coeffs_y, res_out); \ + resize_convolve(s, coeffs_y, res_out); \ \ /* r00... r07 */ \ __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ @@ -108,9 +108,107 @@ res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \ res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero); -static INLINE void resize_y_convolve(const __m256i *const s, - const __m256i *const coeffs, - __m256i *res_out) { +#define PROCESS_RESIZE_X_WD32 \ + /* a0 a1 ..... a30 a31 */ \ + __m256i row0 = _mm256_loadu_si256( \ + (__m256i *)&input[i * in_stride + j - filter_offset]); \ + /* b0 b1 ..... b30 b31 */ \ + __m256i row1 = _mm256_loadu_si256( \ + (__m256i *)&input[(i + 1) * in_stride + j - filter_offset]); \ + /* a0 .... a15 || b0.... b15 */ \ + __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); \ + /* a16 .... a31 || b16 .... b31 */ \ + __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); \ + filter_offset = 3; \ + \ + /* Pad start pixels to the left, while processing the first pixels in the \ + row. */ \ + if (j == 0) { \ + /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \ + row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \ + /* a13 a14 a15 a16.....a28 || b13 b14 b15 b16.....b28 */ \ + row1 = _mm256_alignr_epi8(r1, r0, 13); \ + r0 = row0; \ + r1 = row1; \ + } \ + \ + /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/ \ + __m128i row0_0 = _mm_loadl_epi64( \ + (__m128i *)&input[i * in_stride + 32 + j - filter_offset]); \ + /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */ \ + __m128i row1_0 = _mm_loadl_epi64( \ + (__m128i *)&input[(i + 1) * in_stride + 32 + j - filter_offset]); \ + __m256i r2 = _mm256_permute2x128_si256( \ + _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \ + \ + /* Pad end pixels to the right, while processing the last pixels in the \ + row. */ \ + const int is_last_cols32 = (j + 32 == filtered_length); \ + if (is_last_cols32) { \ + r2 = _mm256_shuffle_epi8(r2, wd32_end_pad_mask); \ + } \ + \ + /* Process even pixels of the first row */ \ + /* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */ \ + s0[0] = _mm256_alignr_epi8(r1, r0, 0); \ + /* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */ \ + s0[1] = _mm256_alignr_epi8(r1, r0, 2); \ + /* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */ \ + s0[2] = _mm256_alignr_epi8(r1, r0, 4); \ + /* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */ \ + s0[3] = _mm256_alignr_epi8(r1, r0, 6); \ + \ + /* Process even pixels of the second row */ \ + /* a13 a14 a15 a16 ..... a28 | b13 b14 b15 b16 ..... b28 */ \ + s1[0] = _mm256_alignr_epi8(r2, r1, 0); \ + /* a15 a16 a17 a18 ..... a30 | b15 b16 b17 b18 ..... b30 */ \ + s1[1] = _mm256_alignr_epi8(r2, r1, 2); \ + /* a17 a18 a19 a20 ..... a32 | b17 b18 b19 b20 ..... b32 */ \ + s1[2] = _mm256_alignr_epi8(r2, r1, 4); \ + /* a19 a20 a21 a22 ..... a34 | b19 b20 b21 b22 ..... 
b34 */ \ + s1[3] = _mm256_alignr_epi8(r2, r1, 6); \ + \ + /* The register res_out_0 stores the result of start-16 pixels corresponding \ +to the first and second rows whereas res_out_1 stores the end-16 pixels. */ \ + __m256i res_out_0[2], res_out_1[2]; \ + res_out_1[0] = res_out_1[1] = zero; \ + res_out_0[0] = res_out_0[1] = zero; \ + resize_convolve(s0, coeffs_x, res_out_0); \ + resize_convolve(s1, coeffs_x, res_out_1); \ + \ + /* Result of 32 pixels of row0 (a0 to a32) */ \ + res_out_0[0] = _mm256_sra_epi32( \ + _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); \ + res_out_1[0] = _mm256_sra_epi32( \ + _mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits); \ + /* r00-r03 r08-r011 | r04-r07 r012-r015 */ \ + __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \ + \ + /* result of 32 pixels of row1 (b0 to b32) */ \ + res_out_0[1] = _mm256_sra_epi32( \ + _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \ + res_out_1[1] = _mm256_sra_epi32( \ + _mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits); \ + /* r10-r13 r18-r111 | r14-r17 r112-r115 */ \ + __m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]); \ + \ + /* Convert the result from 16bit to 8bit */ \ + /* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115 \ + */ \ + __m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1); \ + __m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel); \ + res_out_row01 = _mm256_max_epu8(res_out_r01, zero); \ + __m128i low_128 = CAST_LOW(res_out_row01); \ + __m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1); \ + \ + _mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2], \ + _mm_unpacklo_epi32(low_128, high_128)); \ + _mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2], \ + _mm_unpackhi_epi32(low_128, high_128)); + +static INLINE void resize_convolve(const __m256i *const s, + const __m256i *const coeffs, + __m256i *res_out) { const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]); const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]); const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]); @@ -152,8 +250,9 @@ static INLINE void prepare_filter_coeffs(const int16_t *filter, coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4)); } -bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, - int height, int height2, int stride, int start_col) { +bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int stride, + int start_col) { assert(start_col <= stride); // For the GM tool, the input layer height or width is assured to be an even // number. Hence the function 'down2_symodd()' is not invoked and SIMD @@ -164,8 +263,8 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, // eliminate the need for conditional statements within the subsequent SIMD // code to manage these cases. 
if (height & 1 || height < 8) { - return resize_vert_dir_c(intbuf, output, out_stride, height, height2, - stride, start_col); + return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, start_col); } __m256i s[10], coeffs_y[4]; @@ -404,8 +503,212 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, } if (remain_col) - return resize_vert_dir_c(intbuf, output, out_stride, height, height2, - stride, stride - remain_col); + return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, stride - remain_col); return true; } + +// Masks used for width 32 and 8 pixels, with left and right padding +// requirements +static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, + 0, 0, 0, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12 }; + +static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 0, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2 }; + +static const uint8_t wd8_right_padding_mask[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 +}; + +void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, + uint8_t *intbuf, int height, int filtered_length, + int width2) { + assert(height % 2 == 0); + // Currently, Invoking C function for width less than 32. Optimize the below, + // by invoking SSE2 once the implementation for the same is available. + if (filtered_length < 32) { + av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length, + width2); + return; + } + + const int filt_length = sizeof(av1_down2_symeven_half_filter); + assert(filt_length % 2 == 0); + (void)filt_length; + + __m256i s0[4], s1[4], coeffs_x[4]; + + const int bits = FILTER_BITS; + const int dst_stride = width2; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + + const uint8_t max_pixel = 255; + const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i wd32_start_pad_mask = + _mm256_loadu_si256((__m256i *)wd32_left_padding_mask); + const __m256i wd32_end_pad_mask = + _mm256_loadu_si256((__m256i *)wd32_right_padding_mask); + const __m256i wd8_end_pad_mask = + _mm256_loadu_si256((__m256i *)wd8_right_padding_mask); + prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x); + + // The core horizontal SIMD processes 32 input pixels of 2 rows simultaneously + // to generate output corresponding to 2 rows. To streamline the core loop and + // eliminate the need for conditional checks, the remaining columns (16 or 8) + // are processed separately. 
+ if (filtered_length % 32 == 0) { + for (int i = 0; i < height; i += 2) { + int filter_offset = 0; + for (int j = 0; j < filtered_length; j += 32) { + PROCESS_RESIZE_X_WD32 + } + } + } else { + for (int i = 0; i < height; i += 2) { + int filter_offset = 0; + int remain_col = filtered_length % 32; + for (int j = 0; j + 32 <= filtered_length; j += 32) { + PROCESS_RESIZE_X_WD32 + } + + int wd_processed = filtered_length - remain_col; + if (remain_col > 15) { + remain_col = filtered_length % 16; + const int in_idx = i * in_stride + wd_processed - filter_offset; + const int out_idx = (i * dst_stride) + wd_processed / 2; + // a0 a1 --- a15 + __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]); + // b0 b1 --- b15 + __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]); + // a0 a1 --- a15 || b0 b1 --- b15 + __m256i r0 = + _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); + + // a16 a17 --- a23 + row0 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16]); + // b16 b17 --- b23 + row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride]); + + // a16-a23 x x x x| b16-b23 x x x x + __m256i r1 = + _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); + + // Pad end pixels to the right, while processing the last pixels in the + // row. + const int is_last_cols16 = wd_processed + 16 == filtered_length; + if (is_last_cols16) { + r1 = _mm256_shuffle_epi8(r1, wd32_end_pad_mask); + } + + // a0 a1 --- a15 || b0 b1 --- b15 + s0[0] = r0; + // a2 a3 --- a17 || b2 b3 --- b17 + s0[1] = _mm256_alignr_epi8(r1, r0, 2); + // a4 a5 --- a19 || b4 b5 --- b19 + s0[2] = _mm256_alignr_epi8(r1, r0, 4); + // a6 a7 --- a21 || b6 b7 --- b21 + s0[3] = _mm256_alignr_epi8(r1, r0, 6); + + // result for 16 pixels (a0 to a15) of row0 and row1 + __m256i res_out_0[2]; + res_out_0[0] = res_out_0[1] = zero; + resize_convolve(s0, coeffs_x, res_out_0); + + // r00 -r07 + res_out_0[0] = _mm256_sra_epi32( + _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); + // r10-r17 + res_out_0[1] = _mm256_sra_epi32( + _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); + // r00-r03 r10-r13 r04-r07 r14-r17 + __m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]); + // r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17 + res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01); + res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel); + res_out_row01 = _mm256_max_epu8(res_out_row01, zero); + // r00-r03 r10-r13 r04-r07 r14-r17 + __m128i low_result = + CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8)); + // r00-r03 r04-r07 r10-r13 r14-r17 + low_result = _mm_shuffle_epi32(low_result, 0xd8); + + _mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result); + _mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride], + _mm_unpackhi_epi64(low_result, low_result)); + } + + wd_processed = filtered_length - remain_col; + if (remain_col > 7) { + remain_col = filtered_length % 8; + const int in_idx = i * in_stride + wd_processed - filter_offset; + const int out_idx = (i * dst_stride) + wd_processed / 2; + // a0 a1 --- a15 + __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]); + // b0 b1 --- b15 + __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]); + // a0 a1 --- a15 || b0 b1 --- b15 + __m256i r0 = + _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); + + // Pad end pixels to the right, while processing the last pixels in the + // row. 
+ const int is_last_cols_8 = wd_processed + 8 == filtered_length; + if (is_last_cols_8) r0 = _mm256_shuffle_epi8(r0, wd8_end_pad_mask); + + // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7 + s0[0] = r0; + // a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9 + s0[1] = _mm256_bsrli_epi128(r0, 2); + // a4 a5 a6 a7 a8 a9 a10 a10 | b4 b5 b6 b7 b8 b9 b10 b10 + s0[2] = _mm256_bsrli_epi128(r0, 4); + // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10 + s0[3] = _mm256_bsrli_epi128(r0, 6); + __m256i res_out_0[2]; + res_out_0[0] = res_out_0[1] = zero; + resize_convolve(s0, coeffs_x, res_out_0); + + // r00 - r03 | r10 - r13 + __m256i res_out = + _mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20); + // r00 - r03 | r10 - r13 + res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits), + round_shift_bits); + // r00-r03 r00-r03 r10-r13 r10-r13 + __m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out); + // r00-r03 r00-r03 r00-r03 r00-r03 r10-r13 r10-r13 r10-r13 r10-r13 + res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01); + res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel); + res_out_row01 = _mm256_max_epu8(res_out_row01, zero); + + *((int *)(intbuf + out_idx)) = + _mm_cvtsi128_si32(CAST_LOW(res_out_row01)); + *((int *)(intbuf + out_idx + dst_stride)) = + _mm_cvtsi128_si32(_mm256_extracti128_si256(res_out_row01, 1)); + } + + wd_processed = filtered_length - remain_col; + // When the remaining width is 2, the above code would not have taken + // care of padding required for (filtered_length - 4)th pixel. Hence, + // process that pixel again with the C code. + wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed; + if (remain_col) { + const int in_idx = (in_stride * i); + const int out_idx = (wd_processed / 2) + width2 * i; + + down2_symeven(input + in_idx, filtered_length, intbuf + out_idx, + wd_processed); + down2_symeven(input + in_idx + in_stride, filtered_length, + intbuf + out_idx + width2, wd_processed); + } + } + } +} diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c index 9714ecf776..c68371cb06 100644 --- a/av1/common/x86/resize_sse2.c +++ b/av1/common/x86/resize_sse2.c @@ -81,8 +81,9 @@ static INLINE void prepare_filter_coeffs(const int16_t *filter, coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00); } -bool resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, - int height, int height2, int stride, int start_col) { +bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int stride, + int start_col) { // For the GM tool, the input layer height or width is assured to be an even // number. Hence the function 'down2_symodd()' is not invoked and SIMD // optimization of the same is not implemented. @@ -92,8 +93,8 @@ bool resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, // eliminate the need for conditional statements within the subsequent SIMD // code to manage these cases. 
if (height & 1 || height < 8) { - return resize_vert_dir_c(intbuf, output, out_stride, height, height2, - stride, start_col); + return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, start_col); } __m128i coeffs_y[2]; @@ -158,8 +159,8 @@ bool resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, } if (remain_col) - return resize_vert_dir_c(intbuf, output, out_stride, height, height2, - stride, stride - remain_col); + return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, stride - remain_col); return true; } diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc index cab6fe354d..b0bcb7b978 100644 --- a/test/frame_resize_test.cc +++ b/test/frame_resize_test.cc @@ -79,14 +79,12 @@ class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { } void RunTest() { - int width2 = width_, height2 = height_; - for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); for (int level = 1; level < n_levels_; level++) { - width2 = (width_ >> level); - height2 = (height_ >> level); - resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, width2, - 0); + const int width2 = (width_ >> level); + const int height2 = (height_ >> level); + av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, + width2, 0); test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2); @@ -94,17 +92,15 @@ class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { } void SpeedTest() { - int width2 = width_, height2 = height_; - for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); for (int level = 1; level < n_levels_; level++) { - width2 = (width_ >> level); - height2 = (height_ >> level); + const int width2 = (width_ >> level); + const int height2 = (height_ >> level); aom_usec_timer ref_timer; aom_usec_timer_start(&ref_timer); for (int j = 0; j < kIters; j++) { - resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, - width2, 0); + av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, + width2, 0); } aom_usec_timer_mark(&ref_timer); const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); @@ -150,14 +146,107 @@ TEST_P(AV1ResizeYTest, DISABLED_SpeedTest) { SpeedTest(); } #if HAVE_AVX2 INSTANTIATE_TEST_SUITE_P( AVX2, AV1ResizeYTest, - ::testing::Combine(::testing::Values(resize_vert_dir_avx2), + ::testing::Combine(::testing::Values(av1_resize_vert_dir_avx2), ::testing::ValuesIn(kFrameDim))); #endif #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P( SSE2, AV1ResizeYTest, - ::testing::Combine(::testing::Values(resize_vert_dir_sse2), + ::testing::Combine(::testing::Values(av1_resize_vert_dir_sse2), + ::testing::ValuesIn(kFrameDim))); +#endif + +typedef void (*LowBDResize_x_Func)(const uint8_t *const input, int in_stride, + uint8_t *intbuf, int height, + int filteredlength, int width2); + +typedef tuple<LowBDResize_x_Func, FrameDimension> Resize_x_TestParams; + +class AV1ResizeXTest : public ::testing::TestWithParam<Resize_x_TestParams> { + public: + void SetUp() { + test_fun_ = GET_PARAM(0); + frame_dim_ = GET_PARAM(1); + width_ = std::get<0>(frame_dim_); + height_ = std::get<1>(frame_dim_); + const int msb = get_msb(AOMMIN(width_, height_)); + n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + src_ = (uint8_t *)aom_malloc(width_ * height_ * sizeof(*src_)); + ref_dest_ = + (uint8_t *)aom_calloc((width_ * height_) / 2, sizeof(*ref_dest_)); + test_dest_ = + (uint8_t 
*)aom_calloc((width_ * height_) / 2, sizeof(*test_dest_)); + } + + void RunTest() { + for (int i = 0; i < width_ * height_; ++i) src_[i] = rng_.Rand8(); + + for (int level = 1; level < n_levels_; ++level) { + const int width2 = (width_ >> level); + av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1, + width2); + test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2); + AssertOutputBufferEq(ref_dest_, test_dest_, width2, height_); + } + } + + void SpeedTest() { + for (int i = 0; i < width_ * height_; ++i) src_[i] = rng_.Rand8(); + + for (int level = 1; level < n_levels_; ++level) { + const int width2 = (width_ >> level); + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < kIters; ++j) { + av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1, + width2); + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (int j = 0; j < kIters; ++j) { + test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2); + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "level: " << level << " [" << width2 << " x " << height_ + << "] C time = " << ref_time << " , SIMD time = " << tst_time + << " scaling=" << float(1.00) * ref_time / tst_time << "x \n"; + } + } + + void TearDown() { + aom_free(src_); + aom_free(ref_dest_); + aom_free(test_dest_); + } + + private: + LowBDResize_x_Func test_fun_; + FrameDimension frame_dim_; + int width_; + int height_; + int n_levels_; + uint8_t *src_; + uint8_t *ref_dest_; + uint8_t *test_dest_; + libaom_test::ACMRandom rng_; +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1ResizeXTest); + +TEST_P(AV1ResizeXTest, RunTest) { RunTest(); } + +TEST_P(AV1ResizeXTest, DISABLED_SpeedTest) { SpeedTest(); } + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1ResizeXTest, + ::testing::Combine(::testing::Values(av1_resize_horz_dir_avx2), ::testing::ValuesIn(kFrameDim))); #endif -- GitLab From ea03a8d1b4a4ce446aa4e95e189f3e9864a7afee Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Mon, 29 Apr 2024 14:56:58 -0700 Subject: [PATCH 097/391] frame_resize_test: fix -Wunused-const-variable warning Protect `kFrameDim` with `#if HAVE_AVX2 || HAVE_SSE2`. Since the test uses the _c implementation as the reference there is no point in adding a C instantiation. Change-Id: I9eafd7384261bc29675385e7ed938b75972d5218 --- test/frame_resize_test.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc index b0bcb7b978..7a4da45973 100644 --- a/test/frame_resize_test.cc +++ b/test/frame_resize_test.cc @@ -29,13 +29,6 @@ const int kIters = 1000; typedef tuple<int, int> FrameDimension; -// Resolutions (width x height) to be tested for resizing. -const FrameDimension kFrameDim[] = { - make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080), - make_tuple(1280, 720), make_tuple(640, 480), make_tuple(640, 360), - make_tuple(256, 256), -}; - // Check that two 8-bit output buffers are identical. void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width, int height) { @@ -143,6 +136,15 @@ TEST_P(AV1ResizeYTest, RunTest) { RunTest(); } TEST_P(AV1ResizeYTest, DISABLED_SpeedTest) { SpeedTest(); } +#if HAVE_AVX2 || HAVE_SSE2 +// Resolutions (width x height) to be tested for resizing. 
+const FrameDimension kFrameDim[] = {
+  make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080),
+  make_tuple(1280, 720),  make_tuple(640, 480),   make_tuple(640, 360),
+  make_tuple(256, 256),
+};
+#endif
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1ResizeYTest,
-- 
GitLab


From d97381f8656a8a16d9be29f89378bc7590bb7d02 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Thu, 2 May 2024 11:35:58 -0700
Subject: [PATCH 098/391] rtc: Palette mode for nonrd intra pickmode

Add the palette mode to the nonrd intra pickmode, and disable
hybrid_intra_pickmode for screen mode with speed >= 10.
Increase the dist_thresh to test palette for speed 11.

This makes key frame encoding much faster for screen:
~2x faster with little quality loss.

Bug: b/337757868
Change-Id: Iffc5f6a83615d7901e90917f6adb586d89a24879
---
 av1/encoder/nonrd_pickmode.c | 36 ++++++++++++++++++++++++++++++++++++
 av1/encoder/speed_features.c |  1 +
 test/svc_datarate_test.cc    |  6 +++---
 3 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index dcaf13f1d9..6a734cbd1c 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -1648,6 +1648,42 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
     }
   }
 
+  const int64_t thresh_dist = cpi->sf.rt_sf.prune_palette_nonrd ? 80000 : 20000;
+  const int64_t best_dist_norm = best_rdc.dist >> (b_width_log2_lookup[bsize] +
+                                                   b_height_log2_lookup[bsize]);
+
+  // Try palette if it's enabled.
+  bool try_palette =
+      best_dist_norm > thresh_dist && cpi->oxcf.tool_cfg.enable_palette &&
+      bsize <= BLOCK_16X16 && x->source_variance > 200 &&
+      av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                        mi->bsize);
+  if (try_palette) {
+    const TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+    const unsigned int intra_ref_frame_cost = 0;
+    // Search palette mode for Luma plane in intra frame.
+    av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx,
+                                 &this_rdc, best_rdc.rdcost);
+    // Update best mode data.
+    if (this_rdc.rdcost < best_rdc.rdcost &&
+        this_rdc.rate < (3 * (best_rdc.rate >> 1))) {
+      best_mode = DC_PRED;
+      mi->mv[0].as_int = INVALID_MV;
+      mi->mv[1].as_int = INVALID_MV;
+      best_rdc.rate = this_rdc.rate;
+      best_rdc.dist = this_rdc.dist;
+      best_rdc.rdcost = this_rdc.rdcost;
+      if (!this_rdc.skip_txfm) {
+        memcpy(ctx->blk_skip, txfm_info->blk_skip,
+               sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+      }
+      if (xd->tx_type_map[0] != DCT_DCT)
+        av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    } else {
+      av1_zero(mi->palette_mode_info);
+    }
+  }
+
   mi->mode = best_mode;
   // Keep DC for UV since mode test is based on Y channel only.
mi->uv_mode = UV_DC_PRED; diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index b0ab7feb3b..25c0b3af41 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -1578,6 +1578,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->rt_sf.part_early_exit_zeromv = 1; sf->rt_sf.nonrd_aggressive_skip = 1; sf->rt_sf.thresh_active_maps_skip_lf_cdef = 90; + sf->rt_sf.hybrid_intra_pickmode = 0; } if (speed >= 11) { sf->rt_sf.skip_lf_screen = 2; diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index cc3fb674b3..28f795cf2a 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -986,7 +986,7 @@ class DatarateTestSVC ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 60); - const int bitrate_array[2] = { 800, 1200 }; + const int bitrate_array[2] = { 1000, 1500 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); screen_mode_ = 1; @@ -997,9 +997,9 @@ class DatarateTestSVC target_layer_bitrate_[2] = cfg_.rc_target_bitrate; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { - ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.50) + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.40) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.7) + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 2.0) << " The datarate for the file is greater than target by too much!"; } // Top temporal layers are non_reference, so exlcude them from -- GitLab From fb21617c1f3ef49795597e006b68adfba6e54be0 Mon Sep 17 00:00:00 2001 From: George Steed <george.steed@arm.com> Date: Sat, 4 May 2024 13:20:42 +0100 Subject: [PATCH 099/391] cpu.cmake: Address issues in SVE feature tests A test to check that SVE registers were correctly handled as function parameters was added in 5ccdc66ab6eb8eb300eda854fab4ff250b2c2f92, however this appears to have a couple of issues: * Semicolons need to be escaped, else the compiler fails to compile due to invalid syntax. We can fix this by prefixing each semicolon with a backslash. * The "other" function does not have a definition so the test program will always fail to link even if it compiles to an object file. We can work around this by instructing CMake to only try compiling up to a static library rather than a full executable. Change-Id: Ic37280d4b42b9031e68bed8a4b24c0eb51491827 --- build/cmake/cpu.cmake | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake index e16e9ec6a5..8d0acf3d2b 100644 --- a/build/cmake/cpu.cmake +++ b/build/cmake/cpu.cmake @@ -49,7 +49,9 @@ if("${AOM_TARGET_CPU}" STREQUAL "arm64") # SVE and SVE2 require that the Neon-SVE bridge header is also available. if(ENABLE_SVE OR ENABLE_SVE2) set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + set(OLD_CMAKE_TRY_COMPILE_TARGET_TYPE ${CMAKE_TRY_COMPILE_TARGET_TYPE}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_SVE_FLAG}") + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) aom_check_source_compiles("arm_neon_sve_bridge_available" " #ifndef __ARM_NEON_SVE_BRIDGE #error 1 @@ -61,12 +63,13 @@ if("${AOM_TARGET_CPU}" STREQUAL "arm64") # to fail this, see https://github.com/llvm/llvm-project/issues/80009. 
    aom_check_source_compiles("arm_sve_preserve" "
 #include <arm_sve.h>
-void other(void);
+void other(void)\;
 
 svfloat32_t func(svfloat32_t a) {
-  other();
-  return a;
+  other()\;
+  return a\;
 }" CAN_COMPILE_SVE)
     set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS})
+    set(CMAKE_TRY_COMPILE_TARGET_TYPE ${OLD_CMAKE_TRY_COMPILE_TARGET_TYPE})
     if(HAVE_SVE_HEADERS EQUAL 0 OR CAN_COMPILE_SVE EQUAL 0)
       set(ENABLE_SVE 0)
       set(ENABLE_SVE2 0)
-- 
GitLab


From dadb003877190556961f36e67d52495cc1742a54 Mon Sep 17 00:00:00 2001
From: Salome Thirot <salome.thirot@arm.com>
Date: Wed, 24 Apr 2024 17:18:24 +0100
Subject: [PATCH 100/391] Add Neon Dotprod implementation of av1_convolve_y_sr
 for 12-tap

Add an Armv8.4 implementation of av1_convolve_y_sr for 12-tap filters.
This gives between 20 and 50% uplift over the Armv8.0 implementation.

Change-Id: Icd163f4f4c5c56b899d268b91d79b5733724eab4
---
 av1/common/arm/convolve_neon_dotprod.c | 345 +++++++++++++++++++++++++
 av1/common/av1_rtcd_defs.pl            |   2 +-
 test/av1_convolve_test.cc              |   5 +
 3 files changed, 351 insertions(+), 1 deletion(-)

diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
index d670657f84..bf945c6fa8 100644
--- a/av1/common/arm/convolve_neon_dotprod.c
+++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -27,6 +27,15 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
   8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 };
 
+DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
+  // Shift left and insert new last column in transposed 4x4 block.
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  // Shift left and insert two new columns in transposed 4x4 block.
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  // Shift left and insert three new columns in transposed 4x4 block.
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, const int8x16_t filter, const int32x4_t correction, @@ -421,6 +430,342 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, } while (h != 0); } +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; + + *b = vreinterpretq_s8_s16(a0123); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8x2_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); + + *b0 = vreinterpretq_s8_s16(a0123.val[0]); + *b1 = vreinterpretq_s8_s16(a0123.val[1]); +} + +static INLINE int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1, + const int8x16_t s2, + const int8x8_t filters_0_7, + const int8x8_t filters_4_11) { + // The sample range transform and permutation are performed by the caller. + // Accumulate into 128 << FILTER_BITS to account for range transform. + const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + int32x4_t sum = vdotq_lane_s32(acc, s0, filters_0_7, 0); + sum = vdotq_lane_s32(sum, s1, filters_0_7, 1); + sum = vdotq_lane_s32(sum, s2, filters_4_11, 1); + + // Further narrowing and packing is performed by the caller. + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve12_8_y( + const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo, + const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi, + const int8x8_t filters_0_7, const int8x8_t filters_4_11) { + // The sample range transform and permutation are performed by the caller. + // Accumulate into 128 << FILTER_BITS to account for range transform. 
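+  // (The filter taps sum to 1 << FILTER_BITS = 128 and every sample was
+  // biased by -128, so the dot product is offset by exactly -128 * 128;
+  // seeding the accumulator with 128 << FILTER_BITS cancels that offset.)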
+ const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + + int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters_0_7, 0); + sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); + sum0123 = vdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); + + int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters_0_7, 0); + sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); + sum4567 = vdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE void convolve_y_sr_12tap_neon_dotprod( + const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr) { + // Special case the following no-op filter as 128 won't fit into the + // 8-bit signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (y_filter_ptr[5] == 128) { + // Undo the vertical offset in the calling function. + src_ptr += 5 * src_stride; + + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t d0 = vld1_u8(s); + if (w == 4) { + store_u8_4x1(d, d0); + } else { + vst1_u8(d, d0); + } + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; + load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, + &t8, &t9, &tA); + src_ptr += 11 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); + + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_4x4(s4, s5, s6, s7, &s4567); + transpose_concat_4x4(s5, s6, s7, s8, &s5678); + transpose_concat_4x4(s6, s7, s8, s9, &s6789); + transpose_concat_4x4(s7, s8, s9, sA, &s789A); + + do { + uint8x8_t tB, tC, tD, tE; + load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE); + + int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); + int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); + int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); + int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); + + int8x16_t s89AB, s9ABC, sABCD, sBCDE; + transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + + // Merge new data into block from previous iteration. 
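+        // (vqtbl2q_s8 indexes the 32-byte pair {s789A, sBCDE}: table indices
+        // 0-15 select bytes of the old block and 16-31 bytes of the new one,
+        // so the intermediate shifted windows s89AB/s9ABC/sABCD are built
+        // without re-transposing already-loaded rows.)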
+ int8x16x2_t samples_LUT = { { s789A, sBCDE } }; + s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = + convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); + int16x4_t d1 = + convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); + int16x4_t d2 = + convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); + int16x4_t d3 = + convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s789A; + s4567 = s89AB; + s5678 = s9ABC; + s6789 = sABCD; + s789A = sBCDE; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; + load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, + &t9, &tA); + s += 11 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. 
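+        // (Each transposed 4x4 block covers only four output columns, so
+        // 8-wide rows keep every window as a _lo/_hi pair covering output
+        // columns 0-3 and 4-7 respectively.)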
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, + s6789_lo, s6789_hi, s789A_lo, s789A_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); + + do { + uint8x8_t tB, tC, tD, tE; + load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); + + int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); + int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); + int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); + int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); + + int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); + + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); + + int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); + uint8x8_t d1 = + convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); + uint8x8_t d2 = + convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); + uint8x8_t d3 = + convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
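+          // (Each iteration loads only four new source rows; the eight stored
+          // windows then slide four rows forward via these register moves.)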
+ s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } + } +} + +void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + if (w == 2 || h == 2) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, + subpel_y_qn); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (y_filter_taps <= 8) { + av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); + return; + } + + const int vert_offset = y_filter_taps / 2 - 1; + src -= vert_offset * src_stride; + + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + convolve_y_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); +} + static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, const int8x16_t filters, const int32x4_t correction, diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 7035fb3bdd..59d70f0e81 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -604,7 +604,7 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_convolve_2d_sr_intrabc neon/; specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_convolve_x_sr_intrabc neon/; - specialize qw/av1_convolve_y_sr sse2 avx2 neon/; + specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod/; specialize qw/av1_convolve_y_sr_intrabc neon/; specialize qw/av1_convolve_2d_scale sse4_1/; specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/; diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index b2392276cc..96c060349a 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -827,6 +827,11 @@ INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYTest, BuildLowbdParams(av1_convolve_y_sr_neon)); #endif +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AV1ConvolveYTest, + BuildLowbdParams(av1_convolve_y_sr_neon_dotprod)); +#endif + //////////////////////////////////////////////////////////////// // Single reference convolve-y IntraBC functions (low bit-depth) //////////////////////////////////////////////////////////////// -- GitLab From 766e37a42fff20511a81f7a0300eaf968e078ce6 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 25 Apr 2024 10:41:28 +0100 Subject: [PATCH 101/391] Add 8-tap path for av1_convolve_y_sr_neon_dotprod Add 8-tap specialisation for av1_convolve_y_sr_neon_dotprod. This gives around 10-20% uplift over the Neon implementation. 
Change-Id: I913df3892ab47dc0ee1d0f28ad62de861bb4320d --- av1/common/arm/convolve_neon_dotprod.c | 196 ++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 3 deletions(-) diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index bf945c6fa8..393f2e81f9 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c @@ -738,6 +738,190 @@ static INLINE void convolve_y_sr_12tap_neon_dotprod( } } +static INLINE int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + // Accumulate into 128 << FILTER_BITS to account for range transform. + const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0); + sum = vdotq_lane_s32(sum, s1, filters, 1); + + // Further narrowing and packing is performed by the caller. + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_y(const int8x16_t s0_lo, + const int8x16_t s0_hi, + const int8x16_t s1_lo, + const int8x16_t s1_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + // Accumulate into 128 << FILTER_BITS to account for range transform. + const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + + int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters, 0); + sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters, 1); + + int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters, 0); + sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE void convolve_y_sr_8tap_neon_dotprod( + const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr) { + const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src_ptr += 7 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + int8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src_ptr, src_stride, &t7, &t8, &t9, &t10); + + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); + + int8x16_t s4567, s5678, s6789, s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); + + // Merge new data into block from previous iteration. 
+ int8x16x2_t samples_LUT = { { s3456, s78910 } }; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); + + int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + + // Merge new data into block from previous iteration. 
+        int8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } };
+        s4567_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);
+
+        int8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } };
+        s4567_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);
+
+        uint8x8_t d0 =
+            convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter);
+        uint8x8_t d1 =
+            convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter);
+        uint8x8_t d2 =
+            convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
+        uint8x8_t d3 =
+            convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        // Prepare block for next iteration - re-using as much as possible.
+        // Shuffle everything up four rows.
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
 void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_y,
@@ -750,7 +934,7 @@ void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride,
 
   const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
 
-  if (y_filter_taps <= 8) {
+  if (y_filter_taps <= 6) {
     av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h,
                            filter_params_y, subpel_y_qn);
     return;
@@ -762,8 +946,14 @@ void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride,
   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
-  convolve_y_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
-                                   y_filter_ptr);
+  if (y_filter_taps > 8) {
+    convolve_y_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
+                                     y_filter_ptr);
+    return;
+  }
+
+  convolve_y_sr_8tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
+                                  y_filter_ptr);
 }
 
 static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
                                           const int8x16_t filters,
                                           const int32x4_t correction,
-- 
GitLab


From a09494a4c34c2f6d7bbd503af26bfa98dc02d93c Mon Sep 17 00:00:00 2001
From: Salome Thirot <salome.thirot@arm.com>
Date: Thu, 25 Apr 2024 14:24:04 +0100
Subject: [PATCH 102/391] Add Neon I8MM implementation of av1_convolve_y_sr
 for 12-tap

Add an Armv8.6 implementation of av1_convolve_y_sr for 12-tap filters.
This gives an extra 10-20% uplift over the Neon Dotprod
implementation.
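
(For context: one source of the uplift is that the I8MM USDOT
instruction multiplies unsigned samples by signed filter taps directly,
removing the sample bias and the nonzero starting accumulator the
Dotprod version needs. A minimal sketch of the difference, with
hypothetical variable names:

  // Armv8.4 Dotprod: signed-only dot product over biased samples.
  int32x4_t sum_sdot =
      vdotq_lane_s32(correction, samples_minus_128, filter_s8, 0);
  // Armv8.6 I8MM: mixed-sign dot product, accumulated from zero.
  int32x4_t sum_usdot =
      vusdotq_lane_s32(vdupq_n_s32(0), samples_u8, filter_s8, 0);
)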
Change-Id: Iea303c9cb5c6ac04e28d228cc44aef0808e5eeb2 --- av1/common/arm/convolve_neon_i8mm.c | 307 ++++++++++++++++++++++++++++ av1/common/av1_rtcd_defs.pl | 2 +- test/av1_convolve_test.cc | 5 + 3 files changed, 313 insertions(+), 1 deletion(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index 3fe2c98d6b..8543190cbd 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -27,6 +27,15 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, const int8x16_t filter, const uint8x16x3_t permute_tbl, @@ -375,6 +384,304 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, } while (h != 0); } +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; + + *b = vreinterpretq_u8_u16(a0123); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8x2_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); + + *b0 = vreinterpretq_u8_u16(a0123.val[0]); + *b1 = vreinterpretq_u8_u16(a0123.val[1]); +} + +static INLINE int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, + const int8x8_t filters_0_7, + const int8x8_t filters_4_11) { + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters_0_7, 0); + sum = vusdotq_lane_s32(sum, s1, filters_0_7, 1); + sum = vusdotq_lane_s32(sum, s2, filters_4_11, 1); + + // Further narrowing and packing is performed by 
the caller. + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve12_8_y( + const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo, + const uint8x16_t s1_hi, const uint8x16_t s2_lo, const uint8x16_t s2_hi, + const int8x8_t filters_0_7, const int8x8_t filters_4_11) { + int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters_0_7, 0); + sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); + sum0123 = vusdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); + + int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters_0_7, 0); + sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); + sum4567 = vusdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *y_filter_ptr) { + // Special case the following no-op filter as 128 won't fit into the + // 8-bit signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (y_filter_ptr[5] == 128) { + // Undo the vertical offset in the calling function. + src_ptr += 5 * src_stride; + + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t d0 = vld1_u8(s); + if (w == 4) { + store_u8_4x1(d, d0); + } else { + vst1_u8(d, d0); + } + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, + &s8, &s9, &sA); + src_ptr += 11 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_4x4(s4, s5, s6, s7, &s4567); + transpose_concat_4x4(s5, s6, s7, s8, &s5678); + transpose_concat_4x4(s6, s7, s8, s9, &s6789); + transpose_concat_4x4(s7, s8, s9, sA, &s789A); + + do { + uint8x8_t sB, sC, sD, sE; + load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE); + + uint8x16_t s89AB, s9ABC, sABCD, sBCDE; + transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + + // Merge new data into block from previous iteration. 
+ uint8x16x2_t samples_LUT = { { s789A, sBCDE } }; + s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = + convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); + int16x4_t d1 = + convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); + int16x4_t d2 = + convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); + int16x4_t d3 = + convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s789A; + s4567 = s89AB; + s5678 = s9ABC; + s6789 = sABCD; + s789A = sBCDE; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, + s6789_lo, s6789_hi, s789A_lo, s789A_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); + + do { + uint8x8_t sB, sC, sD, sE; + load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE); + + uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); + + // Merge new data into block from previous iteration. 
+ uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); + + uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); + uint8x8_t d1 = + convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); + uint8x8_t d2 = + convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); + uint8x8_t d3 = + convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } + } +} + +void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + if (w == 2 || h == 2) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, + subpel_y_qn); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (y_filter_taps <= 8) { + av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); + return; + } + + const int vert_offset = y_filter_taps / 2 - 1; + src -= vert_offset * src_stride; + + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + convolve_y_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); +} + static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, const int8x16_t filters, const uint8x16x3_t permute_tbl, diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 59d70f0e81..eca260cce5 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -604,7 +604,7 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_convolve_2d_sr_intrabc neon/; specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_convolve_x_sr_intrabc neon/; - specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod/; + specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_convolve_y_sr_intrabc neon/; specialize qw/av1_convolve_2d_scale sse4_1/; specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/; diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 96c060349a..26a4b5e5d5 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -832,6 
+832,11 @@ INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AV1ConvolveYTest, BuildLowbdParams(av1_convolve_y_sr_neon_dotprod)); #endif +#if HAVE_NEON_I8MM +INSTANTIATE_TEST_SUITE_P(NEON_I8MM, AV1ConvolveYTest, + BuildLowbdParams(av1_convolve_y_sr_neon_i8mm)); +#endif + //////////////////////////////////////////////////////////////// // Single reference convolve-y IntraBC functions (low bit-depth) //////////////////////////////////////////////////////////////// -- GitLab From 02d490b4554ede1c0b84930ff4b41422d57f2a32 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 25 Apr 2024 14:25:06 +0100 Subject: [PATCH 103/391] Add 8-tap path for av1_convolve_y_sr_neon_i8mm Add 8-tap specialisation for av1_convolve_y_sr_neon_i8mm. This gives an extra 10-25% uplift over the Neon Dotprod implementation. Change-Id: Icf52b0ed7f7fb8eb55d6127fa215acf10d1e2e0f --- av1/common/arm/convolve_neon_i8mm.c | 193 +++++++++++++++++++++++++++- 1 file changed, 190 insertions(+), 3 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index 8543190cbd..f8b11eb358 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -654,6 +654,188 @@ static INLINE void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr, } } +static INLINE int16x4_t convolve8_4_y(const uint8x16_t s0, const uint8x16_t s1, + const int8x8_t filters) { + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0); + sum = vusdotq_lane_s32(sum, s1, filters, 1); + + // Further narrowing and packing is performed by the caller. + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_y(const uint8x16_t s0_lo, + const uint8x16_t s0_hi, + const uint8x16_t s1_lo, + const uint8x16_t s1_hi, + const int8x8_t filters) { + int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters, 0); + sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters, 1); + + int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters, 0); + sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *y_filter_ptr) { + // Special case the following no-op filter as 128 won't fit into the + // 8-bit signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (y_filter_ptr[5] == 128) { + // Undo the vertical offset in the calling function. + src_ptr += 5 * src_stride; + + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t d0 = vld1_u8(s); + if (w == 4) { + store_u8_4x1(d, d0); + } else { + vst1_u8(d, d0); + } + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
+ uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + + do { + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + uint8x16_t s4567, s5678, s6789, s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + + do { + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } }; + s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); + + uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } }; + s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } + } +} + void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, @@ -666,7 +848,7 @@ void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); - if (y_filter_taps <= 8) { + if (y_filter_taps <= 6) { av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); return; @@ -678,8 +860,13 @@ void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); - convolve_y_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, - y_filter_ptr); + if (y_filter_taps > 8) { + convolve_y_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); + return; + } + convolve_y_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); } static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, -- GitLab From 8c231e78ca6dd81faeb0e45e582718d4f90a58a0 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 7 May 2024 17:30:37 -0700 Subject: [PATCH 104/391] Add a test that reproduces bug oss-fuzz:68195 Bug: oss-fuzz:68195 Change-Id: I2fa4343a61c8e40f5ceace9d59bba5f94bab7989 --- test/wiener_test.cc | 91 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/test/wiener_test.cc b/test/wiener_test.cc index 3f9ce9ff41..90cf6bbf87 100644 --- a/test/wiener_test.cc +++ b/test/wiener_test.cc @@ -1719,6 +1719,97 @@ TEST(SearchWienerTest, 8bitSignedIntegerOverflowInUpdateBSepSym) { EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); } +// A test that reproduces crbug.com/oss-fuzz/68195: signed integer overflow in +// linsolve_wiener(). 
+TEST(SearchWienerTest, DISABLED_8bitSignedIntegerOverflowInLinsolveWiener) { + constexpr int kWidth = 4; + constexpr int kHeight = 3; + constexpr unsigned char kBuffer[kWidth * kHeight] = { + // Y plane: + 50, 167, 190, 194, 27, 29, 204, 182, 133, 239, 64, 179, + }; + unsigned char *img_data = const_cast<unsigned char *>(kBuffer); + + aom_image_t img; + EXPECT_EQ(&img, + aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1, img_data)); + img.cp = AOM_CICP_CP_UNSPECIFIED; + img.tc = AOM_CICP_TC_UNSPECIFIED; + img.mc = AOM_CICP_MC_UNSPECIFIED; + img.monochrome = 1; + img.csp = AOM_CSP_UNKNOWN; + img.range = AOM_CR_FULL_RANGE; + img.planes[1] = img.planes[2] = nullptr; + img.stride[1] = img.stride[2] = 0; + + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY)); + cfg.rc_end_usage = AOM_Q; + cfg.g_profile = 0; + cfg.g_bit_depth = AOM_BITS_8; + cfg.g_input_bit_depth = 8; + cfg.g_w = kWidth; + cfg.g_h = kHeight; + cfg.g_threads = 32; + cfg.monochrome = 1; + cfg.rc_min_quantizer = 50; + cfg.rc_max_quantizer = 57; + aom_codec_ctx_t enc; + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 53)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_TILE_ROWS, 1)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_TILE_COLUMNS, 1)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM)); + + // Encode frame + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0)); + aom_codec_iter_t iter = nullptr; + const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_EQ(pkt, nullptr); + + // Encode frame + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Flush encoder + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x1f0011. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Flush encoder + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x0. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Flush encoder + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); +} + // A test that reproduces b/259173819: signed integer overflow in // linsolve_wiener(). 
TEST(SearchWienerTest, 10bitSignedIntegerOverflowInLinsolveWiener) { -- GitLab From a7479a950e02083acf20037b44ed2bb4b052cb34 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 8 May 2024 11:02:00 -0700 Subject: [PATCH 105/391] av1_set_active_map: assert num_samples != 0 This quiets a static analysis warning. mi_rows and mi_cols are always non-zero, so num_samples will be at least 1. Change-Id: I1f77a84e4789d4bc4c1a2feb264b1d1b494a1ca8 --- av1/encoder/encoder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 85d980bae9..093eabc075 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -179,6 +179,7 @@ int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, } cpi->active_map.enabled = 1; cpi->active_map.update = 1; + assert(num_samples); cpi->rc.percent_blocks_inactive = (num_blocks_inactive * 100) / num_samples; } -- GitLab From 3944d9b3b2accec4cf2b442842f99a9a915f0772 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Wed, 8 May 2024 10:58:55 -0700 Subject: [PATCH 106/391] rtc: Adjustments for nonrd keyframe for screen For speed 11 keyframes in screen mode: lower the dist_thresh, and include some pruning of the H/V/SMOOTH modes based on existing speed features. This brings some quality back with little speed loss. Change-Id: I94b8a6429c6f9feca9c0e493a336a0b0d594bc7e --- av1/encoder/nonrd_pickmode.c | 5 ++--- av1/encoder/speed_features.c | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 6a734cbd1c..4032d942fb 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -1648,7 +1648,7 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, } } - const int64_t thresh_dist = cpi->sf.rt_sf.prune_palette_nonrd ? 80000 : 20000; + const int64_t thresh_dist = cpi->sf.rt_sf.prune_palette_nonrd ? 60000 : 20000; const int64_t best_dist_norm = best_rdc.dist >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); @@ -1665,8 +1665,7 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx, &this_rdc, best_rdc.rdcost); // Update best mode data. - if (this_rdc.rdcost < best_rdc.rdcost && - this_rdc.rate < (3 * (best_rdc.rate >> 1))) { + if (this_rdc.rdcost < best_rdc.rdcost) { best_mode = DC_PRED; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index 25c0b3af41..4ddd4903a8 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -1586,6 +1586,9 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->rt_sf.part_early_exit_zeromv = 2; sf->rt_sf.prune_palette_nonrd = 1; sf->rt_sf.increase_color_thresh_palette = 0; + sf->rt_sf.prune_h_pred_using_best_mode_so_far = true; + sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true; + sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true; } sf->rt_sf.skip_newmv_flat_blocks_screen = 1; sf->rt_sf.use_idtx_nonrd = 1; -- GitLab From 085fe9c36385f78a53362d2f4f8e12dd4670360e Mon Sep 17 00:00:00 2001 From: Samuthirika S <samuthirika.s@ittiam.com> Date: Wed, 8 May 2024 22:03:08 +0530 Subject: [PATCH 107/391] Fix unaligned store issue in av1_resize_horz_dir_avx2() This CL fixes the unaligned store issue in `av1_resize_horz_dir_avx2()` by using `xx_storel_32`. 
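For background on the fix: storing through a cast such as `*(int *)ptr = value`
tells the compiler the address is 4-byte aligned, which is what the sanitizer
flags here. A memcpy-based helper performs the same single 32-bit store with no
alignment assumption. The sketch below shows the general shape of such a helper
(an illustration of the idiom only; the actual xx_storel_32() definition in
synonyms.h may differ):

  #include <emmintrin.h>
  #include <string.h>

  /* Store the low 32 bits of v to a possibly unaligned address. The memcpy
   * still compiles down to a single 32-bit store, but carries no alignment
   * assumption for the sanitizer to flag. */
  static inline void store_low_32(void *a, __m128i v) {
    const int val = _mm_cvtsi128_si32(v);
    memcpy(a, &val, sizeof(val));
  }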
BUG=aomedia:3570 Change-Id: I98138d19a39e000b650e4f64fa950a0f583b3b4b --- av1/common/x86/resize_avx2.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c index f0421703c6..38bbc2626d 100644 --- a/av1/common/x86/resize_avx2.c +++ b/av1/common/x86/resize_avx2.c @@ -689,10 +689,9 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel); res_out_row01 = _mm256_max_epu8(res_out_row01, zero); - *((int *)(intbuf + out_idx)) = - _mm_cvtsi128_si32(CAST_LOW(res_out_row01)); - *((int *)(intbuf + out_idx + dst_stride)) = - _mm_cvtsi128_si32(_mm256_extracti128_si256(res_out_row01, 1)); + xx_storel_32(intbuf + out_idx, CAST_LOW(res_out_row01)); + xx_storel_32(intbuf + out_idx + dst_stride, + _mm256_extracti128_si256(res_out_row01, 1)); } wd_processed = filtered_length - remain_col; -- GitLab From c612b961347bce8c3a9890de14ed5726dc219d83 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Wed, 8 May 2024 23:46:26 -0700 Subject: [PATCH 108/391] rtc: Use best_sad for pruning palette in nonrd key Affects screen mode for speed >= 10: use the best_sad to skip palette testing, instead of best_dist. This is more consistent and improves in particular lower Q behavior for all intra coding. Change-Id: I832d61f6a6a4fbe39e6978556463bfab08784011 --- av1/encoder/nonrd_pickmode.c | 12 +++++++----- av1/encoder/speed_features.c | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 4032d942fb..ed64056e40 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -1648,14 +1648,16 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, } } - const int64_t thresh_dist = cpi->sf.rt_sf.prune_palette_nonrd ? 60000 : 20000; - const int64_t best_dist_norm = best_rdc.dist >> (b_width_log2_lookup[bsize] + - b_height_log2_lookup[bsize]); + const unsigned int thresh_sad = cpi->sf.rt_sf.prune_palette_nonrd ? 100 : 20; + const unsigned int best_sad_norm = + args.best_sad >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); // Try palette if it's enabled. 
bool try_palette =
+      (!args.prune_mode_based_on_sad || best_sad_norm > thresh_sad) &&
+      cpi->oxcf.tool_cfg.enable_palette && bsize <= BLOCK_16X16 &&
+      x->source_variance > 200 &&
       av1_allow_palette(cpi->common.features.allow_screen_content_tools,
                         mi->bsize);
   if (try_palette) {
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 4ddd4903a8..671986600b 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1579,6 +1579,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
       sf->rt_sf.nonrd_aggressive_skip = 1;
       sf->rt_sf.thresh_active_maps_skip_lf_cdef = 90;
       sf->rt_sf.hybrid_intra_pickmode = 0;
+      sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true;
     }
     if (speed >= 11) {
       sf->rt_sf.skip_lf_screen = 2;
@@ -1588,7 +1589,6 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
       sf->rt_sf.increase_color_thresh_palette = 0;
       sf->rt_sf.prune_h_pred_using_best_mode_so_far = true;
       sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true;
-      sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true;
     }
     sf->rt_sf.skip_newmv_flat_blocks_screen = 1;
     sf->rt_sf.use_idtx_nonrd = 1;
-- 
GitLab


From d9ab67e87cddd933fff3030013d4f88a30c84f91 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Wed, 8 May 2024 17:05:57 -0700
Subject: [PATCH 109/391] Add a test that reproduces bug b:330639949

Bug: b:330639949
Change-Id: Idd74a30bfeda4baa9d0da6db4c24b29816237d2f
---
 test/wiener_test.cc | 146 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 146 insertions(+)

diff --git a/test/wiener_test.cc b/test/wiener_test.cc
index 90cf6bbf87..d018d8021b 100644
--- a/test/wiener_test.cc
+++ b/test/wiener_test.cc
@@ -1887,5 +1887,151 @@ TEST(SearchWienerTest, 10bitSignedIntegerOverflowInLinsolveWiener) {
   EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
 }
 
+// A test that reproduces b/330639949: signed integer overflow in
+// linsolve_wiener().
+TEST(SearchWienerTest, DISABLED_12bitSignedIntegerOverflowInLinsolveWiener) {
+  constexpr int kWidth = 173;
+  constexpr int kHeight = 3;
+  // Since the image format is YUV 4:2:0, aom_img_wrap() expects the buffer to
+  // be allocated with width and height aligned to a multiple of 2. Align the
+  // width to a multiple of 2 so that the stride set by aom_img_wrap() is
+  // correct. It is not necessary to align the height to a multiple of 2
+  // because aom_codec_encode() will only read cfg.g_h rows.
+ static constexpr uint16_t kBuffer[(kWidth + 1) * kHeight] = { + // Y plane: + // Row: + 0, 0, 369, 0, 4095, 873, 4095, 4095, 0, 571, 4023, 0, 1028, 58, 556, 0, 0, + 1875, 16, 1043, 4095, 0, 1671, 1990, 0, 4095, 2932, 3117, 4095, 0, 0, 0, + 4095, 4095, 4095, 4095, 4095, 4095, 508, 4095, 0, 0, 4095, 4095, 4095, 0, + 4095, 4095, 0, 197, 4095, 1475, 1127, 4095, 0, 1570, 1881, 4095, 1215, 4095, + 0, 0, 1918, 4095, 0, 4095, 3415, 0, 732, 122, 1087, 0, 0, 0, 0, 0, 1012, + 4095, 0, 4095, 4095, 0, 0, 4095, 1931, 4095, 0, 4095, 4095, 4095, 4095, 570, + 4095, 4095, 0, 2954, 0, 0, 0, 1925, 3802, 0, 4095, 55, 0, 4095, 760, 4095, + 0, 3313, 4095, 4095, 4095, 0, 218, 799, 4095, 0, 4095, 2455, 4095, 0, 0, + 611, 4095, 3060, 1669, 0, 0, 4095, 3589, 3903, 0, 3427, 1903, 0, 4095, 3789, + 4095, 4095, 107, 2064, 4095, 2764, 4095, 0, 0, 0, 3498, 0, 0, 1336, 4095, + 4095, 3480, 0, 545, 673, 4095, 0, 4095, 4095, 3175, 4095, 1623, 4095, 0, + 540, 4095, 4095, 14, 429, 0, 0, + // Row: + 0, 4095, 4095, 0, 1703, 3003, 968, 1313, 4095, 613, 4095, 3918, 112, 4095, + 0, 4095, 2211, 88, 4051, 1203, 2005, 4095, 4095, 0, 2106, 0, 4095, 0, 4095, + 4095, 4095, 0, 3261, 0, 4095, 0, 1184, 4095, 4095, 818, 4095, 0, 4095, 1292, + 4095, 0, 4095, 4095, 0, 4095, 4095, 0, 0, 346, 906, 974, 4095, 4095, 4095, + 4095, 0, 4095, 3225, 2547, 4095, 0, 0, 2705, 2933, 4095, 0, 0, 3579, 0, + 4095, 4095, 4095, 1872, 4095, 298, 2961, 0, 0, 2805, 0, 0, 1210, 3773, 0, + 1208, 3347, 0, 4095, 0, 0, 0, 4034, 4095, 0, 0, 4095, 0, 0, 0, 3302, 0, 0, + 0, 0, 0, 4095, 4095, 0, 2609, 4095, 0, 1831, 4095, 0, 2463, 4095, 4095, + 4095, 4095, 752, 4095, 4095, 41, 1829, 2975, 227, 2505, 2719, 1059, 4071, + 4095, 4095, 3859, 0, 0, 0, 0, 4095, 2423, 4095, 4095, 4095, 4095, 4095, + 1466, 0, 0, 4095, 121, 0, 0, 4095, 0, 0, 3328, 4095, 4095, 0, 1172, 0, 2938, + 0, 4095, 0, 0, 0, 4095, 1821, 0, + // Row: + 4095, 4095, 4095, 4095, 3487, 4095, 0, 0, 0, 3367, 4095, 4095, 1139, 4095, + 4095, 169, 1300, 1840, 4095, 3508, 4095, 618, 4095, 4095, 4095, 53, 4095, + 4095, 4095, 4095, 4055, 0, 0, 0, 4095, 4095, 0, 0, 0, 0, 1919, 2415, 1485, + 458, 4095, 4095, 3176, 4095, 0, 0, 4095, 4095, 617, 3631, 4095, 4095, 0, 0, + 3983, 4095, 4095, 681, 1685, 4095, 4095, 0, 1783, 25, 4095, 0, 0, 4095, + 4095, 0, 2075, 0, 4095, 4095, 4095, 0, 773, 3407, 0, 4095, 4095, 0, 0, 4095, + 4095, 4095, 4095, 4095, 0, 0, 0, 0, 4095, 0, 1804, 0, 0, 3169, 3576, 502, 0, + 0, 4095, 0, 4095, 0, 4095, 4095, 4095, 0, 4095, 779, 0, 4095, 0, 0, 0, 4095, + 0, 0, 4095, 4095, 4095, 4095, 0, 0, 4095, 4095, 2134, 4095, 4020, 2990, + 3949, 4095, 4095, 4095, 4095, 4095, 0, 4095, 4095, 2829, 4095, 4095, 4095, + 0, 197, 2328, 3745, 0, 3412, 190, 4095, 4095, 4095, 2809, 3953, 0, 4095, + 1502, 2514, 3866, 0, 0, 4095, 4095, 1878, 129, 4095, 0 + }; + unsigned char *img_data = + reinterpret_cast<unsigned char *>(const_cast<uint16_t *>(kBuffer)); + + aom_image_t img; + EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I42016, kWidth, kHeight, 1, + img_data)); + img.cp = AOM_CICP_CP_UNSPECIFIED; + img.tc = AOM_CICP_TC_UNSPECIFIED; + img.mc = AOM_CICP_MC_UNSPECIFIED; + img.monochrome = 1; + img.csp = AOM_CSP_UNKNOWN; + img.range = AOM_CR_FULL_RANGE; + img.planes[1] = img.planes[2] = nullptr; + img.stride[1] = img.stride[2] = 0; + + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY)); + cfg.rc_end_usage = AOM_Q; + cfg.g_profile = 2; + cfg.g_bit_depth = AOM_BITS_12; + cfg.g_input_bit_depth = 12; + cfg.g_w = 
kWidth;
+  cfg.g_h = kHeight;
+  cfg.g_lag_in_frames = 0;
+  cfg.g_threads = 18;
+  cfg.monochrome = 1;
+  cfg.rc_min_quantizer = 0;
+  cfg.rc_max_quantizer = 51;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_HIGHBITDEPTH));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 25));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_TILE_ROWS, 4));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM));
+
+  // Encode frame
+  EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+  aom_codec_iter_t iter = nullptr;
+  const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0x1f0011.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Encode frame
+  EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0x20000.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Encode frame
+  EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0x20000.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Encode frame
+  EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0x20000.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Flush encoder
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
 }  // namespace wiener_highbd
 #endif  // CONFIG_AV1_HIGHBITDEPTH
-- 
GitLab


From f1b43b5c0d0c98a37713e9939a782ebe014c1d1f Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Thu, 9 May 2024 12:09:57 +0000
Subject: [PATCH 110/391] {,highbd_}intrapred_neon.c: Avoid over-reads in z1 and z3 preds

The existing z1 and z3 predictors already contain checks to see if the
first element of the vector would over-read; however, this is not
sufficient, since the vector may straddle the end of the input array.
To get around this, add an additional check against the end of the
array. If we would over-read, load a full vector up to the end of the
array and then use TBL to shuffle the data into the correct place. This
also means that we no longer need the compare and BSL at the end of
each loop iteration to select between the computed data and a duplicate
of the last element.
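In scalar terms, the masked load introduced below behaves like the
following model (a hypothetical helper written for illustration only;
the real implementation is the vector code in the diff): every lane at
or beyond the end of the array repeats the last valid element, which is
exactly the value the removed compare/BSL sequence used to select.

  #include <stdint.h>

  /* Scalar model of the masked load: lane i of the result reads
   * buf[min(base + i, max_base)]. The vector version gets the same
   * effect from one full vector load ending at max_base plus one TBL
   * lookup into kLoadMaxShuffles. */
  static inline void masked_load_model(const uint16_t *buf, int base,
                                       int max_base, uint16_t out[8]) {
    for (int i = 0; i < 8; ++i) {
      const int idx = base + i;
      out[i] = buf[idx < max_base ? idx : max_base];
    }
  }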
Bug: aomedia:3571 Change-Id: I03e2313b9bf0b44d64811fff1bedf4eb7381518a --- aom_dsp/arm/highbd_intrapred_neon.c | 108 ++++++++++++++++++++-------- aom_dsp/arm/intrapred_neon.c | 65 +++++++++++++---- 2 files changed, 133 insertions(+), 40 deletions(-) diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c index d5a0044d6c..eff773b85a 100644 --- a/aom_dsp/arm/highbd_intrapred_neon.c +++ b/aom_dsp/arm/highbd_intrapred_neon.c @@ -1293,6 +1293,33 @@ static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0, highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift)); } +// clang-format off +static const uint8_t kLoadMaxShuffles[] = { + 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, + 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, + 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; +// clang-format on + +static INLINE uint16x8_t zn_load_masked_neon(const uint16_t *ptr, + int shuffle_idx) { + uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]); + uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr)); +#if AOM_ARCH_AARCH64 + return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle)); +#else + uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } }; + uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle)); + uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle)); + return vreinterpretq_u16_u8(vcombine_u8(lo, hi)); +#endif +} + static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst, ptrdiff_t stride, int bw, int bh, @@ -1336,13 +1363,26 @@ static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst, } else { int c = 0; do { - const uint16x8_t a0 = vld1q_u16(&above[base + c]); - const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]); - const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift); - const uint16x8_t cmp = - vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8); - const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max)); - vst1q_u16(dst + c, res); + uint16x8_t a0; + uint16x8_t a1; + if (base + c >= max_base_x) { + a0 = a1 = vdupq_n_u16(above_max); + } else { + if (base + c + 7 >= max_base_x) { + int shuffle_idx = max_base_x - base - c; + a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx); + } else { + a0 = vld1q_u16(above + base + c); + } + if (base + c + 8 >= max_base_x) { + int shuffle_idx = max_base_x - base - c - 1; + a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx); + } else { + a1 = vld1q_u16(above + base + c + 1); + } + } + + vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift)); c += 8; } while (c < bw); } @@ -2456,13 +2496,29 @@ void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw, val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \ uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \ val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \ - const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base)); \ - const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \ - vrshrn_n_u32(val_hi, (shift))); \ - *(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res, \ - vdupq_n_u16(left_max)); \ + *(out) = 
vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \ + vrshrn_n_u32(val_hi, (shift))); \ } while (0) +static INLINE uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs, + int max_ofs) { + uint16x8_t r0; + uint16x8_t r1; + if (ofs + 7 >= max_ofs) { + int shuffle_idx = max_ofs - ofs; + r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx); + } else { + r0 = vld1q_u16(left0 + ofs); + } + if (ofs + 8 >= max_ofs) { + int shuffle_idx = max_ofs - ofs - 1; + r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx); + } else { + r1 = vld1q_u16(left0 + ofs + 1); + } + return (uint16x8x2_t){ { r0, r1 } }; +} + static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *left, @@ -2561,34 +2617,30 @@ static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst, if (base0 >= max_base_y) { out[0] = vdupq_n_u16(left_max); } else { - const uint16x8_t l00 = vld1q_u16(left + base0); - const uint16x8_t l01 = vld1q_u16(left1 + base0); - HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01, - shifts0, shifts1, 0, 6); + const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0], + l0.val[1], shifts0, shifts1, 0, 6); } if (base1 >= max_base_y) { out[1] = vdupq_n_u16(left_max); } else { - const uint16x8_t l10 = vld1q_u16(left + base1); - const uint16x8_t l11 = vld1q_u16(left1 + base1); - HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11, - shifts0, shifts1, 1, 6); + const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0], + l1.val[1], shifts0, shifts1, 1, 6); } if (base2 >= max_base_y) { out[2] = vdupq_n_u16(left_max); } else { - const uint16x8_t l20 = vld1q_u16(left + base2); - const uint16x8_t l21 = vld1q_u16(left1 + base2); - HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21, - shifts0, shifts1, 2, 6); + const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0], + l2.val[1], shifts0, shifts1, 2, 6); } if (base3 >= max_base_y) { out[3] = vdupq_n_u16(left_max); } else { - const uint16x8_t l30 = vld1q_u16(left + base3); - const uint16x8_t l31 = vld1q_u16(left1 + base3); - HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31, - shifts0, shifts1, 3, 6); + const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0], + l3.val[1], shifts0, shifts1, 3, 6); } transpose_array_inplace_u16_4x8(out); for (int r2 = 0; r2 < 4; ++r2) { diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c index 55d7eb13a7..f024c4fe53 100644 --- a/aom_dsp/arm/intrapred_neon.c +++ b/aom_dsp/arm/intrapred_neon.c @@ -1357,6 +1357,41 @@ static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride, } } +// clang-format off +static const uint8_t kLoadMaxShuffles[] = { + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 
7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; +// clang-format on + +static INLINE uint8x16_t z1_load_masked_neon(const uint8_t *ptr, + int shuffle_idx) { + uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]); + uint8x16_t src = vld1q_u8(ptr); +#if AOM_ARCH_AARCH64 + return vqtbl1q_u8(src, shuffle); +#else + uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } }; + uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle)); + uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle)); + return vcombine_u8(lo, hi); +#endif +} + static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int dx) { const int frac_bits = 6; @@ -1370,7 +1405,6 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride, // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); - const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x); int x = dx; for (int r = 0; r < N; r++, dst += stride) { @@ -1392,12 +1426,24 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride, vcreate_u8(0x0F0E0D0C0B0A0908))); for (int j = 0; j < 64; j += 16) { - int mdif = max_base_x - (base + j); - if (mdif <= 0) { + if (base + j >= max_base_x) { vst1q_u8(dst + j, a_mbase_x); } else { - uint8x16_t a0_128 = vld1q_u8(above + base + j); - uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j); + uint8x16_t a0_128; + uint8x16_t a1_128; + if (base + j + 15 >= max_base_x) { + int shuffle_idx = max_base_x - base - j; + a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx); + } else { + a0_128 = vld1q_u8(above + base + j); + } + if (base + j + 16 >= max_base_x) { + int shuffle_idx = max_base_x - base - j - 1; + a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx); + } else { + a1_128 = vld1q_u8(above + base + j + 1); + } + uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); @@ -1407,13 +1453,8 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride, vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); - uint8x16_t v_temp = - vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); - - uint8x16_t mask128 = - vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0)); - uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x); - vst1q_u8(dst + j, res128); + vst1q_u8(dst + j, + vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5))); base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16)); } -- GitLab From 753413f8a171b3594a7d0da0aa157981a910d253 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 9 May 2024 10:39:30 -0700 Subject: [PATCH 111/391] av1_dec_fuzzer: add aom_codec_peek_stream_info coverage Change-Id: I511539292cb8c2098c81f5fe3d711b9739482ffa --- examples/av1_dec_fuzzer.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/examples/av1_dec_fuzzer.cc b/examples/av1_dec_fuzzer.cc index e9388b7062..6f3305e017 100644 --- a/examples/av1_dec_fuzzer.cc +++ b/examples/av1_dec_fuzzer.cc @@ -67,8 +67,12 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { data += IVF_FRAME_HDR_SZ; frame_size = std::min(size, frame_size); - const aom_codec_err_t err = - aom_codec_decode(&codec, data, frame_size, nullptr); + aom_codec_stream_info_t stream_info; + aom_codec_err_t err = + aom_codec_peek_stream_info(codec_interface, data, size, &stream_info); + static_cast<void>(err); + + err = aom_codec_decode(&codec, data, frame_size, nullptr); static_cast<void>(err); aom_codec_iter_t iter = nullptr; aom_image_t *img = nullptr; -- GitLab From c319f69f9c4a69850654d4934cc339a0b5c01ceb Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 9 May 2024 11:45:21 -0700 Subject: [PATCH 112/391] avg_intrin_sse2: use xx_loadl_32() for unaligned int loads This quiets some undefined sanitizer warnings related to unaligned loads; no change in assembly with gcc-13. Change-Id: I770e5b9aab77cf91280fd6608efe922a21f382c3 --- aom_dsp/x86/avg_intrin_sse2.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c index 0b552b704b..7ff2801026 100644 --- a/aom_dsp/x86/avg_intrin_sse2.c +++ b/aom_dsp/x86/avg_intrin_sse2.c @@ -15,6 +15,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/bitdepth_conversion_sse2.h" #include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, @@ -171,10 +172,8 @@ unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { __m128i s0, s1, u0; unsigned int avg = 0; u0 = _mm_setzero_si128(); - s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)), - _mm_cvtsi32_si128(*(const int *)(s + p))); - s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)), - _mm_cvtsi32_si128(*(const int *)(s + p * 3))); + s0 = _mm_unpacklo_epi32(xx_loadl_32(s), xx_loadl_32(s + p)); + s1 = _mm_unpacklo_epi32(xx_loadl_32(s + p * 2), xx_loadl_32(s + p * 3)); s0 = _mm_sad_epu8(s0, u0); s1 = _mm_sad_epu8(s1, u0); s0 = _mm_add_epi16(s0, s1); -- GitLab From 5c229f826a5770e4dc0354f534f0db9c2def326e Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 9 May 2024 12:05:32 -0700 Subject: [PATCH 113/391] obmc_intrinsic_sse4: use xx_loadl_32() for unaligned int loads This quiets some undefined sanitizer warnings related to unaligned loads; no change in assembly with gcc-13. 
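The load-side helper mirrors the memcpy idiom used for xx_storel_32()
earlier in this series; a helper of this kind is typically implemented
along the following lines (an illustrative sketch, not the verbatim
synonyms.h definition):

  #include <emmintrin.h>
  #include <string.h>

  /* Load 32 bits from a possibly unaligned address into the low lane of
   * an XMM register; the upper 96 bits are zeroed. */
  static inline __m128i load_low_32(const void *a) {
    int val;
    memcpy(&val, a, sizeof(val));
    return _mm_cvtsi32_si128(val);
  }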
Change-Id: I8fcb3f24031a763183b149658be0f65763f04383 --- aom_dsp/x86/obmc_intrinsic_sse4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aom_dsp/x86/obmc_intrinsic_sse4.h b/aom_dsp/x86/obmc_intrinsic_sse4.h index 210f466b6f..fbed23596c 100644 --- a/aom_dsp/x86/obmc_intrinsic_sse4.h +++ b/aom_dsp/x86/obmc_intrinsic_sse4.h @@ -15,6 +15,7 @@ #include <smmintrin.h> #include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, @@ -28,7 +29,7 @@ static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, assert(IS_POWER_OF_TWO(h)); do { - const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n)); + const __m128i v_p_b = xx_loadl_32(pre + n); const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n)); const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n)); -- GitLab From 24bcf570a609471585ca3cb989076b9ef221f461 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 9 May 2024 12:19:28 -0700 Subject: [PATCH 114/391] convolve_sse2: use xx_loadl_32() for unaligned int loads This quiets some undefined sanitizer warnings related to unaligned loads; no major changes in assembly with gcc-13 (some register changes, instruction reordering). Change-Id: I2e8ac7f40caec56f204440a39116745e2a9a1fe2 --- av1/common/x86/convolve_sse2.c | 39 ++++++++++++++-------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c index 6383567a48..4787d3f1df 100644 --- a/av1/common/x86/convolve_sse2.c +++ b/av1/common/x86/convolve_sse2.c @@ -16,6 +16,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/synonyms.h" #include "av1/common/convolve.h" static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, @@ -200,31 +201,23 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, if (w <= 4) { __m128i s[8], src6, res, res_round, res16; int res_int; - src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride)); - s[0] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride))); - s[1] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride))); - s[2] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride))); - s[3] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride))); - s[4] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride))); - s[5] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6); + src6 = xx_loadl_32(src_ptr + 6 * src_stride); + s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride), + xx_loadl_32(src_ptr + 1 * src_stride)); + s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride), + xx_loadl_32(src_ptr + 2 * src_stride)); + s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride), + xx_loadl_32(src_ptr + 3 * src_stride)); + s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride), + xx_loadl_32(src_ptr + 4 * src_stride)); + s[4] = 
_mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride), + xx_loadl_32(src_ptr + 5 * src_stride)); + s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6); do { - s[6] = _mm_unpacklo_epi8( - src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride))); - src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride)); - s[7] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6); + s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride)); + src6 = xx_loadl_32(src_ptr + 8 * src_stride); + s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6); res = convolve_lo_y(s + 0, coeffs); res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); -- GitLab From af5b35da47aa6550216346d171351efe30ac14d2 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 9 May 2024 12:21:01 -0700 Subject: [PATCH 115/391] jnt_convolve_sse2: use xx_loadl_32() for unaligned int loads This quiets some undefined sanitizer warnings related to unaligned loads; no major changes in assembly with gcc-13 (some register changes, instruction reordering). Change-Id: I7cccdb954ac08bdb061d98fa3e3809e487e4c90a --- av1/common/x86/jnt_convolve_sse2.c | 39 ++++++++++++------------------ 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c index d5d2db7455..338615058c 100644 --- a/av1/common/x86/jnt_convolve_sse2.c +++ b/av1/common/x86/jnt_convolve_sse2.c @@ -15,6 +15,7 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/synonyms.h" void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, @@ -178,31 +179,23 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, if (w == 4) { __m128i s[8], src6, res, res_shift; - src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride)); - s[0] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride))); - s[1] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride))); - s[2] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride))); - s[3] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride))); - s[4] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride))); - s[5] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6); + src6 = xx_loadl_32(src_ptr + 6 * src_stride); + s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride), + xx_loadl_32(src_ptr + 1 * src_stride)); + s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride), + xx_loadl_32(src_ptr + 2 * src_stride)); + s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride), + xx_loadl_32(src_ptr + 3 * src_stride)); + s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride), + xx_loadl_32(src_ptr + 4 * src_stride)); + s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride), + xx_loadl_32(src_ptr + 5 * src_stride)); + s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6); do { - s[6] = _mm_unpacklo_epi8( - src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride))); - src6 = 
_mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride)); - s[7] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6); + s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride)); + src6 = xx_loadl_32(src_ptr + 8 * src_stride); + s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6); res = convolve_lo_y(s + 0, coeffs); res_shift = _mm_sll_epi32(res, left_shift); -- GitLab From b2f80f721fe87547e3bae73be550e3d75ba478ad Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 10 May 2024 11:01:58 -0700 Subject: [PATCH 116/391] resize_sse2.c: fix integer sanitizer conversion warning add missing cast in call to _mm_set1_epi8(). This is the same change that was made in resize_avx2.c: ae7e8cc435 resize_avx2.c: fix integer sanitizer conversion warning fixes: implicit conversion from type 'uint8_t' (aka 'unsigned char') of value 255 (8-bit, unsigned) to type 'char' changed the value to -1 (8-bit, signed) Change-Id: I0fcddfb8003f659eceaafc8fdbfae4dea723fbc6 --- av1/common/x86/resize_sse2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c index c68371cb06..f0470a3f3c 100644 --- a/av1/common/x86/resize_sse2.c +++ b/av1/common/x86/resize_sse2.c @@ -102,7 +102,7 @@ bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const uint8_t max_pixel = 255; - const __m128i clip_pixel = _mm_set1_epi8(max_pixel); + const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel); const __m128i zero = _mm_setzero_si128(); prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); -- GitLab From 134c6e5a62eea4b4e86b30cf3231cb3ac3d4cbf7 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 9 May 2024 12:32:08 -0700 Subject: [PATCH 117/391] Remove aom_ports/msvc.h The aom_ports/msvc.h header provides snprintf() and round() for MSVC older than Visual Studio 2015 and Visual Studio 2013, respectively. It also provides _mm256_insert_epi32, _mm256_extract_epi32 and _mm256_insert_epi16 for versions prior to 2017. Since the project now requires Visual Studio 2019 or later, it is safe to remove aom_ports/msvc.h. 
This matches the change made in libvpx: f65aff7b9 Remove vpx_ports/msvc.h Change-Id: I2fe4c41eaa126f4cf17639c11895f1e464294c76 --- aom_ports/aom_ports.cmake | 2 +- aom_ports/bitops.h | 1 - aom_ports/msvc.h | 75 --------------------------------------- common/args.c | 1 - common/tools_common.h | 1 - common/y4minput.c | 1 - test/hbd_metrics_test.cc | 1 - 7 files changed, 1 insertion(+), 81 deletions(-) delete mode 100644 aom_ports/msvc.h diff --git a/aom_ports/aom_ports.cmake b/aom_ports/aom_ports.cmake index 8fd2ffd078..6df2bf020b 100644 --- a/aom_ports/aom_ports.cmake +++ b/aom_ports/aom_ports.cmake @@ -18,7 +18,7 @@ list(APPEND AOM_PORTS_INCLUDES "${AOM_ROOT}/aom_ports/aom_once.h" "${AOM_ROOT}/aom_ports/emmintrin_compat.h" "${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h" "${AOM_ROOT}/aom_ports/mem_ops_aligned.h" - "${AOM_ROOT}/aom_ports/msvc.h" "${AOM_ROOT}/aom_ports/sanitizer.h") + "${AOM_ROOT}/aom_ports/sanitizer.h") list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm") diff --git a/aom_ports/bitops.h b/aom_ports/bitops.h index 0795855083..7db4cde90b 100644 --- a/aom_ports/bitops.h +++ b/aom_ports/bitops.h @@ -15,7 +15,6 @@ #include <assert.h> #include <stdint.h> -#include "aom_ports/msvc.h" #include "config/aom_config.h" #ifdef _MSC_VER diff --git a/aom_ports/msvc.h b/aom_ports/msvc.h deleted file mode 100644 index e78e605f2f..0000000000 --- a/aom_ports/msvc.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_PORTS_MSVC_H_ -#define AOM_AOM_PORTS_MSVC_H_ -#ifdef _MSC_VER - -#include "config/aom_config.h" - -#if _MSC_VER < 1900 // VS2015 provides snprintf -#define snprintf _snprintf -#endif // _MSC_VER < 1900 - -#if _MSC_VER < 1800 // VS2013 provides round -#include <math.h> -static INLINE double round(double x) { - if (x < 0) - return ceil(x - 0.5); - else - return floor(x + 0.5); -} - -static INLINE float roundf(float x) { - if (x < 0) - return (float)ceil(x - 0.5f); - else - return (float)floor(x + 0.5f); -} - -static INLINE long lroundf(float x) { - if (x < 0) - return (long)(x - 0.5f); - else - return (long)(x + 0.5f); -} -#endif // _MSC_VER < 1800 - -#if HAVE_AVX -#include <immintrin.h> -// Note: -// _mm256_insert_epi16 intrinsics is available from vs2017. -// We define this macro for vs2015 and earlier. 
The -// intrinsics used here are in vs2015 document: -// https://msdn.microsoft.com/en-us/library/hh977022.aspx -// Input parameters: -// a: __m256i, -// d: int16_t, -// indx: imm8 (0 - 15) -#if _MSC_VER <= 1900 -#define _mm256_insert_epi16(a, d, indx) \ - _mm256_insertf128_si256( \ - a, \ - _mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \ - indx >> 3) - -static INLINE int _mm256_extract_epi32(__m256i a, const int i) { - return a.m256i_i32[i & 7]; -} -static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) { - __m256i c = a; - c.m256i_i32[i & 7] = b; - return c; -} -#endif // _MSC_VER <= 1900 -#endif // HAVE_AVX -#endif // _MSC_VER -#endif // AOM_AOM_PORTS_MSVC_H_ diff --git a/common/args.c b/common/args.c index b5ede193b5..c380dde8a0 100644 --- a/common/args.c +++ b/common/args.c @@ -17,7 +17,6 @@ #include <limits.h> #include "aom/aom_integer.h" -#include "aom_ports/msvc.h" #include "aom/aom_codec.h" #include "common/tools_common.h" diff --git a/common/tools_common.h b/common/tools_common.h index 9d891d1561..cde21646ed 100644 --- a/common/tools_common.h +++ b/common/tools_common.h @@ -20,7 +20,6 @@ #include "aom/aom_image.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" -#include "aom_ports/msvc.h" #if CONFIG_AV1_ENCODER #include "common/y4minput.h" diff --git a/common/y4minput.c b/common/y4minput.c index 1974d76f1f..6a8601edfb 100644 --- a/common/y4minput.c +++ b/common/y4minput.c @@ -17,7 +17,6 @@ #include <string.h> #include "aom/aom_integer.h" -#include "aom_ports/msvc.h" #include "y4minput.h" // Reads 'size' bytes from 'file' into 'buf' with some fault tolerance. diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc index 71c816f1cc..59bca9bcec 100644 --- a/test/hbd_metrics_test.cc +++ b/test/hbd_metrics_test.cc @@ -23,7 +23,6 @@ #include "aom_dsp/psnr.h" #include "aom_dsp/ssim.h" #include "aom_ports/mem.h" -#include "aom_ports/msvc.h" #include "aom_scale/yv12config.h" using libaom_test::ACMRandom; -- GitLab From e06f94e15859b20c00b4313eaad0171c25c21e81 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 9 May 2024 16:56:21 -0700 Subject: [PATCH 118/391] remove {xx,yy}_set1_64_from_32i & xx_set_64_from_32i These were needed for compatibility with Visual Studio versions prior to 2015. The project now requires 2019 or later. 
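One subtlety in the substitution: the removed helpers zero-extended a
32-bit value into each 64-bit lane, so the replacement call sites cast
through an unsigned 32-bit type first. A small standalone check
(written for this note; it assumes only SSE2 and a C99 compiler)
illustrates why the cast preserves behavior:

  #include <emmintrin.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  int main(void) {
    /* xx_set1_64_from_32i(~0) produced 0x00000000ffffffff in each
     * 64-bit lane; _mm_set1_epi64x(~0u) is the direct equivalent,
     * because the unsigned 32-bit ~0u zero-extends when converted to
     * the long long argument. */
    const __m128i v = _mm_set1_epi64x(~0u);
    uint64_t lanes[2];
    memcpy(lanes, &v, sizeof(lanes));
    printf("%016llx %016llx\n", (unsigned long long)lanes[0],
           (unsigned long long)lanes[1]); /* 00000000ffffffff twice */
    return 0;
  }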
Change-Id: I33b3f0be5ee86ab573a4764553b606f5eb2c14c9 --- aom_dsp/x86/highbd_sad_avx2.c | 2 +- aom_dsp/x86/sum_squares_avx2.c | 2 +- aom_dsp/x86/sum_squares_sse2.c | 6 +++--- aom_dsp/x86/synonyms.h | 22 ---------------------- aom_dsp/x86/synonyms_avx2.h | 11 ----------- av1/common/x86/reconinter_avx2.c | 2 +- av1/common/x86/reconinter_ssse3.c | 2 +- av1/encoder/x86/av1_highbd_quantize_sse4.c | 9 +++++---- av1/encoder/x86/wedge_utils_avx2.c | 2 +- av1/encoder/x86/wedge_utils_sse2.c | 2 +- 10 files changed, 14 insertions(+), 46 deletions(-) diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c index 6c78eeeefb..8b3045a610 100644 --- a/aom_dsp/x86/highbd_sad_avx2.c +++ b/aom_dsp/x86/highbd_sad_avx2.c @@ -551,7 +551,7 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, uint32_t *res) { __m256i u0, u1, u2, u3; - const __m256i mask = yy_set1_64_from_32i(~0); + const __m256i mask = _mm256_set1_epi64x(~0u); __m128i sad; // 8 32-bit summation diff --git a/aom_dsp/x86/sum_squares_avx2.c b/aom_dsp/x86/sum_squares_avx2.c index 89b9b824bf..c748a7dcce 100644 --- a/aom_dsp/x86/sum_squares_avx2.c +++ b/aom_dsp/x86/sum_squares_avx2.c @@ -21,7 +21,7 @@ static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride, int width, int height) { uint64_t result; __m256i v_acc_q = _mm256_setzero_si256(); - const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0); + const __m256i v_zext_mask_q = _mm256_set1_epi64x(~0u); for (int col = 0; col < height; col += 4) { __m256i v_acc_d = _mm256_setzero_si256(); for (int row = 0; row < width; row += 16) { diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c index cf3ed98974..6c34c44317 100644 --- a/aom_dsp/x86/sum_squares_sse2.c +++ b/aom_dsp/x86/sum_squares_sse2.c @@ -84,7 +84,7 @@ uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, src += stride << 2; r += 4; } while (r < height); - const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0); + const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u); __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32), _mm_and_si128(v_acc_q, v_zext_mask_q)); v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8)); @@ -116,7 +116,7 @@ aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height) { int r = 0; - const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0); + const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u); __m128i v_acc_q = _mm_setzero_si128(); do { @@ -254,7 +254,7 @@ uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width, ////////////////////////////////////////////////////////////////////////////// static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { - const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0); + const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u); __m128i v_acc0_q = _mm_setzero_si128(); __m128i v_acc1_q = _mm_setzero_si128(); diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h index 0d51cdff48..ab13446b8d 100644 --- a/aom_dsp/x86/synonyms.h +++ b/aom_dsp/x86/synonyms.h @@ -70,28 +70,6 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) { _mm_storeu_si128((__m128i *)a, v); } -// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio -// compilers. The following function is equivalent to _mm_set_epi64x() -// acting on 32-bit integers. 
-static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) { -#if defined(_MSC_VER) && _MSC_VER < 1900 - return _mm_set_epi32(0, e1, 0, e0); -#else - return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0); -#endif -} - -// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio -// compilers. The following function is equivalent to _mm_set1_epi64x() -// acting on a 32-bit integer. -static INLINE __m128i xx_set1_64_from_32i(int32_t a) { -#if defined(_MSC_VER) && _MSC_VER < 1900 - return _mm_set_epi32(0, a, 0, a); -#else - return _mm_set1_epi64x((uint32_t)a); -#endif -} - // Fill an SSE register using an interleaved pair of values, ie. set the // 8 channels to {a, b, a, b, a, b, a, b}, using the same channel ordering // as when a register is stored to / loaded from memory. diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h index d4e8f69111..d78f4e6f98 100644 --- a/aom_dsp/x86/synonyms_avx2.h +++ b/aom_dsp/x86/synonyms_avx2.h @@ -53,17 +53,6 @@ static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) { return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b); } -// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio -// compilers. The following function is equivalent to _mm256_set1_epi64x() -// acting on a 32-bit integer. -static INLINE __m256i yy_set1_64_from_32i(int32_t a) { -#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a); -#else - return _mm256_set1_epi64x((uint32_t)a); -#endif -} - // Some compilers don't have _mm256_set_m128i defined in immintrin.h. We // therefore define an equivalent function using a different intrinsic. // ([ hi ], [ lo ]) -> [ hi ][ lo ] diff --git a/av1/common/x86/reconinter_avx2.c b/av1/common/x86/reconinter_avx2.c index 71fab7a577..4bc5aa41c3 100644 --- a/av1/common/x86/reconinter_avx2.c +++ b/av1/common/x86/reconinter_avx2.c @@ -576,7 +576,7 @@ void av1_build_compound_diffwtd_mask_highbd_avx2( } } } else { - const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2); if (mask_type == DIFFWTD_38_INV) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { diff --git a/av1/common/x86/reconinter_ssse3.c b/av1/common/x86/reconinter_ssse3.c index c9a3709a62..b177958b83 100644 --- a/av1/common/x86/reconinter_ssse3.c +++ b/av1/common/x86/reconinter_ssse3.c @@ -76,7 +76,7 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3( } } } else { - const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2); if (mask_type == DIFFWTD_38_INV) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 8) { diff --git a/av1/encoder/x86/av1_highbd_quantize_sse4.c b/av1/encoder/x86/av1_highbd_quantize_sse4.c index 40b3b460b6..f3a0b15de5 100644 --- a/av1/encoder/x86/av1_highbd_quantize_sse4.c +++ b/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -138,8 +138,9 @@ void av1_highbd_quantize_fp_sse4_1( const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); qparam[0] = _mm_set_epi32(round1, round1, round1, round0); - qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]); - qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); + qparam[1] = _mm_set_epi64x((uint32_t)quant_ptr[1], (uint32_t)quant_ptr[0]); + qparam[2] = + _mm_set_epi64x((uint32_t)dequant_ptr[1], (uint32_t)dequant_ptr[0]); qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], 
dequant_ptr[0]); @@ -149,8 +150,8 @@ void av1_highbd_quantize_fp_sse4_1( // update round/quan/dquan for AC qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); - qparam[1] = xx_set1_64_from_32i(quant_ptr[1]); - qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]); + qparam[1] = _mm_set1_epi64x((uint32_t)quant_ptr[1]); + qparam[2] = _mm_set1_epi64x((uint32_t)dequant_ptr[1]); qparam[3] = _mm_set1_epi32(dequant_ptr[1]); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr, dquanAddr); diff --git a/av1/encoder/x86/wedge_utils_avx2.c b/av1/encoder/x86/wedge_utils_avx2.c index 9cde860534..3f61c023c8 100644 --- a/av1/encoder/x86/wedge_utils_avx2.c +++ b/av1/encoder/x86/wedge_utils_avx2.c @@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d, uint64_t csse; const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE); - const __m256i v_zext_q = yy_set1_64_from_32i(~0); + const __m256i v_zext_q = _mm256_set1_epi64x(~0u); __m256i v_acc0_q = _mm256_setzero_si256(); diff --git a/av1/encoder/x86/wedge_utils_sse2.c b/av1/encoder/x86/wedge_utils_sse2.c index d7ac2223f2..c3005790f2 100644 --- a/av1/encoder/x86/wedge_utils_sse2.c +++ b/av1/encoder/x86/wedge_utils_sse2.c @@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, uint64_t csse; const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); - const __m128i v_zext_q = xx_set1_64_from_32i(~0); + const __m128i v_zext_q = _mm_set1_epi64x(~0u); __m128i v_acc0_q = _mm_setzero_si128(); -- GitLab From cf61d393e9c8c70b739e3a4e27c955d82e0c4e9a Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 10 May 2024 13:47:47 -0700 Subject: [PATCH 119/391] {jnt,}convolve_sse2: move load closer to first use generates mildly better assembly with gcc-13 and clang-16. 
Change-Id: I1e8fb2a6407e292c15e44dc7dd2676bad9a69857
---
 av1/common/x86/convolve_sse2.c     | 2 +-
 av1/common/x86/jnt_convolve_sse2.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 4787d3f1df..9272e91b54 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -201,7 +201,6 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
   if (w <= 4) {
     __m128i s[8], src6, res, res_round, res16;
     int res_int;
-    src6 = xx_loadl_32(src_ptr + 6 * src_stride);
     s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
                              xx_loadl_32(src_ptr + 1 * src_stride));
     s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
@@ -212,6 +211,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                              xx_loadl_32(src_ptr + 4 * src_stride));
     s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
                              xx_loadl_32(src_ptr + 5 * src_stride));
+    src6 = xx_loadl_32(src_ptr + 6 * src_stride);
     s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
 
     do {
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index 338615058c..6b1227890a 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -179,7 +179,6 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
 
   if (w == 4) {
     __m128i s[8], src6, res, res_shift;
-    src6 = xx_loadl_32(src_ptr + 6 * src_stride);
     s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
                              xx_loadl_32(src_ptr + 1 * src_stride));
     s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
@@ -190,6 +189,7 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
                              xx_loadl_32(src_ptr + 4 * src_stride));
     s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
                              xx_loadl_32(src_ptr + 5 * src_stride));
+    src6 = xx_loadl_32(src_ptr + 6 * src_stride);
     s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
 
     do {
-- 
GitLab


From 8834584f65e9058586be7a448786a1fa5e1513a4 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Fri, 10 May 2024 15:10:27 -0700
Subject: [PATCH 120/391] Assert an aom_img_set_rect call always succeeds

The aom_img_set_rect() call at the end of img_alloc_helper() always
succeeds, so assert that its return value is equal to 0.

Change-Id: Ia43e967ad5d7dc837a7357f018593802a912cd53
---
 aom/src/aom_image.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c
index 1d3b7df245..c29095cbc5 100644
--- a/aom/src/aom_image.c
+++ b/aom/src/aom_image.c
@@ -182,7 +182,9 @@ static aom_image_t *img_alloc_helper(
 
   /* Default viewport to entire image. (This aom_img_set_rect call always
    * succeeds.) */
-  aom_img_set_rect(img, 0, 0, d_w, d_h, border);
+  int ret = aom_img_set_rect(img, 0, 0, d_w, d_h, border);
+  assert(ret == 0);
+  (void)ret;
 
   return img;
 fail:
-- 
GitLab


From 07f632387d57520d5024d7d9cfbdb268a3d2db90 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 9 May 2024 18:13:05 -0700
Subject: [PATCH 121/391] test.cmake: remove unneeded target link libraries

aom_av1_rc already adds aom as a target link library; aom_gmock (which
adds aom_gtest) is unused, as is webm.
Fixes a warning when linking on macOS: ld: warning: ignoring duplicate libraries: 'libaom.a', 'libaom_gtest.a' Change-Id: I6d881bade3f5d32fc971c3493cdd106069e92be3 --- test/test.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test.cmake b/test/test.cmake index 2631c9fb39..da144683ae 100644 --- a/test/test.cmake +++ b/test/test.cmake @@ -638,8 +638,7 @@ function(setup_aom_test_targets) AND NOT BUILD_SHARED_LIBS AND NOT CONFIG_REALTIME_ONLY) add_executable(test_aom_rc ${AOM_RC_TEST_SOURCES}) - target_link_libraries(test_aom_rc ${AOM_LIB_LINK_TYPE} aom aom_av1_rc - aom_gtest aom_gmock webm) + target_link_libraries(test_aom_rc ${AOM_LIB_LINK_TYPE} aom_av1_rc aom_gtest) set_property(TARGET test_aom_rc PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER}) list(APPEND AOM_APP_TARGETS test_aom_rc) endif() -- GitLab From 283aaef609279f5f5a7164e367cef6038f6a735e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Sat, 11 May 2024 08:18:58 -0700 Subject: [PATCH 122/391] av1_dec_fuzzer: Initialize stream_info.is_annexb The is_annexb member of aom_codec_stream_info_t must be properly initialized. Related to the libvpx CL https://chromium-review.googlesource.com/c/webm/libvpx/+/5533523. Bug: oss-fuzz:68930 Change-Id: I324d02ff1fd16541700ca5ecdb515e842e52bade --- examples/av1_dec_fuzzer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/av1_dec_fuzzer.cc b/examples/av1_dec_fuzzer.cc index 6f3305e017..4634ca628a 100644 --- a/examples/av1_dec_fuzzer.cc +++ b/examples/av1_dec_fuzzer.cc @@ -68,6 +68,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { frame_size = std::min(size, frame_size); aom_codec_stream_info_t stream_info; + stream_info.is_annexb = is_annexb; aom_codec_err_t err = aom_codec_peek_stream_info(codec_interface, data, size, &stream_info); static_cast<void>(err); -- GitLab From fe50f29a1c31f109ced647c14a4c079194ca775a Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 10 May 2024 19:51:36 -0700 Subject: [PATCH 123/391] av1_inv_txfm_neon.c: make some functions static Fixes some -Wmissing-prototypes warnings. 
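As a generic illustration of the warning class being fixed (a sketch, not the libaom functions themselves): -Wmissing-prototypes flags an externally visible definition that has no prior prototype, since other translation units could declare it with a mismatched signature; giving file-local helpers internal linkage removes that possibility.

    /* Compiled with -Wmissing-prototypes. */
    int widget_count(void);               /* prototype: definition below is fine */
    int widget_count(void) { return 0; }

    static int helper(void) { return 1; } /* static: no prototype required */
    int use_helper(void);
    int use_helper(void) { return helper(); }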
Bug: aomedia:3416 Change-Id: I12489d1845ca8c6aa5035d0548b8bab86883b05c --- av1/common/arm/av1_inv_txfm_neon.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c index 4a66b9016a..f15d473560 100644 --- a/av1/common/arm/av1_inv_txfm_neon.c +++ b/av1/common/arm/av1_inv_txfm_neon.c @@ -3813,8 +3813,9 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, } } -void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, int eob) { +static void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + int eob) { (void)eob; TX_SIZE tx_size = TX_4X8; DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); @@ -3878,8 +3879,9 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, } } -void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, int eob) { +static void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + int eob) { (void)eob; TX_SIZE tx_size = TX_8X4; DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); @@ -3943,8 +3945,9 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, } } -void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, int eob) { +static void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, int eob) { (void)eob; TX_SIZE tx_size = TX_4X16; DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); @@ -4007,8 +4010,9 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, } } -void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, int eob) { +static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, int eob) { (void)eob; TX_SIZE tx_size = TX_16X4; DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); -- GitLab From c349b1ebd23dce55cf23c5f8fa56d5a9416971b0 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 25 Apr 2024 15:16:17 +0100 Subject: [PATCH 124/391] Add 4-tap path for av1_convolve_2d_horiz_sr_neon Add 4-tap specialization for the horizontal pass of av1_convolve_2d_sr_neon. This gives up to 30% uplift over using the 8-tap path. 
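To make the arithmetic in the new 4-tap path concrete, here is a scalar model under the usual AV1 constants (FILTER_BITS = 7, ROUND0_BITS = 3, bd = 8); the helper name is illustrative, not part of the patch:

    #include <stdint.h>

    #define FILTER_BITS 7
    #define ROUND0_BITS 3

    // All 4-tap filters have even taps, so the taps are pre-halved and the
    // shift is one bit shorter; the additive constant folds in both the
    // bd-dependent offset and the rounding term, so a plain non-rounding
    // shift suffices.
    static int16_t horiz_4tap_px(const uint8_t *s, const int16_t *halved_taps) {
      int32_t sum = (1 << (8 + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2));
      for (int k = 0; k < 4; ++k) sum += halved_taps[k] * s[k];
      return (int16_t)(sum >> (ROUND0_BITS - 1));
    }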
Change-Id: I6c9b7be0e90661a36cef95db1daedc2fabd6a31e --- aom_dsp/arm/mem_neon.h | 22 ++ av1/common/arm/convolve_neon.c | 363 ++++++++++++++++++++------------- 2 files changed, 243 insertions(+), 142 deletions(-) diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index b1f6ebeb14..46aa16e61d 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h @@ -654,6 +654,13 @@ static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride, vst1q_s16(s, s3); } +static INLINE void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride, + const int16x8_t s0, const int16x8_t s1) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); +} + static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3, @@ -1248,6 +1255,12 @@ static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src, memcpy(dst, &a, 8); \ } while (0) +#define store_s16_4x1_lane(dst, src, lane) \ + do { \ + int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \ + memcpy(dst, &a, 8); \ + } while (0) + // Store the low 16-bits from a single vector. static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) { store_u8_2x1_lane(dst, src, 0); @@ -1307,9 +1320,18 @@ static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride, store_u16_4x1_lane(dst, src, 1); } +// Store two blocks of 64-bits from a single vector. +static INLINE void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride, + int16x8_t src) { + store_s16_4x1_lane(dst, src, 0); + dst += dst_stride; + store_s16_4x1_lane(dst, src, 1); +} + #undef store_u8_2x1_lane #undef store_u8_4x1_lane #undef store_u16_2x1_lane #undef store_u16_4x1_lane +#undef store_s16_4x1_lane #endif // AOM_AOM_DSP_ARM_MEM_NEON_H_ diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c index bd11b7cf29..72a85893e8 100644 --- a/av1/common/arm/convolve_neon.c +++ b/av1/common/arm/convolve_neon.c @@ -1307,18 +1307,122 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon( } while (--h != 0); } -static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, +static INLINE int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, - const int16x4_t horiz_const) { - int16x4_t sum = horiz_const; - sum = vmla_lane_s16(sum, s0, filter, 0); - sum = vmla_lane_s16(sum, s1, filter, 1); - sum = vmla_lane_s16(sum, s2, filter, 2); - sum = vmla_lane_s16(sum, s3, filter, 3); + const int16x8_t horiz_const) { + int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + // We halved the filter values so -1 from right shift. + return vshrq_n_s16(sum, ROUND0_BITS - 1); +} - // We halved the convolution filter values so -1 from the right shift. - return vshr_n_s16(sum, ROUND0_BITS - 1); +static INLINE void convolve_2d_sr_horiz_4tap_neon( + const uint8_t *src, ptrdiff_t src_stride, int16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) { + const int bd = 8; + // All filter values are even, halve to reduce intermediate precision + // requirements. + const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1); + + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. 
+ // (The extra -1 is needed because we halved the filter values.) + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + if (w == 4) { + do { + uint8x8_t t01[4]; + t01[0] = load_unaligned_u8(src + 0, (int)src_stride); + t01[1] = load_unaligned_u8(src + 1, (int)src_stride); + t01[2] = load_unaligned_u8(src + 2, (int)src_stride); + t01[3] = load_unaligned_u8(src + 3, (int)src_stride); + + int16x8_t s01[4]; + s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0])); + s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1])); + s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2])); + s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3])); + + int16x8_t d01 = + convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const); + + store_s16x4_strided_x2(dst, (int)dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = w; + const uint8_t *s = src; + int16_t *d = dst; + + do { + uint8x8_t t0[4], t1[4]; + load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]); + load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]); + + int16x8_t s0[4]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); + s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); + s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); + s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); + + int16x8_t s1[4]; + s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0])); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1])); + s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2])); + s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3])); + + int16x8_t d0 = + convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const); + int16x8_t d1 = + convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const); + + store_s16_8x2(d, dst_stride, d0, d1); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 2); + + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x8_t t0[4]; + load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]); + + int16x8_t s0[4]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); + s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); + s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); + s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); + + int16x8_t d0 = + convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const); + + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } } static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, @@ -1344,10 +1448,9 @@ static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, return vshrq_n_s16(sum, ROUND0_BITS - 1); } -static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride, - int16_t *im_block, int im_stride, - int w, int im_h, - const int16_t *x_filter_ptr) { +static INLINE void convolve_2d_sr_horiz_8tap_neon( + const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, + int im_h, const int16_t *x_filter_ptr) { const int bd = 8; const uint8_t *src_ptr = src; @@ -1355,149 +1458,119 @@ static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride, int dst_stride = im_stride; int height = im_h; - if (w <= 4) { - // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. 
- // (The extra -1 is needed because we halved the filter values.) - const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) + - (1 << ((ROUND0_BITS - 1) - 1))); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. - const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); - - src_ptr += 2; - - do { - uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 - int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - - int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 - int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 - int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); - int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const); +#if AOM_ARCH_AARCH64 + while (height > 8) { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; - vst1_s16(dst_ptr, d0); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--height != 0); - } else { - // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - // (The extra -1 is needed because we halved the filter values.) - const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + - (1 << ((ROUND0_BITS - 1) - 1))); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); -#if AOM_ARCH_AARCH64 - while (height > 8) { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + s += 7; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s += 7; + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + horiz_const); + int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, horiz_const); + int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, horiz_const); + int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, horiz_const); + int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - do { - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - - transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - - int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - - int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, - x_filter, horiz_const); - int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, - x_filter, horiz_const); - int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, - x_filter, horiz_const); - int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, - x_filter, horiz_const); - int16x8_t d4 = 
convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, - x_filter, horiz_const); - int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, - x_filter, horiz_const); - int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, - x_filter, horiz_const); - int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, - x_filter, horiz_const); - - transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - - store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += 8 * src_stride; - dst_ptr += 8 * dst_stride; - height -= 8; - } + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; + } #endif // AOM_ARCH_AARCH64 - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; - uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 - int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - do { - uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15 - int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + do { + uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 - int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 - int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 - int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 - int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 - int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 - int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 - int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, - x_filter, horiz_const); + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const); - vst1q_s16(d, d0); + vst1q_s16(d, d0); - s0 = s8; - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--height != 0); - } + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); } void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, @@ -1514,6 +1587,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int clamped_y_taps = y_filter_taps < 6 ? 
6 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; @@ -1544,8 +1618,13 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, - x_filter_ptr); + if (x_filter_taps <= 4) { + convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } else { + convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride, + w, im_h, x_filter_ptr); + } const int16x8_t y_filter = vld1q_s16(y_filter_ptr); -- GitLab From 99a75dfcf8ff48d4f06875e208bc9c8681d6fac0 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 25 Apr 2024 16:50:46 +0100 Subject: [PATCH 125/391] Add 4-tap path for av1_convolve_2d_horiz_sr_neon_dotprod Add 4-tap specialization for the horizontal pass of av1_convolve_2d_sr_neon_dotprod. This gives up to 10% uplift over using the 8-tap path. Change-Id: I49f674b8e525b7f4de8fc3ff5073f0dd30981b49 --- av1/common/arm/convolve_neon_dotprod.c | 267 ++++++++++++++++--------- 1 file changed, 174 insertions(+), 93 deletions(-) diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index 393f2e81f9..3c85f3cb4b 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c @@ -1162,26 +1162,141 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( } } -static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, +static INLINE int16x4_t convolve4_4_2d_h(const uint8x16_t samples, const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16_t permute_tbl) { - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - int8x16_t clamped_samples = - vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + const uint8x16_t permute_tbl, + const int32x4_t correction) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); - // Accumulate dot product into 'correction' to account for range clamp. - int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0); + // Accumulate into 'correction' to account for range transform. + int32x4_t sum = vdotq_lane_s32(correction, perm_samples, filters, 0); // We halved the convolution filter values so -1 from the right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } +static INLINE int16x8_t convolve4_8_2d_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl, + const int32x4_t correction) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 'correction' to account for range transform. 
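+  // Why this works: with samples transformed to x - 128 and taps halved to
+  // f / 2, the dot product equals (sum(f * x) - (128 << FILTER_BITS)) / 2,
+  // so 'correction' carries ((128 << FILTER_BITS) + horiz_const) / 2.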
+ int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0); + int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0); + + // Narrow and re-pack. + // We halved the filter values so -1 from right shift. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_2d_sr_horiz_4tap_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, int16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) { + const int bd = 8; + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Halve the total because we will halve the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction); + int16x4_t d1 = convolve4_4_2d_h(s1, filter, permute_tbl, correction); + int16x4_t d2 = convolve4_4_2d_h(s2, filter, permute_tbl, correction); + int16x4_t d3 = convolve4_4_2d_h(s3, filter, permute_tbl, correction); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src); + int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction); + vst1_s16(dst, d0); + + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction); + int16x8_t d1 = convolve4_8_2d_h(s1, filter, permute_tbl, correction); + int16x8_t d2 = convolve4_8_2d_h(s2, filter, permute_tbl, correction); + int16x8_t d3 = convolve4_8_2d_h(s3, filter, permute_tbl, correction); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } +} + static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, @@ -1215,7 +1330,7 @@ static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, vshrn_n_s32(sum[1], ROUND0_BITS - 1)); } -static INLINE void convolve_2d_sr_horiz_neon_dotprod( +static INLINE void convolve_2d_sr_horiz_8tap_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, 
int im_h, const int16_t *x_filter_ptr) { const int bd = 8; @@ -1235,97 +1350,57 @@ static INLINE void convolve_2d_sr_horiz_neon_dotprod( int dst_stride = im_stride; int height = im_h; - if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); - src_ptr += 2; + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = - convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); - int16x4_t d1 = - convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); - int16x4_t d2 = - convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); - int16x4_t d3 = - convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); - - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - height -= 4; - } while (height > 4); - - do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = - convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); - vst1_s16(dst_ptr, d0); - - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--height != 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); - - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + int16x8_t d0 = + convolve8_8_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x8_t d1 = + convolve8_8_2d_h(s1, x_filter, correction, range_limit, permute_tbl); + int16x8_t d2 = + convolve8_8_2d_h(s2, x_filter, correction, range_limit, permute_tbl); + int16x8_t d3 = + convolve8_8_2d_h(s3, x_filter, correction, range_limit, permute_tbl); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); - int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit, - permute_tbl); - int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit, - permute_tbl); - int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit, - permute_tbl); + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - height -= 4; - } while (height > 4); + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0 = vld1q_u8(s); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); - vst1q_s16(d, d0); + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = + convolve8_8_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + vst1q_s16(d, d0); - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--height != 0); - } + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); } void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, @@ -1343,6 +1418,7 @@ void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; @@ -1374,8 +1450,13 @@ void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - convolve_2d_sr_horiz_neon_dotprod(src_ptr, src_stride, im_block, im_stride, - w, im_h, x_filter_ptr); + if (x_filter_taps <= 4) { + convolve_2d_sr_horiz_4tap_neon_dotprod(src_ptr + 2, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } else { + convolve_2d_sr_horiz_8tap_neon_dotprod(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } const int16x8_t y_filter = vld1q_s16(y_filter_ptr); -- GitLab From 4e9108ef508b481f65406538ad8a748066bd1f0c Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Wed, 8 May 2024 16:04:32 +0100 Subject: [PATCH 126/391] Add 4-tap path for av1_convolve_2d_horiz_sr_neon_i8mm Add 4-tap specialization for the horizontal pass of av1_convolve_2d_sr_neon_i8mm. This gives up to 10% uplift over using the 8-tap path. 
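For intuition, a scalar model of the permute-then-dot-product step shared by these kernels (the helper is hypothetical; the real code uses vqtbl1q_u8 with kDotProdPermuteTbl and a dot-product instruction): the table { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } arranges 16 source bytes into four overlapping 4-byte windows so that one instruction yields four adjacent outputs.

    #include <stdint.h>

    // out[i] is the 4-tap dot product for output pixel i over a sliding
    // window, mirroring what the permute + dot-product pair computes per lane.
    static void dot4_windows(const uint8_t *src, const int8_t *taps,
                             int32_t out[4]) {
      for (int i = 0; i < 4; ++i) {
        int32_t sum = 0;
        for (int k = 0; k < 4; ++k) sum += (int32_t)src[i + k] * taps[k];
        out[i] = sum;
      }
    }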
Change-Id: I6d67190d424a51e077f9cfe8edf8c89b446780a3 --- av1/common/arm/convolve_neon_i8mm.c | 201 +++++++++++++++++++--------- 1 file changed, 140 insertions(+), 61 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index f8b11eb358..b6a2a41ba0 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -1060,21 +1060,6 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( } } -static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, - const int8x8_t filters, - const uint8x16_t permute_tbl, - const int32x4_t horiz_const) { - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); - - // First 4 output values. - int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filters, 0); - - // We halved the convolution filter values so -1 from the right shift. - return vshrn_n_s32(sum, ROUND0_BITS - 1); -} - static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl, @@ -1103,7 +1088,7 @@ static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, vshrn_n_s32(sum[1], ROUND0_BITS - 1)); } -static INLINE void convolve_2d_sr_horiz_neon_i8mm( +static INLINE void convolve_2d_sr_horiz_8tap_neon_i8mm( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { const int bd = 8; @@ -1118,85 +1103,173 @@ static INLINE void convolve_2d_sr_horiz_neon_i8mm( int dst_stride = im_stride; int height = im_h; - if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - src_ptr += 2; + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + +static INLINE int16x4_t convolve4_4_2d_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); - int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filters, 0); - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; +static INLINE int16x8_t convolve4_8_2d_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + int32x4_t sum0123 = + vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0); + int32x4_t sum4567 = + vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0); + + // Narrow and re-pack. + // We halved the filter values so -1 from right shift. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_2d_sr_horiz_4tap_neon_i8mm( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int width, + int height, const int16_t *filter_x) { + const int bd = 8; + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. + const int32x4_t horiz_const = vdupq_n_s32( + (((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2)); + + if (width == 4) { + const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const); + int16x4_t d1 = convolve4_4_2d_h(s1, filter, perm_tbl, horiz_const); + int16x4_t d2 = convolve4_4_2d_h(s2, filter, perm_tbl, horiz_const); + int16x4_t d3 = convolve4_4_2d_h(s3, filter, perm_tbl, horiz_const); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + src += 4 * src_stride; + dst += 4 * dst_stride; height -= 4; } while (height > 4); do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1_s16(dst_ptr, d0); + uint8x16_t s0 = vld1q_u8(src); + int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const); + vst1_s16(dst, d0); - src_ptr += src_stride; - dst_ptr += dst_stride; + src += src_stride; + dst += dst_stride; } while (--height != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + int w = width; + const uint8_t *s = src; + int16_t *d = dst; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const); + int16x8_t d1 = convolve4_8_2d_h(s1, filter, perm_tbl, horiz_const); + int16x8_t d2 = convolve4_8_2d_h(s2, filter, perm_tbl, horiz_const); + int16x8_t d3 = convolve4_8_2d_h(s3, filter, perm_tbl, horiz_const); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; - width -= 8; - } while (width != 0); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; height -= 4; } while (height > 4); do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + const uint8_t *s = src; + int16_t *d = dst; + int w = width; do { uint8x16_t s0 = vld1q_u8(s); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const); vst1q_s16(d, d0); s += 8; d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; + w -= 8; + } while (w != 0); + src += src_stride; + dst += dst_stride; } while (--height != 0); } } @@ -1215,6 +1288,7 @@ void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; @@ -1246,8 +1320,13 @@ void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - convolve_2d_sr_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride, w, - im_h, x_filter_ptr); + if (x_filter_taps <= 4) { + convolve_2d_sr_horiz_4tap_neon_i8mm(src_ptr + 2, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } else { + convolve_2d_sr_horiz_8tap_neon_i8mm(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } const int16x8_t y_filter = vld1q_s16(y_filter_ptr); -- GitLab From 279722d6fef0a3117d8c4ce0804667c63aabc2ad Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Wed, 8 May 2024 17:22:20 +0100 Subject: [PATCH 127/391] Add 4-tap path for av1_convolve_2d_vert_sr_neon Add a 4-tap Neon implementation for the vertical pass of av1_convolve_2d_sr and use it for the neon, neon_dotprod and neon_i8mm variants of the function. This gives up to 30% uplift over using the 6-tap implementation. 
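A scalar sketch of the new vertical pass for reference (AV1 constants FILTER_BITS = 7, ROUND0_BITS = 3, bd = 8; the helper name is illustrative): the intermediate rows carry a bd-dependent bias from the horizontal pass, which is removed after the second rounding shift before saturating to 8 bits.

    #include <stdint.h>

    #define FILTER_BITS 7
    #define ROUND0_BITS 3

    static uint8_t clip_u8(int32_t v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // 4-tap MAC over the int16 intermediate buffer, rounding shift by
    // (2 * FILTER_BITS - ROUND0_BITS), then removal of the 1 << (bd - 1)
    // offset, mirroring the vqrshrn/vsub/vqmovun sequence in the Neon code.
    static uint8_t vert_4tap_px(const int16_t *s, int stride,
                                const int16_t *taps) {
      int32_t sum = 0;
      for (int k = 0; k < 4; ++k) sum += (int32_t)taps[k] * s[k * stride];
      const int shift = 2 * FILTER_BITS - ROUND0_BITS;
      const int32_t res = ((sum + (1 << (shift - 1))) >> shift) - (1 << 7);
      return clip_u8(res);
    }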
Change-Id: Ia61667cd54a79c352433fd190c9a1f94872c1efe --- av1/common/arm/convolve_neon.c | 7 +- av1/common/arm/convolve_neon.h | 108 +++++++++++++++++++++++++ av1/common/arm/convolve_neon_dotprod.c | 7 +- av1/common/arm/convolve_neon_i8mm.c | 7 +- 4 files changed, 123 insertions(+), 6 deletions(-) diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c index 72a85893e8..70cf23be06 100644 --- a/av1/common/arm/convolve_neon.c +++ b/av1/common/arm/convolve_neon.c @@ -1588,7 +1588,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); - const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; @@ -1628,7 +1628,10 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - if (clamped_y_taps <= 6) { + if (clamped_y_taps <= 4) { + convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_ptr); + } else if (clamped_y_taps == 6) { convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } else { diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h index 9fbf8aa12f..5a9f8b6d39 100644 --- a/av1/common/arm/convolve_neon.h +++ b/av1/common/arm/convolve_neon.h @@ -535,4 +535,112 @@ static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr, } } +static INLINE int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t y_filter) { + int32x4_t sum = vmull_lane_s16(s0, y_filter, 0); + sum = vmlal_lane_s16(sum, s1, y_filter, 1); + sum = vmlal_lane_s16(sum, s2, y_filter, 2); + sum = vmlal_lane_s16(sum, s3, y_filter, 3); + + return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); +} + +static INLINE uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t y_filter, + const int16x8_t sub_const) { + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter, 3); + + int16x8_t res = + vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); + res = vsubq_s16(res, sub_const); + + return vqmovun_s16(res); +} + +static INLINE void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *y_filter) { + const int bd = 8; + const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); + + const int16x4_t filter = vld1_s16(y_filter + 2); + + if (w == 4) { + int16x4_t s0, s1, s2; + load_s16_4x3(src_ptr, src_stride, &s0, &s1, &s2); + src_ptr += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(src_ptr, src_stride, &s3, &s4, &s5, &s6); + + int16x4_t d0 = 
convolve4_4_2d_v(s0, s1, s2, s3, filter); + int16x4_t d1 = convolve4_4_2d_v(s1, s2, s3, s4, filter); + int16x4_t d2 = convolve4_4_2d_v(s2, s3, s4, s5, filter); + int16x4_t d3 = convolve4_4_2d_v(s3, s4, s5, s6, filter); + + uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + // Width is a multiple of 8 and height is a multiple of 4. + do { + int height = h; + int16_t *s = src_ptr; + uint8_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint8x8_t d0 = convolve4_8_2d_v(s0, s1, s2, s3, filter, sub_const); + uint8x8_t d1 = convolve4_8_2d_v(s1, s2, s3, s4, filter, sub_const); + uint8x8_t d2 = convolve4_8_2d_v(s2, s3, s4, s5, filter, sub_const); + uint8x8_t d3 = convolve4_8_2d_v(s3, s4, s5, s6, filter, sub_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + #endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index 3c85f3cb4b..b558744731 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c @@ -1419,7 +1419,7 @@ void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); - const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; @@ -1460,7 +1460,10 @@ void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - if (clamped_y_taps <= 6) { + if (clamped_y_taps <= 4) { + convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_ptr); + } else if (clamped_y_taps == 6) { convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } else { diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index b6a2a41ba0..b2f489f0d4 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -1289,7 +1289,7 @@ void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); - const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int clamped_y_taps = y_filter_taps < 4 ? 
4 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; @@ -1330,7 +1330,10 @@ void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - if (clamped_y_taps <= 6) { + if (clamped_y_taps <= 4) { + convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_ptr); + } else if (clamped_y_taps == 6) { convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } else { -- GitLab From 407b9866d40a58da32ed8de4f9102fe93eef6ee2 Mon Sep 17 00:00:00 2001 From: Cheng Chen <chengchen@google.com> Date: Wed, 8 May 2024 15:25:17 -0700 Subject: [PATCH 128/391] Add a speed feature for screen content type in rtc This speed feature filters blocks so that fewer blocks go through the palette search path. Disabling this feature leads to better coding gain but higher key frame encoding time. For speed 10 and 11, the feature is turned on for faster key frame encoding. For the rtc_screen test set, disabling the feature gives the following coding performance (psnr/ssim/vmaf/encoding_time) - Speed 10: -1.334%/-2.233%/-1.420%/-0.261%; Speed 11: -2.610%/-4.344%/-2.601%/0.207%. Change-Id: I0d2a729c806f7b2780c749fa6df169fbeb2758b8 --- av1/encoder/nonrd_pickmode.c | 11 ++++++++--- av1/encoder/speed_features.c | 2 ++ av1/encoder/speed_features.h | 6 ++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index ed64056e40..d08e9c0ac9 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -23,6 +23,7 @@ #include "av1/encoder/model_rd.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/nonrd_opt.h" +#include "av1/encoder/palette.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/var_based_part.h" @@ -1655,11 +1656,15 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, // Try palette if it's enabled.
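+  // Palette eligibility (tool flag and screen-content allowance) is decided
+  // first; the SAD/size/variance pruning now applies only behind the
+  // prune_screen_palette_search speed feature below.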
bool try_palette = - (!args.prune_mode_based_on_sad || best_sad_norm > thresh_sad) && - cpi->oxcf.tool_cfg.enable_palette && bsize <= BLOCK_16X16 && - x->source_variance > 200 && + cpi->oxcf.tool_cfg.enable_palette && av1_allow_palette(cpi->common.features.allow_screen_content_tools, mi->bsize); + if (cpi->sf.rt_sf.prune_screen_palette_search) { + bool prune = + (!args.prune_mode_based_on_sad || best_sad_norm > thresh_sad) && + bsize <= BLOCK_16X16 && x->source_variance > 200; + try_palette &= prune; + } if (try_palette) { const TxfmSearchInfo *txfm_info = &x->txfm_search_info; const unsigned int intra_ref_frame_cost = 0; diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index 671986600b..6fd01d8fc1 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -1579,6 +1579,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->rt_sf.nonrd_aggressive_skip = 1; sf->rt_sf.thresh_active_maps_skip_lf_cdef = 90; sf->rt_sf.hybrid_intra_pickmode = 0; + sf->rt_sf.prune_screen_palette_search = true; sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true; } if (speed >= 11) { @@ -2253,6 +2254,7 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { rt_sf->use_nonrd_filter_search = 0; rt_sf->use_simple_rd_model = 0; rt_sf->hybrid_intra_pickmode = 0; + rt_sf->prune_screen_palette_search = false; rt_sf->source_metrics_sb_nonrd = 0; rt_sf->overshoot_detection_cbr = NO_DETECTION; rt_sf->check_scene_detection = 0; diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h index d6b2949277..300d7e190e 100644 --- a/av1/encoder/speed_features.h +++ b/av1/encoder/speed_features.h @@ -1629,6 +1629,12 @@ typedef struct REAL_TIME_SPEED_FEATURES { // 2 : use rd for bsize < 16x16 and src var >= 101, nonrd otherwise int hybrid_intra_pickmode; + // Filter blocks by criteria such as SAD and source variance, so that + // fewer blocks go through the palette search. + // For screen content types, enabling this feature reduces key frame encoding + // time. Disabling it leads to better compression efficiency. + bool prune_screen_palette_search; + // Compute variance/sse on source difference, prior to encoding superblock. int source_metrics_sb_nonrd; -- GitLab From 06f3b3bae25956313f5dd51cedf39b2e04713264 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Thu, 9 May 2024 22:26:27 -0700 Subject: [PATCH 129/391] rtc: Adjust color_thresh_palette for nonrd keyframe For keyframes with hybrid_intra_pickmode = 0: lower color_thresh_palette to exit palette search early when it is likely not the best mode. bdrate on rtc_screen for speed 11, all-intra with CQ mode: avg_psnr/ovr_psnr/ssim: -6.3, -5.3, -7.1. Average instruction count reduction is ~17%. For the default IPPP CBR mode: avg_psnr/ovr_psnr/ssim: 0.153, 0.619, 0.427. Some loss on the screen_recording_crd clip, which will be looked into. Change-Id: I2fd8acb674d09c34629f08f6e593ec97a807d9d4 --- av1/encoder/block.h | 3 +++ av1/encoder/encodeframe.c | 1 + av1/encoder/nonrd_pickmode.c | 2 ++ av1/encoder/palette.c | 2 +- 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/av1/encoder/block.h b/av1/encoder/block.h index 1baf3f942e..9bee0b8d02 100644 --- a/av1/encoder/block.h +++ b/av1/encoder/block.h @@ -1325,6 +1325,9 @@ typedef struct macroblock { //! Coding block distortion value for uv/color, minimum over the inter modes. int64_t min_dist_inter_uv; + //! Threshold on the number of colors for testing palette mode.
+ int color_palette_thresh; + //! The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane // so we can keep dqcoeff of the best tx_type. tran_low_t *dqcoeff_buf; diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index 07382eb6cc..4c178b18c0 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -1211,6 +1211,7 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, x->sb_me_partition = 0; x->sb_me_mv.as_int = 0; x->sb_force_fixed_part = 1; + x->color_palette_thresh = 64; if (cpi->oxcf.mode == ALLINTRA) { x->intra_sb_rdmult_modifier = 128; diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index d08e9c0ac9..7b204c78a7 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -1668,6 +1668,8 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, if (try_palette) { const TxfmSearchInfo *txfm_info = &x->txfm_search_info; const unsigned int intra_ref_frame_cost = 0; + x->color_palette_thresh = (best_sad_norm < 500) ? 32 : 64; + // Search palette mode for Luma plane in intra frame. av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx, &this_rdc, best_rdc.rdcost); diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c index 45b56199c6..6ae1c6cf63 100644 --- a/av1/encoder/palette.c +++ b/av1/encoder/palette.c @@ -564,7 +564,7 @@ void av1_rd_pick_palette_intra_sby( } uint8_t *const color_map = xd->plane[0].color_index_map; - int color_thresh_palette = 64; + int color_thresh_palette = x->color_palette_thresh; // Allow for larger color_threshold for palette search, based on color, // scene_change, and block source variance. // Since palette is Y based, only allow larger threshold if block -- GitLab From 24e42c92e4aadc216cd0b366356e4d0e85771d6b Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Mon, 13 May 2024 11:53:23 +0100 Subject: [PATCH 130/391] Remove unit tests for no-op filter in convolutions Unit tests loop across the whole list of filters for each filter type, which includes the no-op filter when sub_x or sub_y = 0. In practice, however, the convolution functions are never called for such filters (convolve_copy will be used instead), so remove these test cases. The code accommodating this special no-op filter in the Neon paths will be removed in a subsequent patch. Change-Id: Ib363f15b35b1bdec796e4e8f490b9fae45328f54 --- test/av1_convolve_test.cc | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 26a4b5e5d5..382a2fea63 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -325,7 +325,8 @@ typedef void (*convolve_x_func)(const uint8_t *src, int src_stride, class AV1ConvolveXTest : public AV1ConvolveTest<convolve_x_func> { public: void RunTest() { - for (int sub_x = 0; sub_x < 16; ++sub_x) { + // Do not test the no-op filter. + for (int sub_x = 1; sub_x < 16; ++sub_x) { for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL; ++filter) { InterpFilter f = static_cast<InterpFilter>(filter); @@ -530,7 +531,8 @@ typedef void (*highbd_convolve_x_func)( class AV1ConvolveXHighbdTest : public AV1ConvolveTest<highbd_convolve_x_func> { public: void RunTest() { - for (int sub_x = 0; sub_x < 16; ++sub_x) { + // Do not test the no-op filter. 
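+    // (sub_x == 0 would select the identity filter, which the codec routes
+    // through convolve_copy instead, so it is never exercised here.)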
+ for (int sub_x = 1; sub_x < 16; ++sub_x) { for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL; ++filter) { InterpFilter f = static_cast<InterpFilter>(filter); @@ -737,7 +739,8 @@ typedef void (*convolve_y_func)(const uint8_t *src, int src_stride, class AV1ConvolveYTest : public AV1ConvolveTest<convolve_y_func> { public: void RunTest() { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL; ++filter) { InterpFilter f = static_cast<InterpFilter>(filter); @@ -923,7 +926,8 @@ typedef void (*highbd_convolve_y_func)( class AV1ConvolveYHighbdTest : public AV1ConvolveTest<highbd_convolve_y_func> { public: void RunTest() { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL; ++filter) { InterpFilter f = static_cast<InterpFilter>(filter); @@ -1203,8 +1207,9 @@ typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride, class AV1Convolve2DTest : public AV1ConvolveTest<convolve_2d_func> { public: void RunTest() { - for (int sub_x = 0; sub_x < 16; ++sub_x) { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. + for (int sub_x = 1; sub_x < 16; ++sub_x) { + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) { for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) { if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) || @@ -1429,8 +1434,9 @@ class AV1Convolve2DHighbdTest : public AV1ConvolveTest<highbd_convolve_2d_func> { public: void RunTest() { - for (int sub_x = 0; sub_x < 16; ++sub_x) { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. + for (int sub_x = 1; sub_x < 16; ++sub_x) { + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) { for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) { if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) || @@ -1781,7 +1787,8 @@ class AV1ConvolveXCompoundTest : public AV1ConvolveTest<convolve_x_func> { public: void RunTest() { auto compound_params = GetCompoundParams(); - for (int sub_pix = 0; sub_pix < 16; ++sub_pix) { + // Do not test the no-op filter. + for (int sub_pix = 1; sub_pix < 16; ++sub_pix) { for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) { for (const auto &c : compound_params) { TestConvolve(sub_pix, static_cast<InterpFilter>(f), c); @@ -1883,7 +1890,8 @@ class AV1ConvolveXHighbdCompoundTest public: void RunTest() { auto compound_params = GetCompoundParams(); - for (int sub_pix = 0; sub_pix < 16; ++sub_pix) { + // Do not test the no-op filter. + for (int sub_pix = 1; sub_pix < 16; ++sub_pix) { for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) { for (const auto &c : compound_params) { TestConvolve(sub_pix, static_cast<InterpFilter>(f), c); @@ -2282,8 +2290,9 @@ class AV1Convolve2DCompoundTest : public AV1ConvolveTest<convolve_2d_func> { auto compound_params = GetCompoundParams(); for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) { for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) { - for (int sub_x = 0; sub_x < 16; ++sub_x) { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. 
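+        // A sub_x or sub_y of 0 would make the corresponding pass a no-op,
+        // a case the encoder never exercises for these functions.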
+ for (int sub_x = 1; sub_x < 16; ++sub_x) { + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (const auto &compound : compound_params) { TestConvolve(static_cast<InterpFilter>(h_f), static_cast<InterpFilter>(v_f), sub_x, sub_y, @@ -2388,8 +2397,9 @@ class AV1Convolve2DHighbdCompoundTest auto compound_params = GetCompoundParams(); for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) { for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) { - for (int sub_x = 0; sub_x < 16; ++sub_x) { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. + for (int sub_x = 1; sub_x < 16; ++sub_x) { + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (const auto &compound : compound_params) { TestConvolve(static_cast<InterpFilter>(h_f), static_cast<InterpFilter>(v_f), sub_x, sub_y, -- GitLab From b21d9b0ed94b204d9cb55c9a0cf465b8a43d0d70 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 9 May 2024 10:06:29 +0100 Subject: [PATCH 131/391] Remove no longer needed special case in Neon convolutions The neon_dotprod and neon_i8mm implementations of some convolution functions have a special case to handle the no-op filter, as 128 does not fit in a signed 8-bit integer. This no-op filter is never used in practice - the unit tests have been updated to reflect this in a previous patch - so remove the code to handle the no-op filter. Change-Id: Ifd43a4f752a6cb9de8bbfbe7b4577764a4208d7e --- av1/common/arm/convolve_neon_dotprod.c | 642 ++++++++----------- av1/common/arm/convolve_neon_i8mm.c | 845 +++++++++++-------------- 2 files changed, 645 insertions(+), 842 deletions(-) diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index b558744731..9c50890999 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c @@ -105,6 +105,9 @@ static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], static INLINE void convolve_x_sr_12tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { + // The no-op filter should never be used here. + assert(x_filter_ptr[5] != 128); + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); @@ -120,89 +123,60 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(filter_0_7, 5) == 128) { - // Undo the horizontal offset in the calling function. 
- src += 5; + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve12_4_x(s0, filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve12_4_x(s1, filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve12_4_x(s2, filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve12_4_x(s3, filter, correction, range_limit, permute_tbl); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = + convolve12_8_x(s0, filter, correction, range_limit, permute_tbl); + uint8x8_t d1 = + convolve12_8_x(s1, filter, correction, range_limit, permute_tbl); + uint8x8_t d2 = + convolve12_8_x(s2, filter, correction, range_limit, permute_tbl); + uint8x8_t d3 = + convolve12_8_x(s3, filter, correction, range_limit, permute_tbl); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); - src += src_stride; - dst += dst_stride; - } while (--h != 0); - } else { - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = - convolve12_4_x(s0, filter, correction, range_limit, permute_tbl); - int16x4_t d1 = - convolve12_4_x(s1, filter, correction, range_limit, permute_tbl); - int16x4_t d2 = - convolve12_4_x(s2, filter, correction, range_limit, permute_tbl); - int16x4_t d3 = - convolve12_4_x(s3, filter, correction, range_limit, permute_tbl); - - uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); - uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - - store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - - dst += 4 * dst_stride; - src += 4 * src_stride; - h -= 4; - } while (h != 0); - } else { - do { - const uint8_t *s = src; - uint8_t *d = dst; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - uint8x8_t d0 = - convolve12_8_x(s0, filter, correction, range_limit, permute_tbl); - uint8x8_t d1 = - convolve12_8_x(s1, filter, correction, range_limit, permute_tbl); - uint8x8_t d2 = - convolve12_8_x(s2, filter, correction, range_limit, permute_tbl); - uint8x8_t d3 = - convolve12_8_x(s3, filter, correction, range_limit, permute_tbl); - - store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } } @@ -520,44 +494,100 @@ static INLINE uint8x8_t convolve12_8_y( static INLINE void convolve_y_sr_12tap_neon_dotprod( const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, 
const int16_t *y_filter_ptr) { - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (y_filter_ptr[5] == 128) { - // Undo the vertical offset in the calling function. - src_ptr += 5 * src_stride; + // The no-op filter should never be used here. + assert(y_filter_ptr[5] != 128); + + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; + load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, + &t8, &t9, &tA); + src_ptr += 11 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); + + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_4x4(s4, s5, s6, s7, &s4567); + transpose_concat_4x4(s5, s6, s7, s8, &s5678); + transpose_concat_4x4(s6, s7, s8, s9, &s6789); + transpose_concat_4x4(s7, s8, s9, sA, &s789A); do { - const uint8_t *s = src_ptr; - uint8_t *d = dst_ptr; - int width = w; + uint8x8_t tB, tC, tD, tE; + load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE); - do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } + int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); + int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); + int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); + int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); - const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); + int8x16_t s89AB, s9ABC, sABCD, sBCDE; + transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + + // Merge new data into block from previous iteration. 
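+      // A tbl lookup across { s789A, sBCDE } yields the shifted windows
+      // s89AB, s9ABC and sABCD without reloading or re-transposing rows
+      // 8, 9 and A.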
+ int8x16x2_t samples_LUT = { { s789A, sBCDE } }; + s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = + convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); + int16x4_t d1 = + convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); + int16x4_t d2 = + convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); + int16x4_t d3 = + convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s789A; + s4567 = s89AB; + s5678 = s9ABC; + s6789 = sABCD; + s789A = sBCDE; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; - if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; - load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, - &t8, &t9, &tA); - src_ptr += 11 * src_stride; + load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, + &t9, &tA); + s += 11 * src_stride; // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); @@ -572,169 +602,87 @@ static INLINE void convolve_y_sr_12tap_neon_dotprod( int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); - transpose_concat_4x4(s4, s5, s6, s7, &s4567); - transpose_concat_4x4(s5, s6, s7, s8, &s5678); - transpose_concat_4x4(s6, s7, s8, s9, &s6789); - transpose_concat_4x4(s7, s8, s9, sA, &s789A); + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. 
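+      // After this, each 32-bit lane holds one column from four consecutive
+      // rows, so a single SDOT instruction applies four vertical filter taps.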
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s789A_lo, s789A_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { uint8x8_t tB, tC, tD, tE; - load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE); + load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); - int8x16_t s89AB, s9ABC, sABCD, sBCDE; - transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); // Merge new data into block from previous iteration. - int8x16x2_t samples_LUT = { { s789A, sBCDE } }; - s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - int16x4_t d0 = - convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); - int16x4_t d1 = - convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); - int16x4_t d2 = - convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); - int16x4_t d3 = - convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); + + int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); + uint8x8_t d1 = + convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); + uint8x8_t d2 = + convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); + uint8x8_t d3 = + convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. 
- s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s789A; - s4567 = s89AB; - s5678 = s9ABC; - s6789 = sABCD; - s789A = sBCDE; - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - do { - int height = h; - const uint8_t *s = src_ptr; - uint8_t *d = dst_ptr; - - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; - load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, - &t9, &tA); - s += 11 * src_stride; - - // Transform sample range to [-128, 127] for 8-bit signed dot product. - int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); - int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); - int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); - int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); - int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); - int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); - int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); - int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); - int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); - int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); - int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); - - // This operation combines a conventional transpose and the sample - // permute (see horizontal case) required before computing the dot - // product. - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, - s6789_lo, s6789_hi, s789A_lo, s789A_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); - transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); - - do { - uint8x8_t tB, tC, tD, tE; - load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); - - int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); - int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); - int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); - int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); - - int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, - sBCDE_lo, sBCDE_hi; - transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); - - // Merge new data into block from previous iteration. 
- int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; - s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); - s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); - sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); - - int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; - s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); - s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); - sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); - - uint8x8_t d0 = - convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, - s89AB_hi, filter_0_7, filter_4_11); - uint8x8_t d1 = - convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, - s9ABC_hi, filter_0_7, filter_4_11); - uint8x8_t d2 = - convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, - sABCD_hi, filter_0_7, filter_4_11); - uint8x8_t d3 = - convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, - sBCDE_hi, filter_0_7, filter_4_11); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - // Prepare block for next iteration - re-using as much as possible. - // Shuffle everything up four rows. - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s789A_lo; - s3456_hi = s789A_hi; - s4567_lo = s89AB_lo; - s4567_hi = s89AB_hi; - s5678_lo = s9ABC_lo; - s5678_hi = s9ABC_hi; - s6789_lo = sABCD_lo; - s6789_hi = sABCD_hi; - s789A_lo = sBCDE_lo; - s789A_hi = sBCDE_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src_ptr += 8; - dst_ptr += 8; - w -= 8; - } while (w != 0); - } + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); } } @@ -1026,139 +974,109 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11) { + // The no-op filter should never be used here. + assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); + const int bd = 8; - // Special case the following no-op filter as 128 won't fit into the 8-bit - // signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { - const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); - // Undo the horizontal offset in the calling function. - src_ptr += 5; + // Narrow filter values to 8-bit. + const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Dot product constants. 
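+  // The input samples are shifted to [-128, 127] before the dot product and
+  // the filter taps sum to 128, so adding 128 << FILTER_BITS restores the
+  // unsigned result: sum(f[i] * (s[i] - 128)) + (128 << FILTER_BITS)
+  // == sum(f[i] * s[i]).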
+ const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const); + const uint8x16_t range_limit = vdupq_n_u8(128); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + if (w <= 4) { do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - do { - uint8x8_t s0 = vld1_u8(s); - uint16x8_t d0 = vaddw_u8(horiz_const, s0); - d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); - // Store 8 elements to avoid additional branches. This is safe if the - // actual block width is < 8 because the intermediate buffer is large - // enough to accommodate 128x128 blocks. - vst1q_s16(d, vreinterpretq_s16_u16(d0)); + int16x4_t d0 = + convolve12_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve12_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve12_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve12_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = + convolve12_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + vst1_s16(dst_ptr, d0); - d += 8; - s += 8; - width -= 8; - } while (width > 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); } else { - // Narrow filter values to 8-bit. - const int16x8x2_t x_filter_s16 = { - { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } - }; - const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), - vmovn_s16(x_filter_s16.val[1])); - - // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - const int32_t horiz_const = - ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - // Dot product constants. 
- const int32x4_t correction = - vdupq_n_s32((128 << FILTER_BITS) + horiz_const); - const uint8x16_t range_limit = vdupq_n_u8(128); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - - if (w <= 4) { + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, range_limit, permute_tbl); - int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, range_limit, + int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, range_limit, permute_tbl); - int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, range_limit, + int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, range_limit, permute_tbl); - int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, range_limit, + int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, range_limit, permute_tbl); - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); - vst1_s16(dst_ptr, d0); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; - } else { do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, - range_limit, permute_tbl); - int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, - range_limit, permute_tbl); - int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, - range_limit, permute_tbl); - int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, - range_limit, permute_tbl); - - store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + vst1q_s16(d, d0); - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2]; - s0[0] = vld1q_u8(s); - s0[1] = vld1q_u8(s + 4); - int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, - range_limit, permute_tbl); - vst1q_s16(d, d0); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); } } diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index b2f489f0d4..7ba8b6664e 100644 --- 
a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -95,94 +95,68 @@ static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { + // The no-op filter should never be used here. + assert(x_filter_ptr[5] != 128); + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); const int8x16_t filter = vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(filter_0_7, 5) == 128) { - // Undo the horizontal offset in the calling function. - src += 5; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. + const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); - src += src_stride; - dst += dst_stride; - } while (--h != 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding - // right shift by FILTER_BITS - instead of a first rounding right shift by - // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. 
- const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); - - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); - - uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); - uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - - store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - - dst += 4 * dst_stride; - src += 4 * src_stride; - h -= 4; - } while (h != 0); - } else { - do { - const uint8_t *s = src; - uint8_t *d = dst; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); - uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); - uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); - uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); - - store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } } @@ -470,187 +444,161 @@ static INLINE void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (y_filter_ptr[5] == 128) { - // Undo the vertical offset in the calling function. - src_ptr += 5 * src_stride; + // The no-op filter should never be used here. + assert(y_filter_ptr[5] != 128); + + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, + &s8, &s9, &sA); + src_ptr += 11 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_4x4(s4, s5, s6, s7, &s4567); + transpose_concat_4x4(s5, s6, s7, s8, &s5678); + transpose_concat_4x4(s6, s7, s8, s9, &s6789); + transpose_concat_4x4(s7, s8, s9, sA, &s789A); do { + uint8x8_t sB, sC, sD, sE; + load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE); + + uint8x16_t s89AB, s9ABC, sABCD, sBCDE; + transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + + // Merge new data into block from previous iteration. 
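+      // merge_block_tbl indexes into the pair { s789A, sBCDE } to slide the
+      // four-row window forward by one, two and three rows.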
+ uint8x16x2_t samples_LUT = { { s789A, sBCDE } }; + s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = + convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); + int16x4_t d1 = + convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); + int16x4_t d2 = + convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); + int16x4_t d3 = + convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s789A; + s4567 = s89AB; + s5678 = s9ABC; + s6789 = sABCD; + s789A = sBCDE; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; - int width = w; - - do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); - const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); - - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); - if (w == 4) { uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; - load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, - &s8, &s9, &sA); - src_ptr += 11 * src_stride; - - // This operation combines a conventional transpose and the sample permute - // (see horizontal case) required before computing the dot product. - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); - transpose_concat_4x4(s4, s5, s6, s7, &s4567); - transpose_concat_4x4(s5, s6, s7, s8, &s5678); - transpose_concat_4x4(s6, s7, s8, s9, &s6789); - transpose_concat_4x4(s7, s8, s9, sA, &s789A); + load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. 
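+      // The lo and hi halves hold the two four-column groups of the 8-wide
+      // block, keeping each dot product within a single 128-bit vector.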
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s789A_lo, s789A_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { uint8x8_t sB, sC, sD, sE; - load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE); + load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE); - uint8x16_t s89AB, s9ABC, sABCD, sBCDE; - transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); // Merge new data into block from previous iteration. - uint8x16x2_t samples_LUT = { { s789A, sBCDE } }; - s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - int16x4_t d0 = - convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); - int16x4_t d1 = - convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); - int16x4_t d2 = - convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); - int16x4_t d3 = - convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); + + uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); + uint8x8_t d1 = + convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); + uint8x8_t d2 = + convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); + uint8x8_t d3 = + convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. 
- s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s789A; - s4567 = s89AB; - s5678 = s9ABC; - s6789 = sABCD; - s789A = sBCDE; - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - do { - int height = h; - const uint8_t *s = src_ptr; - uint8_t *d = dst_ptr; - - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; - load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, - &s9, &sA); - s += 11 * src_stride; - - // This operation combines a conventional transpose and the sample - // permute (see horizontal case) required before computing the dot - // product. - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, - s6789_lo, s6789_hi, s789A_lo, s789A_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); - transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); - - do { - uint8x8_t sB, sC, sD, sE; - load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE); - - uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, - sBCDE_lo, sBCDE_hi; - transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); - - // Merge new data into block from previous iteration. - uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; - s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); - s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); - sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); - - uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; - s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); - s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); - sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); - - uint8x8_t d0 = - convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, - s89AB_hi, filter_0_7, filter_4_11); - uint8x8_t d1 = - convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, - s9ABC_hi, filter_0_7, filter_4_11); - uint8x8_t d2 = - convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, - sABCD_hi, filter_0_7, filter_4_11); - uint8x8_t d3 = - convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, - sBCDE_hi, filter_0_7, filter_4_11); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - // Prepare block for next iteration - re-using as much as possible. - // Shuffle everything up four rows. 
- s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s789A_lo; - s3456_hi = s789A_hi; - s4567_lo = s89AB_lo; - s4567_hi = s89AB_hi; - s5678_lo = s9ABC_lo; - s5678_hi = s9ABC_hi; - s6789_lo = sABCD_lo; - s6789_hi = sABCD_hi; - s789A_lo = sBCDE_lo; - s789A_hi = sBCDE_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src_ptr += 8; - dst_ptr += 8; - w -= 8; - } while (w != 0); - } + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); } } @@ -684,155 +632,126 @@ static INLINE void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (y_filter_ptr[5] == 128) { - // Undo the vertical offset in the calling function. - src_ptr += 5 * src_stride; + const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); - do { - const uint8_t *s = src_ptr; - uint8_t *d = dst_ptr; - int width = w; + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); - do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + do { + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + uint8x16_t s4567, s5678, s6789, s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); + + // Merge new data into block from previous iteration. 
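+      // Rows 4-6 are already held in s3456, so only rows 7-10 are loaded
+      // each iteration; the table lookup rebuilds s4567, s5678 and s6789.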
+ uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; - if (w == 4) { uint8x8_t s0, s1, s2, s3, s4, s5, s6; - load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src_ptr += 7 * src_stride; - - // This operation combines a conventional transpose and the sample permute - // (see horizontal case) required before computing the dot product. - uint8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t s7, s8, s9, s10; - load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - uint8x16_t s4567, s5678, s6789, s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910); + uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. 
- uint8x16x2_t samples_LUT = { { s3456, s78910 } }; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } }; + s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); + + uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } }; + s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter); - int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); - int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); - int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); - int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - do { - int height = h; - const uint8_t *s = src_ptr; - uint8_t *d = dst_ptr; - - uint8x8_t s0, s1, s2, s3, s4, s5, s6; - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - // This operation combines a conventional transpose and the sample - // permute (see horizontal case) required before computing the dot - // product. - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); - - do { - uint8x8_t s7, s8, s9, s10; - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, - s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - - // Merge new data into block from previous iteration. 
- uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } }; - s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); - - uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } }; - s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); - - uint8x8_t d0 = - convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter); - uint8x8_t d1 = - convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); - uint8x8_t d2 = - convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); - uint8x8_t d3 = - convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - // Prepare block for next iteration - re-using as much as possible. - // Shuffle everything up four rows. - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src_ptr += 8; - dst_ptr += 8; - w -= 8; - } while (w != 0); - } + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); } } @@ -928,135 +847,101 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11) { + // The no-op filter should never be used here. + assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); + const int bd = 8; - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { - const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); - // Undo the horizontal offset in the calling function. - src_ptr += 5; + // Narrow filter values to 8-bit. + const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32x4_t horiz_const = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + if (w <= 4) { do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - do { - uint8x8_t s0 = vld1_u8(s); - uint16x8_t d0 = vaddw_u8(horiz_const, s0); - d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); - // Store 8 elements to avoid additional branches. This is safe if the - // actual block width is < 8 because the intermediate buffer is large - // enough to accommodate 128x128 blocks. 
- vst1q_s16(d, vreinterpretq_s16_u16(d0)); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1_s16(dst_ptr, d0); - d += 8; - s += 8; - width -= 8; - } while (width > 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); } else { - // Narrow filter values to 8-bit. - const int16x8x2_t x_filter_s16 = { - { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } - }; - const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), - vmovn_s16(x_filter_s16.val[1])); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. - const int32x4_t horiz_const = - vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; - int16x4_t d0 = - convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = - convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = - convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = - convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = + convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = + convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = + convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); + s += 8; + d += 8; + width -= 8; + } while (width != 0); - do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = - convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1_s16(dst_ptr, d0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; - } else { do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x8_t d1 = - convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x8_t d2 = - convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x8_t d3 = - convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); - - 
store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1q_s16(d, d0); - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2]; - s0[0] = vld1q_u8(s); - s0[1] = vld1q_u8(s + 4); - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1q_s16(d, d0); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); } } -- GitLab From bad168356961c7afb87c35b7689296fe36ae168c Mon Sep 17 00:00:00 2001 From: George Steed <george.steed@arm.com> Date: Thu, 9 May 2024 14:52:43 +0000 Subject: [PATCH 132/391] dr_prediction_test.cc: Catch over-reads in zN predictors The existing test setup does not allow us to catch code reading outside the bounds of the input left/above arrays since the inputs are statically allocated as part of a larger object. To get around this, switch to creating a fresh vector of the correct size as part of the test itself, mirroring the same approach previously taken for the destination array. Bug: aomedia:3571 Change-Id: If375c1d00e10ae910c878e70e801b6a49b4f329e --- test/dr_prediction_test.cc | 58 ++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc index c23b08e481..50d5320e8a 100644 --- a/test/dr_prediction_test.cc +++ b/test/dr_prediction_test.cc @@ -29,6 +29,9 @@ namespace { +const int kNumIntraNeighbourPixels = MAX_TX_SIZE * 2 + 32; +const int kIntraPredInputPadding = 16; + const int kZ1Start = 0; const int kZ2Start = 90; const int kZ3Start = 180; @@ -151,8 +154,6 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { protected: static const int kMaxNumTests = 10000; static const int kIterations = 10; - static const int kOffset = 16; - static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16; DrPredTest() : enable_upsample_(0), upsample_above_(0), upsample_left_(0), bw_(0), @@ -160,20 +161,12 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { params_ = this->GetParam(); start_angle_ = params_.start_angle; stop_angle_ = start_angle_ + 90; - - above_ = &above_data_[kOffset]; - left_ = &left_data_[kOffset]; - - for (int i = 0; i < kBufSize; ++i) { - above_data_[i] = rng_.Rand8(); - left_data_[i] = rng_.Rand8(); - } } ~DrPredTest() override = default; - void Predict(bool speedtest, int tx, Pixel *dst_ref, Pixel *dst_tst, - int dst_stride) { + void Predict(bool speedtest, int tx, const Pixel *above, const Pixel *left, + Pixel *dst_ref, Pixel *dst_tst, int dst_stride) { const int kNumTests = speedtest ? 
kMaxNumTests : 1;
     aom_usec_timer timer;
 
     int tst_time = 0;
@@ -182,7 +175,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
 
     aom_usec_timer_start(&timer);
     for (int k = 0; k < kNumTests; ++k) {
-      params_.ref_fn(dst_ref, dst_stride, bw_, bh_, above_, left_,
+      params_.ref_fn(dst_ref, dst_stride, bw_, bh_, above, left,
                      upsample_above_, upsample_left_, dx_, dy_, bd_);
     }
     aom_usec_timer_mark(&timer);
@@ -192,7 +185,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
     aom_usec_timer_start(&timer);
     for (int k = 0; k < kNumTests; ++k) {
       API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst, dst_stride, bw_, bh_,
-                                              above_, left_, upsample_above_,
+                                              above, left, upsample_above_,
                                               upsample_left_, dx_, dy_, bd_));
     }
     aom_usec_timer_mark(&timer);
@@ -211,11 +204,6 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
 
   void RunTest(bool speedtest, bool needsaturation, int p_angle) {
     bd_ = params_.bit_depth;
-    if (needsaturation) {
-      for (int i = 0; i < kBufSize; ++i) {
-        above_data_[i] = left_data_[i] = (1 << bd_) - 1;
-      }
-    }
     for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
       bw_ = tx_size_wide[kTxSize[tx]];
       bh_ = tx_size_high[kTxSize[tx]];
@@ -229,6 +217,28 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
         upsample_above_ = upsample_left_ = 0;
       }
 
+      // Declare input buffers as local arrays to allow checking for
+      // over-reads.
+      DECLARE_ALIGNED(16, Pixel, left_data[kNumIntraNeighbourPixels]);
+      DECLARE_ALIGNED(16, Pixel, above_data[kNumIntraNeighbourPixels]);
+
+      // We need to allow reading some previous bytes from the input pointers.
+      const Pixel *above = &above_data[kIntraPredInputPadding];
+      const Pixel *left = &left_data[kIntraPredInputPadding];
+
+      if (needsaturation) {
+        const Pixel sat = (1 << bd_) - 1;
+        for (int i = 0; i < kNumIntraNeighbourPixels; ++i) {
+          left_data[i] = sat;
+          above_data[i] = sat;
+        }
+      } else {
+        for (int i = 0; i < kNumIntraNeighbourPixels; ++i) {
+          left_data[i] = rng_.Rand8();
+          above_data[i] = rng_.Rand8();
+        }
+      }
+
       // Add additional padding to allow detection of over reads/writes when
       // the transform width is equal to MAX_TX_SIZE.
       const int dst_stride = MAX_TX_SIZE + 16;
@@ -242,7 +252,8 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
                  (dst_stride - bw_) * sizeof(Pixel));
       }
 
-      Predict(speedtest, tx, dst_ref.data(), dst_tst.data(), dst_stride);
+      Predict(speedtest, tx, above, left, dst_ref.data(), dst_tst.data(),
+              dst_stride);
 
       for (int r = 0; r < bh_; ++r) {
         ASAN_UNPOISON_MEMORY_REGION(&dst_ref[r * dst_stride + bw_],
@@ -294,13 +305,6 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
     }
   }
 
-  Pixel left_data_[kBufSize];
-  Pixel dummy_data_[kBufSize];
-  Pixel above_data_[kBufSize];
-
-  Pixel *above_;
-  Pixel *left_;
   int enable_upsample_;
   int upsample_above_;
   int upsample_left_;
-- 
GitLab


From 1ed584b2c6d51adba7a730e10a0dd2fb4ce997de Mon Sep 17 00:00:00 2001
From: Samuthirika S <samuthirika.s@ittiam.com>
Date: Mon, 13 May 2024 17:39:46 +0530
Subject: [PATCH 133/391] Add SSE2 for av1_resize_horz_dir()

This CL adds an SSE2 implementation of the av1_resize_horz_dir()
function, along with a unit test for it.

This is a bit-exact change.
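For reference, the scalar behaviour the SIMD code must match bit-exactly is a
symmetric even-tap 2:1 downsample. A minimal sketch, assuming the 8-tap filter
is stored as four half-filter taps (as with av1_down2_symeven_half_filter) and
assuming the row has already been edge-padded so that input[-3] through
input[length + 2] are valid reads (the padding masks in the diff below emulate
exactly that); the helper name and loop bounds here are illustrative, not the
library's exact implementation:

  // Sketch: 2:1 horizontal downsample with a symmetric even-tap filter.
  static void down2_symeven_sketch(const uint8_t *input, int length,
                                   uint8_t *output) {
    const int16_t *filter = av1_down2_symeven_half_filter;  // 4 half taps
    for (int i = 0; i < length; i += 2) {
      int sum = 1 << (FILTER_BITS - 1);  // rounding term; matches
                                         // round_const_bits in the SSE2 code
      for (int j = 0; j < 4; ++j) {
        // Symmetry: tap j weights input[i - j] and input[i + 1 + j] equally.
        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
      }
      output[i / 2] = clip_pixel(sum >> FILTER_BITS);  // shift matches 'bits'
    }
  }

The SSE2 path below computes eight such outputs per iteration by forming the
symmetric pair sums with _mm_add_epi16 and applying _mm_madd_epi16 against the
prepared coefficients.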
Change-Id: Ia2da5221913743f34519951235bbfa36aa8465e4 --- av1/common/av1_rtcd_defs.pl | 2 +- av1/common/x86/resize_avx2.c | 7 +- av1/common/x86/resize_sse2.c | 166 +++++++++++++++++++++++++++++++++++ test/frame_resize_test.cc | 7 ++ 4 files changed, 177 insertions(+), 5 deletions(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index eca260cce5..8e24bb9c1b 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -558,7 +558,7 @@ add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int o specialize qw/av1_resize_vert_dir sse2 avx2/; add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2"; -specialize qw/av1_resize_horz_dir avx2/; +specialize qw/av1_resize_horz_dir sse2 avx2/; add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c index 38bbc2626d..7c36fca8a4 100644 --- a/av1/common/x86/resize_avx2.c +++ b/av1/common/x86/resize_avx2.c @@ -530,11 +530,10 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filtered_length, int width2) { assert(height % 2 == 0); - // Currently, Invoking C function for width less than 32. Optimize the below, - // by invoking SSE2 once the implementation for the same is available. + // Invoke SSE2 for width less than 32. if (filtered_length < 32) { - av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length, - width2); + av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length, + width2); return; } diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c index f0470a3f3c..1afc962216 100644 --- a/av1/common/x86/resize_sse2.c +++ b/av1/common/x86/resize_sse2.c @@ -164,3 +164,169 @@ bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, return true; } + +// Blends a and b using mask and returns the result. +static INLINE __m128i blend(__m128i a, __m128i b, __m128i mask) { + const __m128i masked_b = _mm_and_si128(mask, b); + const __m128i masked_a = _mm_andnot_si128(mask, a); + return (_mm_or_si128(masked_a, masked_b)); +} + +// Masks used for width 16 pixels, with left and right padding +// requirements. +static const uint8_t left_padding_mask[16] = { + 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const uint8_t right_padding_mask[16] = { 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 255, 255, + 255, 255, 255, 255 }; + +static const uint8_t mask_16[16] = { + 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, +}; + +void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride, + uint8_t *intbuf, int height, int filtered_length, + int width2) { + assert(height % 2 == 0); + // Invoke C for width less than 16. 
+  if (filtered_length < 16) {
+    av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
+                          width2);
+  }
+
+  __m128i coeffs_x[2];
+  const int bits = FILTER_BITS;
+  const int dst_stride = width2;
+  const int remain_col = filtered_length % 16;
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+  const uint8_t max_pixel = 255;
+  const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i start_pad_mask = _mm_loadu_si128((__m128i *)left_padding_mask);
+  const __m128i end_pad_mask = _mm_loadu_si128((__m128i *)right_padding_mask);
+  const __m128i mask_even = _mm_loadu_si128((__m128i *)mask_16);
+  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
+
+  for (int i = 0; i < height; ++i) {
+    int filter_offset = 0;
+    for (int j = 0; j <= filtered_length - 16; j += 16) {
+      const int in_idx = i * in_stride + j - filter_offset;
+      const int out_idx = i * dst_stride + j / 2;
+
+      // a0 a1 a2 a3 .... a15
+      __m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]);
+      // a8 a9 a10 a11 .... a23
+      __m128i row01 =
+          _mm_loadu_si128((__m128i *)&input[in_idx + 5 + filter_offset]);
+      filter_offset = 3;
+
+      // Pad start pixels to the left, while processing the first pixels in the
+      // row.
+      if (j == 0) {
+        const __m128i start_pixel_row0 =
+            _mm_set1_epi8((char)input[i * in_stride]);
+        row00 =
+            blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask);
+      }
+
+      // Pad end pixels to the right, while processing the last pixels in the
+      // row.
+      const int is_last_cols16 = (j == filtered_length - 16);
+      if (is_last_cols16) {
+        const __m128i end_pixel_row0 =
+            _mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]);
+        row01 = blend(row01, end_pixel_row0, end_pad_mask);
+      }
+
+      // a2 a3 a4 a5 a6 a7 a8 a9 .... a17
+      const __m128i row0_1 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 2),
+                                                _mm_srli_si128(row01, 2));
+      // a4 a5 a6 a7 a8 a9 a10 a11 .... a19
+      const __m128i row0_2 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 4),
+                                                _mm_srli_si128(row01, 4));
+      // a6 a7 a8 a9 a10 a11 a12 a13 .... a21
+      const __m128i row0_3 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 6),
+                                                _mm_srli_si128(row01, 6));
+
+      // a0 a2 a4 a6 a8 a10 a12 a14 (each 16 bit)
+      const __m128i s0 = _mm_and_si128(row00, mask_even);
+      // a1 a3 a5 a7 a9 a11 a13 a15
+      const __m128i s1 = _mm_and_si128(_mm_srli_epi16(row00, 8), mask_even);
+      // a2 a4 a6 a8 a10 a12 a14 a16
+      const __m128i s2 = _mm_and_si128(row0_1, mask_even);
+      // a3 a5 a7 a9 a11 a13 a15 a17
+      const __m128i s3 = _mm_and_si128(_mm_srli_epi16(row0_1, 8), mask_even);
+      // a4 a6 a8 a10 a12 a14 a16 a18
+      const __m128i s4 = _mm_and_si128(row0_2, mask_even);
+      // a5 a7 a9 a11 a13 a15 a17 a19
+      const __m128i s5 = _mm_and_si128(_mm_srli_epi16(row0_2, 8), mask_even);
+      // a6 a8 a10 a12 a14 a16 a18 a20
+      const __m128i s6 = _mm_and_si128(row0_3, mask_even);
+      // a7 a9 a11 a13 a15 a17 a19 a21
+      const __m128i s7 = _mm_and_si128(_mm_srli_epi16(row0_3, 8), mask_even);
+
+      // a0a7 a2a9 a4a11 .... a12a19 a14a21
+      const __m128i s07 = _mm_add_epi16(s0, s7);
+      // a1a6 a3a8 a5a10 .... a13a18 a15a20
+      const __m128i s16 = _mm_add_epi16(s1, s6);
+      // a2a5 a4a7 a6a9 .... a14a17 a16a19
+      const __m128i s25 = _mm_add_epi16(s2, s5);
+      // a3a4 a5a6 a7a8 .... a15a16 a17a18
+      const __m128i s34 = _mm_add_epi16(s3, s4);
+
+      // a0a7 a1a6 a2a9 a3a8 a4a11 a5a10 a6a13 a7a12
+      const __m128i s1607_low = _mm_unpacklo_epi16(s07, s16);
+      // a2a5 a3a4 a4a7 a5a6 a6a9 a7a8 a8a11 a9a10
+      const __m128i s3425_low = _mm_unpacklo_epi16(s25, s34);
+
+      // a8a15 a9a14 a10a17 a11a16 a12a19 a13a18 a14a21 a15a20
+      const __m128i s1607_high = _mm_unpackhi_epi16(s07, s16);
+      // a10a13 a11a12 a12a15 a13a14 a14a17 a15a16 a16a19 a17a18
+      const __m128i s3425_high = _mm_unpackhi_epi16(s25, s34);
+
+      const __m128i r01_0 = _mm_madd_epi16(s3425_low, coeffs_x[1]);
+      const __m128i r01_1 = _mm_madd_epi16(s1607_low, coeffs_x[0]);
+      const __m128i r01_2 = _mm_madd_epi16(s3425_high, coeffs_x[1]);
+      const __m128i r01_3 = _mm_madd_epi16(s1607_high, coeffs_x[0]);
+
+      // Result of first 8 pixels of row0 (a0 to a7).
+      // r0_0 r0_1 r0_2 r0_3
+      __m128i r00 = _mm_add_epi32(r01_0, r01_1);
+      r00 = _mm_add_epi32(r00, round_const_bits);
+      r00 = _mm_sra_epi32(r00, round_shift_bits);
+
+      // Result of next 8 pixels of row0 (a8 to a15).
+      // r0_4 r0_5 r0_6 r0_7
+      __m128i r01 = _mm_add_epi32(r01_2, r01_3);
+      r01 = _mm_add_epi32(r01, round_const_bits);
+      r01 = _mm_sra_epi32(r01, round_shift_bits);
+
+      // r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7
+      const __m128i res_16 = _mm_packs_epi32(r00, r01);
+      const __m128i res_8 = _mm_packus_epi16(res_16, res_16);
+      __m128i res = _mm_min_epu8(res_8, clip_pixel);
+      res = _mm_max_epu8(res, zero);
+
+      // r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7
+      _mm_storel_epi64((__m128i *)&intbuf[out_idx], res);
+    }
+
+    int wd_processed = filtered_length - remain_col;
+    // When the remaining width is 2, the above code would not have taken
+    // care of padding required for (filtered_length - 4)th pixel. Hence,
+    // process that pixel again with the C code.
+    wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
+    if (remain_col) {
+      const int in_idx = (in_stride * i);
+      const int out_idx = (wd_processed / 2) + width2 * i;
+
+      down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
+                    wd_processed);
+    }
+  }
+}
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
index 7a4da45973..befdd490b4 100644
--- a/test/frame_resize_test.cc
+++ b/test/frame_resize_test.cc
@@ -245,6 +245,13 @@ TEST_P(AV1ResizeXTest, RunTest) { RunTest(); }
 
 TEST_P(AV1ResizeXTest, DISABLED_SpeedTest) { SpeedTest(); }
 
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1ResizeXTest,
+    ::testing::Combine(::testing::Values(av1_resize_horz_dir_sse2),
+                       ::testing::ValuesIn(kFrameDim)));
+#endif
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1ResizeXTest,
-- 
GitLab


From 0e050334767c53638f0e0862648c7133bc15b626 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Wed, 15 May 2024 03:18:56 +0000
Subject: [PATCH 134/391] rtc: Enable dct_only_palette_nonrd for speed >= 10

This change only affects the keyframe and allows for testing only dct
in palette mode for nonrd pickmode, which gives some speedup with
little loss in quality.
avg/ovr/ssim bdrate loss for CQ all-intra, speed 11 screen:
0.809/0.779/0.135

Instruction count reduction: ~23%

Change-Id: I3a0b720b70a18c4f522ec7cd33a5036847457c5a
---
 av1/encoder/speed_features.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 6fd01d8fc1..73836276ff 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1579,6 +1579,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
       sf->rt_sf.nonrd_aggressive_skip = 1;
       sf->rt_sf.thresh_active_maps_skip_lf_cdef = 90;
       sf->rt_sf.hybrid_intra_pickmode = 0;
+      sf->rt_sf.dct_only_palette_nonrd = 1;
       sf->rt_sf.prune_screen_palette_search = true;
       sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true;
     }
-- 
GitLab


From 8f107273cc641d3d4c260df122a21399b7e46a93 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Wed, 15 May 2024 13:13:33 -0700
Subject: [PATCH 135/391] Add const to oxcf param of av1_check_fpmt_config()

Change-Id: I291e38626f7225fccdb1311325e517ec69b30f6f
---
 av1/encoder/ethread.c | 5 +++--
 av1/encoder/ethread.h | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 1d0092a5ed..356aa03275 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -1124,7 +1124,8 @@ void av1_terminate_workers(AV1_PRIMARY *ppi) {
 
 // This function returns 1 if frame parallel encode is supported for
 // the current configuration. Returns 0 otherwise.
-static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+static AOM_INLINE int is_fpmt_config(const AV1_PRIMARY *ppi,
+                                     const AV1EncoderConfig *oxcf) {
   // FPMT is enabled for AOM_Q and AOM_VBR.
   // TODO(Tarun): Test and enable resize config.
   if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) {
@@ -1162,7 +1163,7 @@ static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
 }
 
 int av1_check_fpmt_config(AV1_PRIMARY *const ppi,
-                          AV1EncoderConfig *const oxcf) {
+                          const AV1EncoderConfig *const oxcf) {
   if (is_fpmt_config(ppi, oxcf)) return 1;
   // Reset frame parallel configuration for unsupported config
   if (ppi->num_fp_contexts > 1) {
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index 468e120776..138811c8a3 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h
@@ -122,7 +122,8 @@ int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers);
 
 int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
 
-int av1_check_fpmt_config(AV1_PRIMARY *const ppi, AV1EncoderConfig *const oxcf);
+int av1_check_fpmt_config(AV1_PRIMARY *const ppi,
+                          const AV1EncoderConfig *const oxcf);
 
 void av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
                                   AV1_COMP_DATA *const first_cpi_data);
-- 
GitLab


From 86d1aeba446f6b3a14c438136d2b1fbd6b1e7ae9 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Wed, 15 May 2024 05:59:35 +0000
Subject: [PATCH 136/391] rtc: Refactor speed features for prune palette

Two speed features shared a similar name and function, so combine
them into one.
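The combined control is an integer pruning ladder rather than two booleans.
Condensed from the hunks below (a sketch of the intended gating, not a
verbatim excerpt):

  // 0: off, 1: less aggressive, 2: more aggressive.
  if (sf->rt_sf.prune_palette_search_nonrd > 0) {
    // Level 1+: gate the palette search on SAD / source-variance / block-size
    // criteria (previously prune_screen_palette_search).
  }
  if (sf->rt_sf.prune_palette_search_nonrd > 1) {
    // Level 2: additionally skip palette for blocks larger than 16x16 and
    // use the larger SAD threshold (previously prune_palette_nonrd).
  }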
Change-Id: Ie36a1e96a4e43358ce50c654363c852b3e14ba6a --- av1/encoder/nonrd_pickmode.c | 8 +++++--- av1/encoder/speed_features.c | 7 +++---- av1/encoder/speed_features.h | 12 ++++++------ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 7b204c78a7..45d81aa9d5 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -1649,7 +1649,8 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, } } - const unsigned int thresh_sad = cpi->sf.rt_sf.prune_palette_nonrd ? 100 : 20; + const unsigned int thresh_sad = + cpi->sf.rt_sf.prune_palette_search_nonrd > 1 ? 100 : 20; const unsigned int best_sad_norm = args.best_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); @@ -1659,7 +1660,7 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, cpi->oxcf.tool_cfg.enable_palette && av1_allow_palette(cpi->common.features.allow_screen_content_tools, mi->bsize); - if (cpi->sf.rt_sf.prune_screen_palette_search) { + if (cpi->sf.rt_sf.prune_palette_search_nonrd > 0) { bool prune = (!args.prune_mode_based_on_sad || best_sad_norm > thresh_sad) && bsize <= BLOCK_16X16 && x->source_variance > 200; @@ -3477,7 +3478,8 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, x->source_variance > 0 && !x->force_zeromv_skip_for_blk && (cpi->rc.high_source_sad || x->source_variance > 300); - if (rt_sf->prune_palette_nonrd && bsize > BLOCK_16X16) try_palette = 0; + if (rt_sf->prune_palette_search_nonrd > 1 && bsize > BLOCK_16X16) + try_palette = 0; // Perform screen content mode evaluation for non-rd handle_screen_content_mode_nonrd( diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index 73836276ff..a788af811b 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -1580,14 +1580,14 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->rt_sf.thresh_active_maps_skip_lf_cdef = 90; sf->rt_sf.hybrid_intra_pickmode = 0; sf->rt_sf.dct_only_palette_nonrd = 1; - sf->rt_sf.prune_screen_palette_search = true; + sf->rt_sf.prune_palette_search_nonrd = 1; sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true; } if (speed >= 11) { sf->rt_sf.skip_lf_screen = 2; sf->rt_sf.skip_cdef_sb = 2; sf->rt_sf.part_early_exit_zeromv = 2; - sf->rt_sf.prune_palette_nonrd = 1; + sf->rt_sf.prune_palette_search_nonrd = 2; sf->rt_sf.increase_color_thresh_palette = 0; sf->rt_sf.prune_h_pred_using_best_mode_so_far = true; sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true; @@ -2255,7 +2255,7 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { rt_sf->use_nonrd_filter_search = 0; rt_sf->use_simple_rd_model = 0; rt_sf->hybrid_intra_pickmode = 0; - rt_sf->prune_screen_palette_search = false; + rt_sf->prune_palette_search_nonrd = 0; rt_sf->source_metrics_sb_nonrd = 0; rt_sf->overshoot_detection_cbr = NO_DETECTION; rt_sf->check_scene_detection = 0; @@ -2282,7 +2282,6 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { rt_sf->use_rtc_tf = 0; rt_sf->use_idtx_nonrd = 0; rt_sf->prune_idtx_nonrd = 0; - rt_sf->prune_palette_nonrd = 0; rt_sf->dct_only_palette_nonrd = 0; rt_sf->part_early_exit_zeromv = 0; rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED; diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h index 300d7e190e..77e66228a8 100644 --- a/av1/encoder/speed_features.h +++ b/av1/encoder/speed_features.h @@ -1631,9 
+1631,12 @@ typedef struct REAL_TIME_SPEED_FEATURES {
 
   // Filter blocks by certain criteria such as SAD, source variance, such that
   // fewer blocks will go through the palette search.
-  // For screen content types, enable this feature reduces key frame encoding
-  // time. Disabling it leads to better compression efficiency.
-  bool prune_screen_palette_search;
+  // In the nonrd encoding path, enabling this feature reduces encoding time
+  // when palette mode is used. Disabling it leads to better compression
+  // 0: off
+  // 1: less aggressive pruning mode
+  // 2: more aggressive pruning mode
+  int prune_palette_search_nonrd;
 
   // Compute variance/sse on source difference, prior to encoding superblock.
   int source_metrics_sb_nonrd;
@@ -1768,9 +1771,6 @@ typedef struct REAL_TIME_SPEED_FEATURES {
   // is not already set.
   int prune_idtx_nonrd;
 
-  // Prune the use of palette mode in nonrd pickmode.
-  int prune_palette_nonrd;
-
   // Force to only use dct for palette search in nonrd pickmode.
   int dct_only_palette_nonrd;
-- 
GitLab


From e42f4b1980bbbc772aa886d8b43a885461d7b89e Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Tue, 14 May 2024 17:54:10 -0700
Subject: [PATCH 137/391] update codec config after svc/scale controls

This ensures the encoder state/allocations stay in sync with scaling
and svc layer changes. In the SVC case, depending on the resolution,
differences in the chosen superblock size among layers may have caused
a crash. This was reproducible in WebRTC in screen content mode.

The fix is based on a change by Yuan Tong (tongyuan200097) [1]. It
refreshes the encoder config after AOME_SET_SCALEMODE,
AOME_SET_NUMBER_SPATIAL_LAYERS and AV1E_SET_SVC_PARAMS if no frames
have been encoded. AV1E_SET_SVC_PARAMS was missed in the original
change.
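In API terms the fix matters for callers that issue scaling or SVC controls
between initialization and the first encode call. A hedged usage sketch
(error handling omitted; 'img' is assumed to be an initialized aom_image_t,
and the parameter values are illustrative):

  aom_codec_ctx_t enc;
  aom_codec_enc_cfg_t cfg;
  aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, AOM_USAGE_REALTIME);
  aom_codec_enc_init(&enc, aom_codec_av1_cx(), &cfg, 0);

  aom_svc_params_t svc_params = { 0 };
  svc_params.number_spatial_layers = 2;  // layers with differing resolutions
  // ... fill in quantizers, scaling factors, per-layer bitrates ...
  // With this change, the control below refreshes the encoder config, so the
  // superblock size is chosen once, consistently across layers.
  aom_codec_control(&enc, AV1E_SET_SVC_PARAMS, &svc_params);

  aom_codec_encode(&enc, img, 0, 1, 0);  // first encode locks seq params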
[1]: https://aomedia-review.googlesource.com/c/aom/+/171941/2 Bug: chromium:339877165 Change-Id: Ib3d2a123b159898d7c7e19c81e89ff148920e1f1 --- av1/av1_cx_iface.c | 129 +++++++++++++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 44 deletions(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 39c03c9ecb..262d243cd2 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -1610,37 +1610,42 @@ static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx, return AOM_CODEC_OK; } +static aom_codec_err_t update_encoder_cfg(aom_codec_alg_priv_t *ctx) { + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + av1_check_fpmt_config(ctx->ppi, &ctx->oxcf); + bool is_sb_size_changed = false; + av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); + for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { + AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i]; + struct aom_internal_error_info *const error = cpi->common.error; + if (setjmp(error->jmp)) { + error->setjmp = 0; + return error->error_code; + } + error->setjmp = 1; + av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed); + error->setjmp = 0; + } + if (ctx->ppi->cpi_lap != NULL) { + AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap; + struct aom_internal_error_info *const error = cpi_lap->common.error; + if (setjmp(error->jmp)) { + error->setjmp = 0; + return error->error_code; + } + error->setjmp = 1; + av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed); + error->setjmp = 0; + } + return AOM_CODEC_OK; +} + static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx, const struct av1_extracfg *extra_cfg) { const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); if (res == AOM_CODEC_OK) { ctx->extra_cfg = *extra_cfg; - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - av1_check_fpmt_config(ctx->ppi, &ctx->oxcf); - bool is_sb_size_changed = false; - av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); - for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { - AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i]; - struct aom_internal_error_info *const error = cpi->common.error; - if (setjmp(error->jmp)) { - error->setjmp = 0; - return error->error_code; - } - error->setjmp = 1; - av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed); - error->setjmp = 0; - } - if (ctx->ppi->cpi_lap != NULL) { - AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap; - struct aom_internal_error_info *const error = cpi_lap->common.error; - if (setjmp(error->jmp)) { - error->setjmp = 0; - return error->error_code; - } - error->setjmp = 1; - av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed); - error->setjmp = 0; - } + return update_encoder_cfg(ctx); } return res; } @@ -3611,11 +3616,23 @@ static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx, aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *); if (mode) { - const int res = av1_set_internal_size( - &ctx->ppi->cpi->oxcf, &ctx->ppi->cpi->resize_pending_params, - mode->h_scaling_mode, mode->v_scaling_mode); - av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); - return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM; + AV1EncoderConfig *const oxcf = + ctx->ppi->seq_params_locked ? 
&ctx->ppi->cpi->oxcf : &ctx->oxcf; + const int res = + av1_set_internal_size(oxcf, &ctx->ppi->cpi->resize_pending_params, + mode->h_scaling_mode, mode->v_scaling_mode); + if (res == 0) { + // update_encoder_cfg() is somewhat costly and this control may be called + // multiple times, so update_encoder_cfg() is only called to ensure frame + // and superblock sizes are updated before they're fixed by the first + // encode call. + if (ctx->ppi->seq_params_locked) { + av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); + return AOM_CODEC_OK; + } + return update_encoder_cfg(ctx); + } + return AOM_CODEC_INVALID_PARAM; } else { return AOM_CODEC_INVALID_PARAM; } @@ -3636,6 +3653,13 @@ static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx, if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS) return AOM_CODEC_INVALID_PARAM; ctx->ppi->number_spatial_layers = number_spatial_layers; + // update_encoder_cfg() is somewhat costly and this control may be called + // multiple times, so update_encoder_cfg() is only called to ensure frame and + // superblock sizes are updated before they're fixed by the first encode + // call. + if (!ctx->ppi->seq_params_locked) { + return update_encoder_cfg(ctx); + } return AOM_CODEC_OK; } @@ -3653,8 +3677,6 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, va_list args) { AV1_PRIMARY *const ppi = ctx->ppi; AV1_COMP *const cpi = ppi->cpi; - AV1_COMMON *const cm = &cpi->common; - AV1EncoderConfig *oxcf = &cpi->oxcf; aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *); int64_t target_bandwidth = 0; ppi->number_spatial_layers = params->number_spatial_layers; @@ -3694,19 +3716,38 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, target_bandwidth += lc->layer_target_bitrate; } } - if (cm->current_frame.frame_number == 0) { - if (!cpi->ppi->seq_params_locked) { - SequenceHeader *const seq_params = &ppi->seq_params; - seq_params->operating_points_cnt_minus_1 = - ppi->number_spatial_layers * ppi->number_temporal_layers - 1; - av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1); - } + + if (ppi->seq_params_locked) { + AV1EncoderConfig *const oxcf = &cpi->oxcf; + // Keep ctx->oxcf in sync in case further codec controls are made prior + // to encoding. + ctx->oxcf.rc_cfg.target_bandwidth = oxcf->rc_cfg.target_bandwidth = + target_bandwidth; + set_primary_rc_buffer_sizes(oxcf, ppi); + av1_update_layer_context_change_config(cpi, target_bandwidth); + check_reset_rc_flag(cpi); + } else { + // Note av1_init_layer_context() relies on cpi->oxcf. The order of that + // call and the ones in the other half of this block (which + // update_encoder_cfg() transitively makes) is important. So we keep + // ctx->oxcf and cpi->oxcf in sync here as update_encoder_cfg() will + // overwrite cpi->oxcf with ctx->oxcf. + ctx->oxcf.rc_cfg.target_bandwidth = cpi->oxcf.rc_cfg.target_bandwidth = + target_bandwidth; + SequenceHeader *const seq_params = &ppi->seq_params; + seq_params->operating_points_cnt_minus_1 = + ppi->number_spatial_layers * ppi->number_temporal_layers - 1; + av1_init_layer_context(cpi); + // update_encoder_cfg() is somewhat costly and this control may be called + // multiple times, so update_encoder_cfg() is only called to ensure frame + // and superblock sizes are updated before they're fixed by the first + // encode call. 
+ return update_encoder_cfg(ctx); } - oxcf->rc_cfg.target_bandwidth = target_bandwidth; - set_primary_rc_buffer_sizes(oxcf, cpi->ppi); - av1_update_layer_context_change_config(cpi, target_bandwidth); - check_reset_rc_flag(cpi); + } else if (!ppi->seq_params_locked) { + // Ensure frame and superblock sizes are updated. + return update_encoder_cfg(ctx); } av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); return AOM_CODEC_OK; -- GitLab From 0f766c1101fa146bfe2aeb7eca23e076bbf631de Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 16 May 2024 15:22:47 -0700 Subject: [PATCH 138/391] disable av1_resize_horz_dir_sse2 This causes failures in SSE2/AV1ResizeXTest using 32-bit valgrind: ==1546504== Invalid read of size 16 ==1546504== at 0x10C4D4F: _mm_loadu_si128 (emmintrin.h:703) ==1546504== by 0x10C4D4F: av1_resize_horz_dir_sse2 (resize_sse2.c:225) Bug: aomedia:3575 Change-Id: I0c4f887fbefdce44ba7a8a615c889354bc680f35 --- av1/common/av1_rtcd_defs.pl | 4 +++- av1/common/x86/resize_avx2.c | 8 +++++--- test/frame_resize_test.cc | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 8e24bb9c1b..c57b6f0a5f 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -558,7 +558,9 @@ add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int o specialize qw/av1_resize_vert_dir sse2 avx2/; add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2"; -specialize qw/av1_resize_horz_dir sse2 avx2/; +# TODO(https://crbug.com/aomedia/3575): Restore sse2 after SSE2/AV1ResizeXTest +# passes under 32-bit valgrind. +specialize qw/av1_resize_horz_dir avx2/; add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c index 7c36fca8a4..425c9f44e1 100644 --- a/av1/common/x86/resize_avx2.c +++ b/av1/common/x86/resize_avx2.c @@ -530,10 +530,12 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filtered_length, int width2) { assert(height % 2 == 0); - // Invoke SSE2 for width less than 32. + // Invoke C for width less than 32. + // TODO(https://crbug.com/aomedia/3575): Use sse2 after SSE2/AV1ResizeXTest + // passes under 32-bit valgrind. if (filtered_length < 32) { - av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length, - width2); + av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length, + width2); return; } diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc index befdd490b4..83e56edefb 100644 --- a/test/frame_resize_test.cc +++ b/test/frame_resize_test.cc @@ -245,7 +245,9 @@ TEST_P(AV1ResizeXTest, RunTest) { RunTest(); } TEST_P(AV1ResizeXTest, DISABLED_SpeedTest) { SpeedTest(); } -#if HAVE_SSE2 +// TODO(https://crbug.com/aomedia/3575): Reenable this after test passes under +// 32-bit valgrind. 
+#if 0 // HAVE_SSE2 INSTANTIATE_TEST_SUITE_P( SSE2, AV1ResizeXTest, ::testing::Combine(::testing::Values(av1_resize_horz_dir_sse2), -- GitLab From 01467cdbd524900eed283660836179fd1b2cd536 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 16 May 2024 13:44:52 -0700 Subject: [PATCH 139/391] encode_api_test: add repro for chromium 339877165 BUG=chromium:339877165 Change-Id: I69dcc2cda098ec96a34e1e5f7ef557ee8caf5521 --- test/encode_api_test.cc | 141 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index a7d5b3aa3c..27bcbc14c1 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -635,6 +635,147 @@ TEST(EncodeAPI, PtsOrDurationTooBig) { aom_codec_destroy(&enc); } +// Reproduces https://crbug.com/339877165. +TEST(EncodeAPI, Buganizer339877165) { + // Initialize libaom encoder. + aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 2560; + cfg.g_h = 1600; + cfg.rc_target_bitrate = 231; + cfg.rc_end_usage = AOM_CBR; + cfg.g_threads = 8; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // From libaom_av1_encoder.cc in WebRTC. + ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 11), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_CDEF, 1), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_TPL_MODEL, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_DELTAQ_MODE, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_ORDER_HINT, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_AQ_MODE, 3), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AOME_SET_MAX_INTRA_BITRATE_PCT, 300), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_COEFF_COST_UPD_FREQ, 3), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_MODE_COST_UPD_FREQ, 3), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_MV_COST_UPD_FREQ, 3), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_PALETTE, 1), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_TILE_ROWS, 1), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_TILE_COLUMNS, 2), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_OBMC, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_NOISE_SENSITIVITY, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_WARPED_MOTION, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_GLOBAL_MOTION, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_REF_FRAME_MVS, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_SUPERBLOCK_SIZE, + AOM_SUPERBLOCK_SIZE_DYNAMIC), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_CFL_INTRA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_SMOOTH_INTRA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_ANGLE_DELTA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_FILTER_INTRA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_INTRA_DEFAULT_TX_ONLY, 1), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_DISABLE_TRELLIS_QUANT, 1), + AOM_CODEC_OK); + 
ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_DIST_WTD_COMP, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_DIFF_WTD_COMP, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_DUAL_FILTER, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_INTERINTRA_COMP, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_INTERINTRA_WEDGE, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_INTRA_EDGE_FILTER, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_INTRABC, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_MASKED_COMP, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_PAETH_INTRA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_QM, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_RECT_PARTITIONS, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_RESTORATION, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_SMOOTH_INTERINTRA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_TX64, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_MAX_REFERENCE_FRAMES, 3), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_enc_config_set(&enc, &cfg), AOM_CODEC_OK); + + aom_svc_params_t svc_params = {}; + svc_params.number_spatial_layers = 2; + svc_params.number_temporal_layers = 1; + svc_params.max_quantizers[0] = svc_params.max_quantizers[1] = 56; + svc_params.min_quantizers[0] = svc_params.min_quantizers[1] = 10; + svc_params.scaling_factor_num[0] = svc_params.scaling_factor_num[1] = 1; + svc_params.scaling_factor_den[0] = 2; + svc_params.scaling_factor_den[1] = 1; + svc_params.layer_target_bitrate[0] = cfg.rc_target_bitrate; + svc_params.framerate_factor[0] = 1; + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_SVC_PARAMS, &svc_params), + AOM_CODEC_OK); + + aom_svc_layer_id_t layer_id = {}; + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id), + AOM_CODEC_OK); + + aom_svc_ref_frame_config_t ref_frame_config = {}; + ref_frame_config.refresh[0] = 1; + ASSERT_EQ( + aom_codec_control(&enc, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config), + AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode layer 0. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK); + + layer_id.spatial_layer_id = 1; + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id), + AOM_CODEC_OK); + + ref_frame_config.refresh[0] = 0; + ASSERT_EQ( + aom_codec_control(&enc, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config), + AOM_CODEC_OK); + + // Encode layer 1. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK); + + // Free resources. + aom_img_free(image); + aom_codec_destroy(&enc); +} + class EncodeAPIParameterized : public testing::TestWithParam<std::tuple< /*usage=*/unsigned int, /*speed=*/int, /*aq_mode=*/unsigned int>> {}; -- GitLab From c66c41d99c9c3717dbe16c2285b743e66e04095e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 16 May 2024 15:47:53 -0700 Subject: [PATCH 140/391] Add return stmt after av1_resize_horz_dir_c() call Otherwise we will fall through and execute code that assumes filtered_length >= 16. 
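The fixed control flow is the usual small-width dispatch pattern; condensed
here for clarity (matching the one-line diff below):

  if (filtered_length < 16) {
    av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
                          width2);
    return;  // Without this, the SIMD loop below also runs and loads 16-byte
             // vectors from rows narrower than 16 pixels.
  }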
Bug: aomedia:3575
Change-Id: I88385f0d30fd6f9392b828d8bf716f7cc6f3b6b3
---
 av1/common/x86/resize_sse2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c
index 1afc962216..6b34cebfe0 100644
--- a/av1/common/x86/resize_sse2.c
+++ b/av1/common/x86/resize_sse2.c
@@ -194,6 +194,7 @@ void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride,
   if (filtered_length < 16) {
     av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
                           width2);
+    return;
   }
 
   __m128i coeffs_x[2];
-- 
GitLab


From 00392c6223ac9aefc29e0d67929a943836ad8daf Mon Sep 17 00:00:00 2001
From: Yuan Tong <tongyuan200097@gmail.com>
Date: Wed, 8 Mar 2023 16:29:52 +0800
Subject: [PATCH 141/391] Update progressive test to catch more crash cases

Update AVIFProgressiveTest.DimensionChangeLargeImageMultiThread
to also catch the crash fixed by
Ib3d2a123b159898d7c7e19c81e89ff148920e1f1.

BUG: aomedia:3382

Change-Id: I5a65578c7793fdac96c2d41cd71d63a75f7b0d1d
---
 test/avif_progressive_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/avif_progressive_test.cc b/test/avif_progressive_test.cc
index 2a28ca368b..59aebd486f 100644
--- a/test/avif_progressive_test.cc
+++ b/test/avif_progressive_test.cc
@@ -225,8 +225,6 @@ TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) {
   aom_codec_ctx_t enc;
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 31));
-  EXPECT_EQ(AOM_CODEC_OK,
-            aom_codec_control(&enc, AOME_SET_NUMBER_SPATIAL_LAYERS, 2));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6));
   EXPECT_EQ(AOM_CODEC_OK,
             aom_codec_control(&enc, AV1E_SET_ROW_MT, 1));  // MultiThread
@@ -234,6 +232,8 @@
             aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE));
   EXPECT_EQ(AOM_CODEC_OK,
             aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_NUMBER_SPATIAL_LAYERS, 2));
 
   // First frame (layer 0)
   EXPECT_EQ(AOM_CODEC_OK,
-- 
GitLab


From 6e3e2227eae988a7639d251d042c4fabb7db54d1 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Thu, 16 May 2024 14:19:09 -0700
Subject: [PATCH 142/391] Add the DimensionChangeBigImageMultiThread2 test

It is a variant of the DimensionChangeBigImageMultiThread test. The
only difference is that it doesn't have the spatial layers.

This test passes after James Zern's commit e42f4b1980:
https://aomedia-review.googlesource.com/c/aom/+/190181

Bug: aomedia:3382
Change-Id: Ic21e3a71645ac96ebda0b1f2bdcbf709b8f079d5
---
 test/avif_progressive_test.cc | 91 ++++++++++++++++++++++++++++++++---
 1 file changed, 85 insertions(+), 6 deletions(-)

diff --git a/test/avif_progressive_test.cc b/test/avif_progressive_test.cc
index 59aebd486f..f3e2ef2af9 100644
--- a/test/avif_progressive_test.cc
+++ b/test/avif_progressive_test.cc
@@ -25,7 +25,7 @@ namespace {
 TEST(AVIFProgressiveTest, QualityChange) {
   constexpr int kWidth = 256;
   constexpr int kHeight = 256;
-  // Dummy buffer of neutral gray samples.
+ // A buffer of neutral gray samples. constexpr size_t kBufferSize = 3 * kWidth * kHeight; std::vector<unsigned char> buffer(kBufferSize, static_cast<unsigned char>(128)); @@ -151,7 +151,7 @@ TEST(AVIFProgressiveTest, DimensionChange) { // First frame (layer 0) EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 0)); - aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; + const aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_SCALEMODE, &scaling_mode)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0)); @@ -192,10 +192,10 @@ TEST(AVIFProgressiveTest, DimensionChange) { // This test reproduces bug aomedia:3382. Certain parameters such as width, // height, g_threads, usage, etc. were carefully chosen based on the // complicated logic of av1_select_sb_size() to cause an inconsistent sb_size. -TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) { +TEST(AVIFProgressiveTest, DimensionChangeBigImageMultiThread) { constexpr int kWidth = 1920; constexpr int kHeight = 1080; - // Dummy buffer of neutral gray samples. + // A buffer of neutral gray samples. constexpr size_t kBufferSize = 2 * kWidth * kHeight; std::vector<unsigned char> buffer(kBufferSize, static_cast<unsigned char>(128)); @@ -238,7 +238,7 @@ TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) { // First frame (layer 0) EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 0)); - aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; + const aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_SCALEMODE, &scaling_mode)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0)); @@ -276,4 +276,83 @@ TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +// A variant of the previous test, without the spatial layers. +TEST(AVIFProgressiveTest, DimensionChangeBigImageMultiThread2) { + constexpr int kWidth = 1920; + constexpr int kHeight = 1080; + // A buffer of neutral gray samples. 
+ constexpr size_t kBufferSize = 2 * kWidth * kHeight; + std::vector<unsigned char> buffer(kBufferSize, + static_cast<unsigned char>(128)); + + aom_image_t img; + EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1, + buffer.data())); + img.cp = AOM_CICP_CP_UNSPECIFIED; + img.tc = AOM_CICP_TC_UNSPECIFIED; + img.mc = AOM_CICP_MC_UNSPECIFIED; + img.range = AOM_CR_FULL_RANGE; + + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY)); + cfg.g_profile = 0; + cfg.g_w = img.w; + cfg.g_h = img.h; + cfg.g_bit_depth = AOM_BITS_8; + cfg.g_input_bit_depth = 8; + cfg.g_lag_in_frames = 0; + cfg.g_threads = 2; // MultiThread + cfg.rc_end_usage = AOM_Q; + cfg.rc_min_quantizer = 0; + cfg.rc_max_quantizer = 63; + aom_codec_ctx_t enc; + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 31)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AV1E_SET_ROW_MT, 1)); // MultiThread + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM)); + + // First frame + const aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AOME_SET_SCALEMODE, &scaling_mode)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0)); + aom_codec_iter_t iter = nullptr; + const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x1f0011. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Second frame + aom_enc_frame_flags_t encode_flags = + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF; + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, encode_flags)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Flush encoder + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); +} + } // namespace -- GitLab From e6dad63399fa65c0daf38d667d7f64b0e72df8e4 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 16 May 2024 10:51:15 +0100 Subject: [PATCH 143/391] Enable I8MM when compiling SVE2 files The CPU feature detection logic assumes that if SVE2 is available then all previous features are available as well (SVE, NEON_DOTPROD and NEON_I8MM). Add 'i8mm' to the compiler flags for SVE2 files to reflect these constraints. It's not necessary to add 'sve' or 'dotprod' as they are implied by the 'armv9' flag. 
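The constraint being encoded is a cumulative feature ladder. Illustrative
pseudocode for the assumption the runtime detection makes (the cpu_has_*
predicates are placeholders, not the actual detection source):

  // Assumed implication chain: SVE2 => SVE => NEON_I8MM => NEON_DOTPROD => NEON.
  int flags = HAS_NEON;
  if (cpu_has_dotprod()) flags |= HAS_NEON_DOTPROD;
  if (cpu_has_i8mm()) flags |= HAS_NEON_I8MM;
  if (cpu_has_sve()) flags |= HAS_SVE;
  if (cpu_has_sve2()) flags |= HAS_SVE2;  // reported only when everything
                                          // above is also present
  // Hence a translation unit built for SVE2 may freely use I8MM intrinsics,
  // so the compile flags for SVE2 files must enable I8MM as well.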
Change-Id: I7e81b041910676af870eb0f45d3ed3a4014d31be --- build/cmake/cpu.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake index 8d0acf3d2b..6e6fdb8e24 100644 --- a/build/cmake/cpu.cmake +++ b/build/cmake/cpu.cmake @@ -19,7 +19,8 @@ if("${AOM_TARGET_CPU}" STREQUAL "arm64") set(AOM_NEON_DOTPROD_DEFAULT_FLAG "-march=armv8.2-a+dotprod") set(AOM_NEON_I8MM_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm") set(AOM_SVE_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm+sve") - set(AOM_SVE2_DEFAULT_FLAG "-march=armv9-a+sve2") # SVE2 is a v9-only feature + set(AOM_SVE2_DEFAULT_FLAG "-march=armv9-a+i8mm+sve2") # SVE2 is a v9-only + # feature # Check that the compiler flag to enable each flavor is supported by the # compiler. This may not be the case for new architecture features on old -- GitLab From 70818ace8d6d7be95681ff9d2fba93262b8e6482 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Wed, 15 May 2024 17:19:46 +0100 Subject: [PATCH 144/391] Add SVE2 implementation for 12-tap av1_convolve_2d_sr The vertical pass of av1_convolve_2d_sr operates on 16-bit elements, so we can use the SVE 16-bit dot-product to accelerate it. Given that the presence of SVE2 implies the presence of Neon_I8MM, keep the i8mm implementation for the horizontal pass. This gives an uplift of up to 25% compared to the full i8mm version. Change-Id: I909ed998f83593ce9d0108c8e2a41f71c08a73de --- av1/av1.cmake | 3 + av1/common/arm/convolve_neon_i8mm.c | 164 +--------------------- av1/common/arm/convolve_neon_i8mm.h | 189 ++++++++++++++++++++++++++ av1/common/arm/convolve_sve2.c | 203 ++++++++++++++++++++++++++++ av1/common/av1_rtcd_defs.pl | 2 +- test/av1_convolve_test.cc | 5 + 6 files changed, 402 insertions(+), 164 deletions(-) create mode 100644 av1/common/arm/convolve_neon_i8mm.h create mode 100644 av1/common/arm/convolve_sve2.c diff --git a/av1/av1.cmake b/av1/av1.cmake index dcc19b70d4..f1e9bc8049 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -415,6 +415,9 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SVE "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c" "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c") +list(APPEND AOM_AV1_COMMON_INTRIN_SVE2 + "${AOM_ROOT}/av1/common/arm/convolve_sve2.c") + list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c") diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index 7ba8b6664e..60cdfdfc0a 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -18,15 +18,10 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "av1/common/arm/convolve_neon.h" +#include "av1/common/arm/convolve_neon_i8mm.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" -DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { - 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, - 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -}; - DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { // Shift left and insert new last column in transposed 4x4 block. 
1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, @@ -788,163 +783,6 @@ void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, y_filter_ptr); } -static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, - const int8x16_t filters, - const uint8x16x3_t permute_tbl, - int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum; - - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); - - // First 4 output values. - sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); - sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1); - sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2); - - // Narrow and re-pack. - return vshrn_n_s32(sum, ROUND0_BITS); -} - -static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], - const int8x16_t filters, - const uint8x16x3_t permute_tbl, - const int32x4_t horiz_const) { - uint8x16_t permuted_samples[4]; - int32x4_t sum[2]; - - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); - // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } - permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); - - // First 4 output values. - sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); - // Second 4 output values. - sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); - - // Narrow and re-pack. - return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), - vshrn_n_s32(sum[1], ROUND0_BITS)); -} - -static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( - const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, - const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, - const int16x4_t x_filter_8_11) { - // The no-op filter should never be used here. - assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); - - const int bd = 8; - - // Narrow filter values to 8-bit. - const int16x8x2_t x_filter_s16 = { - { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } - }; - const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), - vmovn_s16(x_filter_s16.val[1])); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. 
- const int32x4_t horiz_const = - vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); - - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); - - do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1_s16(dst_ptr, d0); - - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - - } else { - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x8_t d1 = - convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x8_t d2 = - convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x8_t d3 = - convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); - - store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); - - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2]; - s0[0] = vld1q_u8(s); - s0[1] = vld1q_u8(s + 4); - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1q_s16(d, d0); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } -} - static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl, diff --git a/av1/common/arm/convolve_neon_i8mm.h b/av1/common/arm/convolve_neon_i8mm.h new file mode 100644 index 0000000000..da55a2e796 --- /dev/null +++ b/av1/common/arm/convolve_neon_i8mm.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, + const int8x16_t filters, + const uint8x16x3_t permute_tbl, + int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); + sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1); + sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2); + + // Narrow and re-pack. + return vshrn_n_s32(sum, ROUND0_BITS); +} + +static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], + const int8x16_t filters, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[4]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); + // Second 4 output values. + sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); + + // Narrow and re-pack. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), + vshrn_n_s32(sum[1], ROUND0_BITS)); +} + +static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11) { + // The no-op filter should never be used here. + assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); + + const int bd = 8; + + // Narrow filter values to 8-bit. 
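+  // (Safe thanks to the assert above: only the unused no-op filter has a
+  // tap of 128, which would not fit in int8.)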
+ const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32x4_t horiz_const = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = + convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = + convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = + convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_ diff --git a/av1/common/arm/convolve_sve2.c b/av1/common/arm/convolve_sve2.c new file mode 100644 index 0000000000..136abae43c --- /dev/null +++ b/av1/common/arm/convolve_sve2.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/aom_filter.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/highbd_convolve_sve2.h"
+#include "av1/common/arm/convolve_neon_i8mm.h"
+
+static INLINE int32x4_t highbd_convolve12_4_2d_v(int16x8_t s0[2],
+                                                 int16x8_t s1[2],
+                                                 int16x8_t s2[2],
+                                                 int16x8_t filter_0_7,
+                                                 int16x8_t filter_4_11) {
+  int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
+  sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1);
+  sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1);
+
+  int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
+  sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1);
+  sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1);
+
+  return vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+}
+
+static INLINE void convolve_2d_sr_vert_12tap_sve2(
+    const int16_t *src_ptr, int src_stride, uint8_t *dst_ptr,
+    const int dst_stride, int w, int h, const int16x8_t y_filter_0_7,
+    const int16x8_t y_filter_4_11) {
+  // The no-op filter should never be used here.
+  assert(vgetq_lane_s16(y_filter_0_7, 5) != 128);
+
+  const int bd = 8;
+  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
+
+  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
+  // Scale indices by size of the true vector length to avoid reading from an
+  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
+  uint16x8_t correction0 =
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
+  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
+
+  uint16x8_t correction1 =
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
+  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
+
+  uint16x8_t correction2 =
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
+  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
+
+  do {
+    int16_t *s = (int16_t *)src_ptr;
+    uint8_t *d = (uint8_t *)dst_ptr;
+    int height = h;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
+    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+                  &s9, &sA);
+    s += 11 * src_stride;
+
+    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
+        s6789[2], s789A[2];
+    // This operation combines a conventional transpose and the sample permute
+    // required before computing the dot product.
+    transpose_concat_4x4(s0, s1, s2, s3, s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, s3456);
+    transpose_concat_4x4(s4, s5, s6, s7, s4567);
+    transpose_concat_4x4(s5, s6, s7, s8, s5678);
+    transpose_concat_4x4(s6, s7, s8, s9, s6789);
+    transpose_concat_4x4(s7, s8, s9, sA, s789A);
+
+    do {
+      int16x4_t sB, sC, sD, sE;
+      load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
+
+      int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
+      transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
+
+      // Merge new data into block from previous iteration.
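+      // e.g. the val[0] permute forms s89AB by taking lanes { 8, 9, A } from
+      // s789A and lane B from sBCDE, sliding each column's four-sample
+      // window down one row.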
+      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
+      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
+      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
+
+      int32x4_t d0 = highbd_convolve12_4_2d_v(s0123, s4567, s89AB, y_filter_0_7,
+                                              y_filter_4_11);
+      int32x4_t d1 = highbd_convolve12_4_2d_v(s1234, s5678, s9ABC, y_filter_0_7,
+                                              y_filter_4_11);
+      int32x4_t d2 = highbd_convolve12_4_2d_v(s2345, s6789, sABCD, y_filter_0_7,
+                                              y_filter_4_11);
+      int32x4_t d3 = highbd_convolve12_4_2d_v(s3456, s789A, sBCDE, y_filter_0_7,
+                                              y_filter_4_11);
+
+      int16x8_t dd01 =
+          vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
+                       vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
+      int16x8_t dd23 =
+          vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
+                       vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));
+
+      dd01 = vsubq_s16(dd01, sub_const);
+      dd23 = vsubq_s16(dd23, sub_const);
+
+      uint8x8_t d01 = vqmovun_s16(dd01);
+      uint8x8_t d23 = vqmovun_s16(dd23);
+
+      store_u8x4_strided_x2(d + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);
+
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
+      s0123[0] = s4567[0];
+      s0123[1] = s4567[1];
+      s1234[0] = s5678[0];
+      s1234[1] = s5678[1];
+      s2345[0] = s6789[0];
+      s2345[1] = s6789[1];
+      s3456[0] = s789A[0];
+      s3456[1] = s789A[1];
+      s4567[0] = s89AB[0];
+      s4567[1] = s89AB[1];
+      s5678[0] = s9ABC[0];
+      s5678[1] = s9ABC[1];
+      s6789[0] = sABCD[0];
+      s6789[1] = sABCD[1];
+      s789A[0] = sBCDE[0];
+      s789A[1] = sBCDE[1];
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src_ptr += 4;
+    dst_ptr += 4;
+    w -= 4;
+  } while (w != 0);
+}
+
+void av1_convolve_2d_sr_sve2(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
+                             const int subpel_x_qn, const int subpel_y_qn,
+                             ConvolveParams *conv_params) {
+  if (w == 2 || h == 2) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    return;
+  }
+
+  if (filter_params_x->taps > 8) {
+    const int im_h = h + filter_params_y->taps - 1;
+    const int im_stride = MAX_SB_SIZE;
+    const int vert_offset = filter_params_y->taps / 2 - 1;
+    const int horiz_offset = filter_params_x->taps / 2 - 1;
+    const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+    const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+        filter_params_x, subpel_x_qn & SUBPEL_MASK);
+    const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+    DECLARE_ALIGNED(16, int16_t,
+                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+
+    const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+    const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+    const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+    const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
+
+    convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block,
+                                         im_stride, w, im_h, x_filter_0_7,
+                                         x_filter_8_11);
+
+    convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, w, h,
+                                   y_filter_0_7, y_filter_4_11);
+  } else {
+    av1_convolve_2d_sr_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params_x, filter_params_y, subpel_x_qn,
+                                 subpel_y_qn, conv_params);
+  }
+}
diff --git a/av1/common/av1_rtcd_defs.pl
b/av1/common/av1_rtcd_defs.pl index c57b6f0a5f..b597e9a1d9 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -602,7 +602,7 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params"; - specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm/; + specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm sve2/; specialize qw/av1_convolve_2d_sr_intrabc neon/; specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_convolve_x_sr_intrabc neon/; diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 382a2fea63..2c630b7dbb 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -1331,6 +1331,11 @@ INSTANTIATE_TEST_SUITE_P(NEON_I8MM, AV1Convolve2DTest, BuildLowbdParams(av1_convolve_2d_sr_neon_i8mm)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P(SVE2, AV1Convolve2DTest, + BuildLowbdParams(av1_convolve_2d_sr_sve2)); +#endif + ///////////////////////////////////////////////////////////////// // Single reference convolve-2D IntraBC functions (low bit-depth) ///////////////////////////////////////////////////////////////// -- GitLab From 4637f5d7bb6d026c37a7530ceff9ce4f864c3e68 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 16 May 2024 19:19:33 -0700 Subject: [PATCH 145/391] set_encoder_config,cosmetics: collapse some blank lines Change-Id: I8d1d9defc3119229a219de21c760fcaacb43c09c --- av1/av1_cx_iface.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 262d243cd2..c09a02c819 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -1034,39 +1034,22 @@ static void set_encoder_config(AV1EncoderConfig *oxcf, } TuneCfg *const tune_cfg = &oxcf->tune_cfg; - FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; - TileConfig *const tile_cfg = &oxcf->tile_cfg; - ResizeCfg *const resize_cfg = &oxcf->resize_cfg; - GFConfig *const gf_cfg = &oxcf->gf_cfg; - PartitionCfg *const part_cfg = &oxcf->part_cfg; - IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg; - TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg; - CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg; - SuperResCfg *const superres_cfg = &oxcf->superres_cfg; - KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; - DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; - RateControlCfg *const rc_cfg = &oxcf->rc_cfg; - QuantizationCfg *const q_cfg = &oxcf->q_cfg; - ColorCfg *const color_cfg = &oxcf->color_cfg; - InputCfg *const input_cfg = &oxcf->input_cfg; - AlgoCfg *const algo_cfg = &oxcf->algo_cfg; - ToolCfg *const tool_cfg = &oxcf->tool_cfg; const int is_vbr = cfg->rc_end_usage == AOM_VBR; -- GitLab From 2cb4ddef5ef2cc726659f5fc43bdba7f9eb01850 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Tue, 14 May 2024 14:16:13 +0100 Subject: [PATCH 146/391] Tidy up constants and style in Neon av1_convolve_sr Move some constants inside the convolution kernels and tidy up the style to make it consistent across the different specialized paths. 
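The shape of the change at a call site, roughly (illustrative, based on the
8-tap x_sr path below; not a literal hunk):

  // Before: the caller hoists the dot-product constants and threads them
  // through every kernel invocation.
  const int32x4_t correction =
      vdupq_n_s32(((128 << FILTER_BITS) + (1 << (ROUND0_BITS - 1))) / 2);
  const uint8x16_t range_limit = vdupq_n_u8(128);
  uint8x8_t d0 = convolve8_8_x(s0, x_filter, correction, range_limit,
                               permute_tbl);

  // After: the kernel builds the constants itself. Every term is known at
  // compile time, so the compiler folds them into a single vector constant.
  uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl);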
Change-Id: I87ed3406a3aea760aa76dc7911b3cbb25e4eb4fd --- av1/common/arm/convolve_neon_dotprod.c | 351 +++++++++++-------------- av1/common/arm/convolve_neon_i8mm.c | 144 +++++----- av1/common/arm/convolve_neon_i8mm.h | 48 ++-- 3 files changed, 239 insertions(+), 304 deletions(-) diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index 9c50890999..20da29880d 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c @@ -38,67 +38,75 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, const int8x16_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; + + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. + int32x4_t acc = + vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))); - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0); - sum = vdotq_laneq_s32(sum, permuted_samples[1], filter, 1); - sum = vdotq_laneq_s32(sum, permuted_samples[2], filter, 2); + int32x4_t sum = vdotq_laneq_s32(acc, perm_samples[0], filter, 0); + sum = vdotq_laneq_s32(sum, perm_samples[1], filter, 1); + sum = vdotq_laneq_s32(sum, perm_samples[2], filter, 2); return vqrshrn_n_s32(sum, FILTER_BITS); } static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], const int8x16_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples[2], permuted_samples[4]; - int32x4_t sum[2]; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit)); - clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128[2] = { + vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))), + vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))) + }; // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]); // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } - permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]); - - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0); - sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1); - sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2); - // Second 4 output values. - sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filter, 0); - sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1); - sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2); + int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]), + vqtbl1q_s8(samples_128[0], permute_tbl.val[1]), + vqtbl1q_s8(samples_128[0], permute_tbl.val[2]), + vqtbl1q_s8(samples_128[1], + permute_tbl.val[2]) }; + + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. + int32x4_t acc = + vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))); + + int32x4_t sum0123 = vdotq_laneq_s32(acc, perm_samples[0], filter, 0); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filter, 1); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filter, 2); + + int32x4_t sum4567 = vdotq_laneq_s32(acc, perm_samples[1], filter, 0); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filter, 1); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filter, 2); // Narrow and re-pack. - int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS), - vqrshrn_n_s32(sum[1], FILTER_BITS)); + int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS), + vqrshrn_n_s32(sum4567, FILTER_BITS)); return vqmovun_s16(sum_s16); } @@ -114,13 +122,6 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( const int8x16_t filter = vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); - // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding - // right shift by FILTER_BITS - instead of a first rounding right shift by - // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. 
- int32x4_t correction = - vdupq_n_s32((128 << FILTER_BITS) + (1 << (ROUND0_BITS - 1))); - const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); if (w <= 4) { @@ -128,14 +129,10 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - int16x4_t d0 = - convolve12_4_x(s0, filter, correction, range_limit, permute_tbl); - int16x4_t d1 = - convolve12_4_x(s1, filter, correction, range_limit, permute_tbl); - int16x4_t d2 = - convolve12_4_x(s2, filter, correction, range_limit, permute_tbl); - int16x4_t d3 = - convolve12_4_x(s3, filter, correction, range_limit, permute_tbl); + int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl); + int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl); + int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl); + int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl); uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); @@ -158,14 +155,10 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - uint8x8_t d0 = - convolve12_8_x(s0, filter, correction, range_limit, permute_tbl); - uint8x8_t d1 = - convolve12_8_x(s1, filter, correction, range_limit, permute_tbl); - uint8x8_t d2 = - convolve12_8_x(s2, filter, correction, range_limit, permute_tbl); - uint8x8_t d3 = - convolve12_8_x(s3, filter, correction, range_limit, permute_tbl); + uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl); + uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl); + uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl); + uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl); store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); @@ -196,7 +189,7 @@ static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. Halve the total because we will halve the filter values. + // ROUND0_BITS. Halve the total because we halved the filter values. int32x4_t acc = vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); @@ -223,16 +216,15 @@ static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. Halve the total because we will halve the filter values. + // ROUND0_BITS. Halve the total because we halved the filter values. int32x4_t acc = vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); - // First 4 output values. - int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); - // Second 4 output values. - int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + + int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); // Narrow and re-pack. 
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } @@ -298,33 +290,36 @@ static INLINE void convolve_x_sr_4tap_neon_dotprod( } static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum[2]; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. */ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; + + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we halved the filter values. + int32x4_t acc = + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); + + int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0); + sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1); - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filter, 1); - // Second 4 output values. - sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filter, 0); - sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filter, 1); + int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filter, 0); + sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filter, 1); // Narrow and re-pack. - int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1])); + int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the convolution filter values so - 1 from the right shift. return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); } @@ -361,14 +356,6 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, } const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - // Dot product constants: - // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding - // right shift by FILTER_BITS - instead of a first rounding right shift by - // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. Halve the total because we will halve the filter values. 
- const int32x4_t correction = - vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); - const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. @@ -383,14 +370,10 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - uint8x8_t d0 = - convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); - uint8x8_t d1 = - convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); - uint8x8_t d2 = - convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); - uint8x8_t d3 = - convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl); + uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl); + uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl); + uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -906,28 +889,24 @@ void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride, static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, const int8x16_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, + const int32x4_t horiz_const, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0); - sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1); - sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2); + // Accumulate dot product into 'correction' to account for range transform. + int32x4_t sum = vdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0); + sum = vdotq_laneq_s32(sum, perm_samples[1], filters, 1); + sum = vdotq_laneq_s32(sum, perm_samples[2], filters, 2); // Narrow and re-pack. return vshrn_n_s32(sum, ROUND0_BITS); @@ -936,38 +915,36 @@ static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], const int8x16_t filters, const int32x4_t correction, - const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples[2], permuted_samples[4]; - int32x4_t sum[2]; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. 
- clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit)); - clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128[2] = { + vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))), + vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))) + }; // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]); // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } - permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]); - - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0); - sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); - sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); - // Second 4 output values. - sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0); - sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); - sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); + int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]), + vqtbl1q_s8(samples_128[0], permute_tbl.val[1]), + vqtbl1q_s8(samples_128[0], permute_tbl.val[2]), + vqtbl1q_s8(samples_128[1], + permute_tbl.val[2]) }; + + // Accumulate dot product into 'correction' to account for range transform. + int32x4_t sum0123 = vdotq_laneq_s32(correction, perm_samples[0], filters, 0); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filters, 1); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filters, 2); + + int32x4_t sum4567 = vdotq_laneq_s32(correction, perm_samples[1], filters, 0); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filters, 1); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filters, 2); // Narrow and re-pack. - return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), - vshrn_n_s32(sum[1], ROUND0_BITS)); + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS), + vshrn_n_s32(sum4567, ROUND0_BITS)); } static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( @@ -992,7 +969,6 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Dot product constants. 
const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const); - const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); if (w <= 4) { @@ -1000,14 +976,10 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( uint8x16_t s0, s1, s2, s3; load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - int16x4_t d0 = - convolve12_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); - int16x4_t d1 = - convolve12_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); - int16x4_t d2 = - convolve12_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); - int16x4_t d3 = - convolve12_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl); + int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, permute_tbl); + int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, permute_tbl); + int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, permute_tbl); store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); @@ -1018,8 +990,7 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( do { uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = - convolve12_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl); vst1_s16(dst_ptr, d0); src_ptr += src_stride; @@ -1037,14 +1008,10 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); - int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, range_limit, - permute_tbl); - int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, range_limit, - permute_tbl); - int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, range_limit, - permute_tbl); + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl); + int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, permute_tbl); + int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, permute_tbl); + int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, permute_tbl); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -1066,8 +1033,7 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( uint8x16_t s0[2]; s0[0] = vld1q_u8(s); s0[1] = vld1q_u8(s + 4); - int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl); vst1q_s16(d, d0); s += 8; @@ -1137,7 +1103,7 @@ static INLINE void convolve_2d_sr_horiz_4tap_neon_dotprod( const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Accumulate into 128 << FILTER_BITS to account for range transform. - // Halve the total because we will halve the filter values. + // Halve the total because we halved the filter values. const int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); @@ -1218,50 +1184,47 @@ static INLINE void convolve_2d_sr_horiz_4tap_neon_dotprod( static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, - const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum[2]; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. 
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); - sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1); - // Second 4 output values. - sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); - sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1); + // Accumulate dot product into 'correction' to account for range transform. + int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0); + sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filters, 1); + + int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0); + sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filters, 1); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. - return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), - vshrn_n_s32(sum[1], ROUND0_BITS - 1)); + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static INLINE void convolve_2d_sr_horiz_8tap_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { - const int bd = 8; - // Dot product constants. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + const int bd = 8; // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - // Halve the total because we will halve the filter values. + // Halve the total because we halved the filter values. const int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); - const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; int16_t *dst_ptr = im_block; @@ -1269,9 +1232,6 @@ static INLINE void convolve_2d_sr_horiz_8tap_neon_dotprod( int height = im_h; const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); - do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; @@ -1281,14 +1241,10 @@ static INLINE void convolve_2d_sr_horiz_8tap_neon_dotprod( uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - int16x8_t d0 = - convolve8_8_2d_h(s0, x_filter, correction, range_limit, permute_tbl); - int16x8_t d1 = - convolve8_8_2d_h(s1, x_filter, correction, range_limit, permute_tbl); - int16x8_t d2 = - convolve8_8_2d_h(s2, x_filter, correction, range_limit, permute_tbl); - int16x8_t d3 = - convolve8_8_2d_h(s3, x_filter, correction, range_limit, permute_tbl); + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, permute_tbl); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, permute_tbl); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, permute_tbl); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -1308,8 +1264,7 @@ static INLINE void convolve_2d_sr_horiz_8tap_neon_dotprod( do { uint8x16_t s0 = vld1q_u8(s); - int16x8_t d0 = - convolve8_8_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl); vst1q_s16(d, d0); s += 8; diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index 60cdfdfc0a..c3d4c94c77 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -35,21 +35,17 @@ static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, const int8x16_t filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum; - // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; - // First 4 output values. - sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0); - sum = vusdotq_laneq_s32(sum, permuted_samples[1], filter, 1); - sum = vusdotq_laneq_s32(sum, permuted_samples[2], filter, 2); + int32x4_t sum = vusdotq_laneq_s32(horiz_const, perm_samples[0], filter, 0); + sum = vusdotq_laneq_s32(sum, perm_samples[1], filter, 1); + sum = vusdotq_laneq_s32(sum, perm_samples[2], filter, 2); return vqrshrn_n_s32(sum, FILTER_BITS); } @@ -58,31 +54,29 @@ static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], const int8x16_t filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[4]; - int32x4_t sum[2]; - // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } - permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); + uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), + vqtbl1q_u8(samples[0], permute_tbl.val[1]), + vqtbl1q_u8(samples[0], permute_tbl.val[2]), + vqtbl1q_u8(samples[1], permute_tbl.val[2]) }; - // First 4 output values. - sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2); - // Second 4 output values. - sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filter, 0); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2); + int32x4_t sum0123 = + vusdotq_laneq_s32(horiz_const, perm_samples[0], filter, 0); + sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[1], filter, 1); + sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[2], filter, 2); + + int32x4_t sum4567 = + vusdotq_laneq_s32(horiz_const, perm_samples[1], filter, 0); + sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[2], filter, 1); + sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[3], filter, 2); // Narrow and re-pack. - int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS), - vqrshrn_n_s32(sum[1], FILTER_BITS)); + int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS), + vqrshrn_n_s32(sum4567, FILTER_BITS)); return vqmovun_s16(sum_s16); } @@ -160,15 +154,15 @@ static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, const uint8x16_t permute_tbl) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); // Dot product constants: // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. Halve the total because we will halve the filter values. + // ROUND0_BITS. Halve the total because we halved the filter values. int32x4_t acc = vdupq_n_s32((1 << (ROUND0_BITS - 1)) / 2); - int32x4_t sum = vusdotq_lane_s32(acc, permuted_samples, filters, 0); + int32x4_t sum = vusdotq_lane_s32(acc, perm_samples, filters, 0); // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); @@ -180,23 +174,21 @@ static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), - vqtbl1q_u8(samples, permute_tbl.val[1]) }; + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; // Dot product constants: // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. Halve the total because we will halve the filter values. + // ROUND0_BITS. Halve the total because we halved the filter values. int32x4_t acc = vdupq_n_s32((1 << (ROUND0_BITS - 1)) / 2); - // First 4 output values. - int32x4_t sum0 = vusdotq_lane_s32(acc, permuted_samples[0], filters, 0); - // Second 4 output values. - int32x4_t sum1 = vusdotq_lane_s32(acc, permuted_samples[1], filters, 0); + int32x4_t sum0123 = vusdotq_lane_s32(acc, perm_samples[0], filters, 0); + int32x4_t sum4567 = vusdotq_lane_s32(acc, perm_samples[1], filters, 0); // Narrow and re-pack. - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } @@ -262,25 +254,21 @@ static INLINE void convolve_x_sr_4tap_neon_i8mm( static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum[2]; - // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; + + int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filter, 0); + sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filter, 1); - // First 4 output values. - sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filter, 0); - sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filter, 1); - // Second 4 output values. - sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filter, 0); - sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filter, 1); + int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filter, 0); + sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filter, 1); - int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1])); + int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the convolution filter values so - 1 from the right shift. return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); } @@ -316,17 +304,17 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, return; } + // Filter values are even, so halve to reduce intermediate precision reqs. 
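+  // The halved taps sum to 64 rather than 128, which is why the kernels
+  // below shift by FILTER_BITS - 1 instead of FILTER_BITS when narrowing.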
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single // rounding right shift by FILTER_BITS - instead of a first rounding right // shift by ROUND0_BITS, followed by second rounding right shift by // FILTER_BITS - ROUND0_BITS. - // The outermost -1 is needed because we will halve the filter values. + // The outermost -1 is needed because we halved the filter values. const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1)); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - do { const uint8_t *s = src; uint8_t *d = dst; @@ -787,33 +775,34 @@ static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum[2]; - // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; - // First 4 output values. - sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0); - sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1); - // Second 4 output values. - sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0); - sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1); + int32x4_t sum0123 = + vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); + sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filters, 1); + + int32x4_t sum4567 = + vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0); + sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filters, 1); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. - return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), - vshrn_n_s32(sum[1], ROUND0_BITS - 1)); + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static INLINE void convolve_2d_sr_horiz_8tap_neon_i8mm( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + const int bd = 8; // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. @@ -827,9 +816,6 @@ static INLINE void convolve_2d_sr_horiz_8tap_neon_i8mm( int height = im_h; const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; @@ -880,9 +866,9 @@ static INLINE int16x4_t convolve4_4_2d_h(const uint8x16_t samples, const int32x4_t horiz_const) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); - int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filters, 0); + int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples, filters, 0); // We halved the convolution filter values so -1 from the right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); @@ -895,13 +881,13 @@ static INLINE int16x8_t convolve4_8_2d_h(const uint8x16_t samples, // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), - vqtbl1q_u8(samples, permute_tbl.val[1]) }; + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; int32x4_t sum0123 = - vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0); + vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); int32x4_t sum4567 = - vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0); + vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0); // Narrow and re-pack. // We halved the filter values so -1 from right shift. @@ -920,7 +906,7 @@ static INLINE void convolve_2d_sr_horiz_4tap_neon_i8mm( // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. - // Halve the total because we will halve the filter values. + // Halve the total because we halved the filter values. const int32x4_t horiz_const = vdupq_n_s32( (((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2)); diff --git a/av1/common/arm/convolve_neon_i8mm.h b/av1/common/arm/convolve_neon_i8mm.h index da55a2e796..15a8a4e98c 100644 --- a/av1/common/arm/convolve_neon_i8mm.h +++ b/av1/common/arm/convolve_neon_i8mm.h @@ -33,21 +33,17 @@ static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, const int8x16_t filters, const uint8x16x3_t permute_tbl, int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum; - // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; - // First 4 output values. - sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); - sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1); - sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2); + int32x4_t sum = vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0); + sum = vusdotq_laneq_s32(sum, perm_samples[1], filters, 1); + sum = vusdotq_laneq_s32(sum, perm_samples[2], filters, 2); // Narrow and re-pack. 
return vshrn_n_s32(sum, ROUND0_BITS); @@ -57,31 +53,29 @@ static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], const int8x16_t filters, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[4]; - int32x4_t sum[2]; - // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } - permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); + uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), + vqtbl1q_u8(samples[0], permute_tbl.val[1]), + vqtbl1q_u8(samples[0], permute_tbl.val[2]), + vqtbl1q_u8(samples[1], permute_tbl.val[2]) }; + + int32x4_t sum0123 = + vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0); + sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[1], filters, 1); + sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[2], filters, 2); - // First 4 output values. - sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); - // Second 4 output values. - sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); + int32x4_t sum4567 = + vusdotq_laneq_s32(horiz_const, perm_samples[1], filters, 0); + sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[2], filters, 1); + sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[3], filters, 2); // Narrow and re-pack. - return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), - vshrn_n_s32(sum[1], ROUND0_BITS)); + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS), + vshrn_n_s32(sum4567, ROUND0_BITS)); } static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( -- GitLab From 4caad76a74e1330fd790291e28a79e2cd8a781e1 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 10 May 2024 11:43:17 +0100 Subject: [PATCH 147/391] Add merged impl of 6tap av1_convolve_2d_sr_neon_dotprod Merge the horizontal and vertical passes of 2D convolution for 6tap filters, avoiding the use of an intermediate buffer. This gives around 10% uplift over the split implementation. 
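For illustration only (not part of this change): a minimal scalar model of the merged-pass structure, with toy filter taps and hypothetical names, assuming h is a multiple of 4 and the source has h + 5 valid rows. Five rows are filtered horizontally up front; each loop iteration then filters four more rows and slides a small window, so no intermediate block buffer is needed.

    #include <string.h>

    static int hfilt6(const unsigned char *s) {  // toy 6-tap horizontal filter
      static const int k[6] = { 1, -3, 10, 10, -3, 1 };
      int sum = 0;
      for (int i = 0; i < 6; ++i) sum += k[i] * s[i];
      return sum;
    }

    static int vfilt6(const int *v) {  // toy 6-tap vertical filter
      static const int k[6] = { 1, -3, 10, 10, -3, 1 };
      int sum = 0;
      for (int i = 0; i < 6; ++i) sum += k[i] * v[i];
      return sum >> 8;
    }

    static void merged_2d_sketch(const unsigned char *src, int src_stride,
                                 int *dst, int dst_stride, int w, int h) {
      for (int x = 0; x < w; ++x) {
        int win[9];  // sliding window of horizontally filtered samples
        for (int r = 0; r < 5; ++r) win[r] = hfilt6(&src[r * src_stride + x]);
        for (int y = 0; y < h; y += 4) {
          for (int r = 0; r < 4; ++r)
            win[5 + r] = hfilt6(&src[(y + 5 + r) * src_stride + x]);
          for (int r = 0; r < 4; ++r)  // 6-tap vertical pass on the window
            dst[(y + r) * dst_stride + x] = vfilt6(&win[r]);
          memmove(win, win + 4, 5 * sizeof(win[0]));  // slide the window down
        }
      }
    }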
Change-Id: I89546369b9b04d460696f29a09bf2a62a9ea123c --- aom_dsp/arm/mem_neon.h | 15 +++++ av1/common/arm/convolve_neon_dotprod.c | 80 ++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index 46aa16e61d..1aebcf951a 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h @@ -1053,6 +1053,21 @@ static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p, *s7 = vld1q_u8(s); } +static INLINE void load_u8_16x5(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); +} + static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3) { diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index 20da29880d..964270b363 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c @@ -1276,6 +1276,80 @@ static INLINE void convolve_2d_sr_horiz_8tap_neon_dotprod( } while (--height != 0); } +static INLINE void convolve_2d_sr_6tap_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + const int bd = 8; + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Halve the total because we halved the filter values. 
+ const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4; + load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4); + s += 5 * src_stride; + + int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, correction, permute_tbl); + int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, correction, permute_tbl); + int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, correction, permute_tbl); + int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, correction, permute_tbl); + int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, correction, permute_tbl); + + do { + uint8x16_t h_s5, h_s6, h_s7, h_s8; + load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8); + + int16x8_t v_s5 = + convolve8_8_2d_h(h_s5, x_filter, correction, permute_tbl); + int16x8_t v_s6 = + convolve8_8_2d_h(h_s6, x_filter, correction, permute_tbl); + int16x8_t v_s7 = + convolve8_8_2d_h(h_s7, x_filter, correction, permute_tbl); + int16x8_t v_s8 = + convolve8_8_2d_h(h_s8, x_filter, correction, permute_tbl); + + uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + y_filter, vert_const); + uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + y_filter, vert_const); + uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + y_filter, vert_const); + uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + y_filter, vert_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -1320,6 +1394,12 @@ void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_0_7, y_filter_8_11); } else { + if (x_filter_taps >= 6 && y_filter_taps == 6) { + convolve_2d_sr_6tap_neon_dotprod(src_ptr, src_stride, dst, dst_stride, w, + h, x_filter_ptr, y_filter_ptr); + return; + } + DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); -- GitLab From 069d267d7c4b0738d9a46d4be0c48b8d67e42003 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Thu, 9 May 2024 14:52:38 +0100 Subject: [PATCH 148/391] Add merged impl of 6-tap av1_convolve_2d_sr_neon_i8mm Merge the horizontal and vertical passes of av1_convolve_2d_sr_neon_i8mm for 6-tap filters, avoiding the use of an intermediate buffer. This gives around 10% uplift over the split implementation.
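As an aside, the usdot (i8mm) path needs a different accumulator constant than the sdot (dotprod) path in the previous patch: usdot multiplies unsigned pixels directly, so no 128 << FILTER_BITS range-transform term is required. A small standalone check of both constants, assuming libaom's FILTER_BITS == 7 and ROUND0_BITS == 3 (the local names here are illustrative):

    #include <stdio.h>

    int main(void) {
      const int bd = 8, kFilterBits = 7, kRound0Bits = 3;
      // dotprod (sdot) variant: shim plus the signed range-transform term.
      const int horiz_const =
          (1 << (bd + kFilterBits - 1)) + (1 << (kRound0Bits - 1));
      const int correction = ((128 << kFilterBits) + horiz_const) / 2;
      // i8mm (usdot) variant: pixels stay unsigned, only the halved shim.
      const int i8mm_const =
          (1 << (bd + kFilterBits - 2)) + (1 << ((kRound0Bits - 1) - 1));
      printf("dotprod correction: %d\n", correction);  // 16386
      printf("i8mm horiz_const:   %d\n", i8mm_const);  // 8194
      return 0;
    }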
Change-Id: I34d5dc819bdc36f04ac172bce349257f8f7887d2 --- av1/common/arm/convolve_neon_i8mm.c | 79 +++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index c3d4c94c77..8f54b64fa6 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -983,6 +983,79 @@ static INLINE void convolve_2d_sr_horiz_4tap_neon_i8mm( } } +static INLINE void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src, + int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter_ptr, + const int16_t *y_filter_ptr) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + const int bd = 8; + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The outermost -1 is needed because we halved the filter values. + const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4; + load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4); + s += 5 * src_stride; + + int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); + int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); + int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); + int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); + int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); + + do { + uint8x16_t h_s5, h_s6, h_s7, h_s8; + load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8); + + int16x8_t v_s5 = + convolve8_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); + int16x8_t v_s6 = + convolve8_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); + int16x8_t v_s7 = + convolve8_8_2d_h(h_s7, x_filter, permute_tbl, horiz_const); + int16x8_t v_s8 = + convolve8_8_2d_h(h_s8, x_filter, permute_tbl, horiz_const); + + uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + y_filter, vert_const); + uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + y_filter, vert_const); + uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + y_filter, vert_const); + uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + y_filter, vert_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -1029,6 +1102,12 @@ void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + if (y_filter_taps == 6 && x_filter_taps >= 6) { + convolve_2d_sr_6tap_neon_i8mm(src_ptr, src_stride, dst, dst_stride, w, h, + x_filter_ptr, y_filter_ptr); + return; + } + if (x_filter_taps <= 
4) { convolve_2d_sr_horiz_4tap_neon_i8mm(src_ptr + 2, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); -- GitLab From 49d02208d85c05eb000f4326a15c1a9c5f4e5e2e Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 10 May 2024 15:12:57 +0100 Subject: [PATCH 149/391] Add 4-tap merged impl of av1_convolve_2d_sr_neon_dotprod Merge the vertical and horizontal passes of av1_convolve_2d_sr_neon_dotprod for 4-tap filters, avoiding the use of an intermediate buffer. This gives around 10% uplift over the split implementation. Change-Id: Id8a1d16a892827109d210b34ba34043c46227e53 --- aom_dsp/arm/mem_neon.h | 10 ++ av1/common/arm/convolve_neon_dotprod.c | 131 +++++++++++++++++++++++++ 2 files changed, 141 insertions(+) diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index 1aebcf951a..b5deb9ca34 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h @@ -1080,6 +1080,16 @@ static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, *s3 = vld1q_u8(s); } +static INLINE void load_u8_16x3(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); +} + static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index 964270b363..32b056dc29 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c @@ -1350,6 +1350,131 @@ static INLINE void convolve_2d_sr_6tap_neon_dotprod( } while (w != 0); } +static INLINE void convolve_2d_sr_4tap_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { + const int bd = 8; + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); + + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1); + + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Halve the total because we halved the filter values. 
+ const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t v_s0 = convolve4_4_2d_h(h_s0, x_filter, permute_tbl, correction); + int16x4_t v_s1 = convolve4_4_2d_h(h_s1, x_filter, permute_tbl, correction); + int16x4_t v_s2 = convolve4_4_2d_h(h_s2, x_filter, permute_tbl, correction); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t v_s3 = + convolve4_4_2d_h(h_s3, x_filter, permute_tbl, correction); + int16x4_t v_s4 = + convolve4_4_2d_h(h_s4, x_filter, permute_tbl, correction); + int16x4_t v_s5 = + convolve4_4_2d_h(h_s5, x_filter, permute_tbl, correction); + int16x4_t v_s6 = + convolve4_4_2d_h(h_s6, x_filter, permute_tbl, correction); + + int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter); + int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter); + int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter); + int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter); + + uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x8_t v_s0 = + convolve4_8_2d_h(h_s0, x_filter, permute_tbl, correction); + int16x8_t v_s1 = + convolve4_8_2d_h(h_s1, x_filter, permute_tbl, correction); + int16x8_t v_s2 = + convolve4_8_2d_h(h_s2, x_filter, permute_tbl, correction); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x8_t v_s3 = + convolve4_8_2d_h(h_s3, x_filter, permute_tbl, correction); + int16x8_t v_s4 = + convolve4_8_2d_h(h_s4, x_filter, permute_tbl, correction); + int16x8_t v_s5 = + convolve4_8_2d_h(h_s5, x_filter, permute_tbl, correction); + int16x8_t v_s6 = + convolve4_8_2d_h(h_s6, x_filter, permute_tbl, correction); + + uint8x8_t d0 = + convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const); + uint8x8_t d1 = + convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const); + uint8x8_t d2 = + convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); + uint8x8_t d3 = + convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -1400,6 +1525,12 @@ void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, return; } + if (x_filter_taps <= 4 && y_filter_taps <= 4) { + convolve_2d_sr_4tap_neon_dotprod(src_ptr + 2, src_stride, dst, dst_stride, + w, h, 
x_filter_ptr, y_filter_ptr); + return; + } + DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); -- GitLab From b0cc22b3e5eb371f133af3780eb02c76a10cdc37 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 10 May 2024 15:55:41 +0100 Subject: [PATCH 150/391] Add 4-tap merged impl of av1_convolve_2d_sr_neon_i8mm Merge the horizontal and vertical passes of av1_convolve_2d_sr_neon_i8mm for 4-tap filters, avoiding the use of an intermediate buffer. This gives around 10% uplift over the split implementation. Change-Id: I4dcdd8521a709649449f5dd737bb7c6cdc0c2b6f --- av1/common/arm/convolve_neon_i8mm.c | 130 ++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index 8f54b64fa6..cd989cb1da 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -1056,6 +1056,130 @@ static INLINE void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src, } while (w != 0); } +static INLINE void convolve_2d_sr_4tap_neon_i8mm(const uint8_t *src, + int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter_ptr, + const int16_t *y_filter_ptr) { + const int bd = 8; + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); + + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1); + + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we halved the filter values. 
+ const int32x4_t horiz_const = vdupq_n_s32( + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2); + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t v_s0 = convolve4_4_2d_h(h_s0, x_filter, permute_tbl, horiz_const); + int16x4_t v_s1 = convolve4_4_2d_h(h_s1, x_filter, permute_tbl, horiz_const); + int16x4_t v_s2 = convolve4_4_2d_h(h_s2, x_filter, permute_tbl, horiz_const); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t v_s3 = + convolve4_4_2d_h(h_s3, x_filter, permute_tbl, horiz_const); + int16x4_t v_s4 = + convolve4_4_2d_h(h_s4, x_filter, permute_tbl, horiz_const); + int16x4_t v_s5 = + convolve4_4_2d_h(h_s5, x_filter, permute_tbl, horiz_const); + int16x4_t v_s6 = + convolve4_4_2d_h(h_s6, x_filter, permute_tbl, horiz_const); + + int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter); + int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter); + int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter); + int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter); + + uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x8_t v_s0 = + convolve4_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); + int16x8_t v_s1 = + convolve4_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); + int16x8_t v_s2 = + convolve4_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x8_t v_s3 = + convolve4_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); + int16x8_t v_s4 = + convolve4_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); + int16x8_t v_s5 = + convolve4_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); + int16x8_t v_s6 = + convolve4_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); + + uint8x8_t d0 = + convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const); + uint8x8_t d1 = + convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const); + uint8x8_t d2 = + convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); + uint8x8_t d3 = + convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -1108,6 +1232,12 @@ void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, return; } + if (y_filter_taps <= 4 && x_filter_taps <= 4) { + convolve_2d_sr_4tap_neon_i8mm(src_ptr + 2, 
src_stride, dst, dst_stride, w, + h, x_filter_ptr, y_filter_ptr); + return; + } + if (x_filter_taps <= 4) { convolve_2d_sr_horiz_4tap_neon_i8mm(src_ptr + 2, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); -- GitLab From 699cefee092b8e2f35f580f44c5be709c569722f Mon Sep 17 00:00:00 2001 From: Paul Wilkins <paulwilkins@google.com> Date: Wed, 15 May 2024 12:28:25 +0100 Subject: [PATCH 151/391] Fix high target data rate overflow. These changes fix issues that can occur if the user specifies a very high target data rate or rate per frame. Fixes some issues with overflow of int variables used to hold bitrate values (rate per second, rate per frame, etc.). This patch also imposes a new maximum for the passed-in target bitrate. This value is passed in kbits (so multiplied by 1000 internally). Change-Id: Ia3b60bf1110d85cdce161492561bdda1cff61c63 --- aom/aom_encoder.h | 1 + av1/av1_cx_iface.c | 3 ++- av1/encoder/encoder.h | 2 +- av1/encoder/pass2_strategy.c | 16 ++++++++++------ av1/encoder/ratectrl.c | 9 +++++---- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h index 9bdadd6938..15cf21b040 100644 --- a/aom/aom_encoder.h +++ b/aom/aom_encoder.h @@ -637,6 +637,7 @@ typedef struct aom_codec_enc_cfg { /*!\brief Target data rate * * Target bitrate to use for this stream, in kilobits per second. + * Max allowed value is 2000000 */ unsigned int rc_target_bitrate; diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index c09a02c819..c26f2aafb0 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -674,6 +674,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1); + RANGE_CHECK_HI(cfg, rc_target_bitrate, 2000000); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); RANGE_CHECK_BOOL(extra_cfg, lossless); @@ -3331,7 +3332,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, if (ppi->cpi->oxcf.pass != 1) { ppi->total_time_compress_data += cpi->time_compress_data; ppi->total_recode_hits += cpi->frame_recode_hits; - ppi->total_bytes += cpi->bytes; + ppi->total_bytes += (uint64_t)cpi->bytes; for (int i = 0; i < MAX_MODES; i++) { ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i]; } diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h index a919bd906a..b0fc5cd78a 100644 --- a/av1/encoder/encoder.h +++ b/av1/encoder/encoder.h @@ -2793,7 +2793,7 @@ typedef struct AV1_PRIMARY { double total_blockiness; double worst_blockiness; - int total_bytes; + uint64_t total_bytes; double summed_quality; double summed_weights; double summed_quality_hbd; diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c index bd8620c2be..8618212f66 100644 --- a/av1/encoder/pass2_strategy.c +++ b/av1/encoder/pass2_strategy.c @@ -268,9 +268,12 @@ static double calc_correction_factor(double err_per_mb, int q) { // Similar to find_qindex_by_rate() function in ratectrl.c, but includes // calculation of a correction_factor.
-static int find_qindex_by_rate_with_correction( - int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb, - double group_weight_factor, int best_qindex, int worst_qindex) { +static int find_qindex_by_rate_with_correction(uint64_t desired_bits_per_mb, + aom_bit_depth_t bit_depth, + double error_per_mb, + double group_weight_factor, + int best_qindex, + int worst_qindex) { assert(best_qindex <= worst_qindex); int low = best_qindex; int high = worst_qindex; @@ -279,7 +282,8 @@ static int find_qindex_by_rate_with_correction( const int mid = (low + high) >> 1; const double q_factor = calc_correction_factor(error_per_mb, mid); const double q = av1_convert_qindex_to_q(mid, bit_depth); - const int mid_bits_per_mb = (int)((q_factor * group_weight_factor) / q); + const uint64_t mid_bits_per_mb = + (uint64_t)((q_factor * group_weight_factor) / q); if (mid_bits_per_mb > desired_bits_per_mb) { low = mid + 1; @@ -328,8 +332,8 @@ static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err, : cpi->common.mi_params.MBs; const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone); - const int target_norm_bits_per_mb = - (int)((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs; + const uint64_t target_norm_bits_per_mb = + ((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs; int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct); // Update bpm correction factor based on previous GOP rate error. diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 9daeb45c89..a32e0b5ba7 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -2550,7 +2550,7 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi, void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - int vbr_max_bits; + int64_t vbr_max_bits; const int MBs = av1_get_MBs(width, height); rc->avg_frame_bandwidth = @@ -2569,10 +2569,11 @@ void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) { // be acheived because of a user specificed max q (e.g. when the user // specifies lossless encode. vbr_max_bits = - (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) / - 100); + ((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) / 100; + vbr_max_bits = (vbr_max_bits < INT_MAX) ? vbr_max_bits : INT_MAX; + rc->max_frame_bandwidth = - AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); + AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), (int)vbr_max_bits); av1_rc_set_gf_interval_range(cpi, rc); } -- GitLab From 22bcd3de8f4fc43c4e87ef9dedfb40594fb1569b Mon Sep 17 00:00:00 2001 From: Debargha Mukherjee <debargha@google.com> Date: Thu, 23 May 2024 01:24:40 +0000 Subject: [PATCH 152/391] Increase scaling in linsolve_wiener Reduces likelihood of overflows and re-enables previously disabled overflow tests. 
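For background, a toy model of why the power-of-two pre-scaling is sound (purely illustrative; the real linsolve_wiener() applies its scaler_A/scaler_c to the Wiener normal equations inside Gaussian elimination, and its structure differs from this sketch): dividing A by scaler_A and b by scaler_A * scaler_c leaves the solution scaled by exactly 1/scaler_c, which can be undone afterwards, while every intermediate product is smaller.

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
      // Toy 2x2 system A*x = b with exact solution x = (1, 3).
      int64_t A[4] = { 2 << 20, 1 << 20, 1 << 20, 3 << 20 };
      int64_t b[2] = { 5 << 20, 10 << 20 };
      const int scaler_A = 1 << 6, scaler_c = 1 << 7;
      for (int i = 0; i < 4; ++i) A[i] /= scaler_A;
      for (int i = 0; i < 2; ++i) b[i] /= (int64_t)scaler_A * scaler_c;
      // Cramer's rule on the scaled system; multiply by scaler_c to undo.
      const int64_t det = A[0] * A[3] - A[1] * A[2];
      const int64_t x0 = (b[0] * A[3] - A[1] * b[1]) * scaler_c / det;
      const int64_t x1 = (A[0] * b[1] - b[0] * A[2]) * scaler_c / det;
      printf("x = (%" PRId64 ", %" PRId64 ")\n", x0, x1);  // x = (1, 3)
      return 0;
    }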
Bug: b:330639949, oss-fuzz:68195 Change-Id: I2c3dc9bd5783836d65b3c7a4452061b27cf4c82c --- av1/encoder/pickrst.c | 2 +- test/wiener_test.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c index 7c30e3a9d9..0b0ca1c8e0 100644 --- a/av1/encoder/pickrst.c +++ b/av1/encoder/pickrst.c @@ -1179,7 +1179,7 @@ static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b, if (abs_akj > max_abs_akj) max_abs_akj = abs_akj; } const int scale_threshold = 1 << 22; - const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 5); + const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 6); const int scaler_c = max_abs_akj < scale_threshold ? 1 : (1 << 7); const int scaler = scaler_c * scaler_A; diff --git a/test/wiener_test.cc b/test/wiener_test.cc index d018d8021b..4508af227f 100644 --- a/test/wiener_test.cc +++ b/test/wiener_test.cc @@ -1721,7 +1721,7 @@ TEST(SearchWienerTest, 8bitSignedIntegerOverflowInUpdateBSepSym) { // A test that reproduces crbug.com/oss-fuzz/68195: signed integer overflow in // linsolve_wiener(). -TEST(SearchWienerTest, DISABLED_8bitSignedIntegerOverflowInLinsolveWiener) { +TEST(SearchWienerTest, 8bitSignedIntegerOverflowInLinsolveWiener) { constexpr int kWidth = 4; constexpr int kHeight = 3; constexpr unsigned char kBuffer[kWidth * kHeight] = { @@ -1889,7 +1889,7 @@ TEST(SearchWienerTest, 10bitSignedIntegerOverflowInLinsolveWiener) { // A test that reproduces b/330639949: signed integer overflow in // linsolve_wiener(). -TEST(SearchWienerTest, DISABLED_12bitSignedIntegerOverflowInLinsolveWiener) { +TEST(SearchWienerTest, 12bitSignedIntegerOverflowInLinsolveWiener) { constexpr int kWidth = 173; constexpr int kHeight = 3; // Since the image format is YUV 4:2:0, aom_img_wrap() expects the buffer is -- GitLab From eac59789e01f137d94dcb39a03eaf33db2870dec Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Wed, 22 May 2024 15:49:43 -0700 Subject: [PATCH 153/391] Fix two UBSan errors in av1_rc_update_framerate() Fix UBSan errors in the calculations of rc->avg_frame_bandwidth and rc->min_frame_bandwidth in av1_rc_update_framerate(). 
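A standalone demonstration of the pattern used in this fix (the helper name and constants here are hypothetical): widen to int64_t before the multiply, then clamp to INT_MAX before narrowing back to int.

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    // Widen, multiply, divide, then clamp - mirrors the vbr_min_bits /
    // vbr_max_bits calculations in av1_rc_update_framerate().
    static int scaled_bandwidth(int avg_frame_bandwidth, int section_pct) {
      int64_t bits = (int64_t)avg_frame_bandwidth * section_pct / 100;
      if (bits > INT_MAX) bits = INT_MAX;  // AOMMIN(bits, INT_MAX) in libaom
      return (int)bits;
    }

    int main(void) {
      // A near-INT_MAX frame bandwidth with a >100% section percentage
      // would overflow a plain int multiply; the widened form clamps.
      printf("%d\n", scaled_bandwidth(2000000000, 110));  // 2147483647
      return 0;
    }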
Bug: aomedia:3509 Change-Id: I3e3d560444b12b4911bc2317ae32f0e3cad8a505 --- av1/encoder/ratectrl.c | 21 +++++++------ test/encode_api_test.cc | 70 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index a32e0b5ba7..af707be5b5 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -2550,16 +2550,17 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi, void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - int64_t vbr_max_bits; const int MBs = av1_get_MBs(width, height); - rc->avg_frame_bandwidth = - (int)round(oxcf->rc_cfg.target_bandwidth / cpi->framerate); - rc->min_frame_bandwidth = - (int)(rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100); + const double avg_frame_bandwidth = + round(oxcf->rc_cfg.target_bandwidth / cpi->framerate); + rc->avg_frame_bandwidth = (int)AOMMIN(avg_frame_bandwidth, INT_MAX); - rc->min_frame_bandwidth = - AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); + int64_t vbr_min_bits = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100; + vbr_min_bits = AOMMIN(vbr_min_bits, INT_MAX); + + rc->min_frame_bandwidth = AOMMAX((int)vbr_min_bits, FRAME_OVERHEAD_BITS); // A maximum bitrate for a frame is defined. // The baseline for this aligns with HW implementations that @@ -2568,9 +2569,9 @@ void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) { // a very high rate is given on the command line or the the rate cannnot // be acheived because of a user specificed max q (e.g. when the user // specifies lossless encode. - vbr_max_bits = - ((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) / 100; - vbr_max_bits = (vbr_max_bits < INT_MAX) ? vbr_max_bits : INT_MAX; + int64_t vbr_max_bits = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section / 100; + vbr_max_bits = AOMMIN(vbr_max_bits, INT_MAX); rc->max_frame_bandwidth = AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), (int)vbr_max_bits); diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 27bcbc14c1..379d8d6821 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -776,6 +776,76 @@ TEST(EncodeAPI, Buganizer339877165) { aom_codec_destroy(&enc); } +TEST(EncodeAPI, AomediaIssue3509VbrMinSection2Percent) { + // Initialize libaom encoder. + aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 1920; + cfg.g_h = 1080; + cfg.rc_target_bitrate = 1000000; + // Set this to more than 1 percent to cause a signed integer overflow in the + // multiplication rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section in + // av1_rc_update_framerate() if the multiplication is done in the `int` type. + cfg.rc_2pass_vbr_minsection_pct = 2; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + // `duration` can go as high as 300, but the UBSan error is gone if + // `duration` is 301 or higher. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, /*duration=*/300, 0), + AOM_CODEC_OK); + + // Free resources. 
+ aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} + +TEST(EncodeAPI, AomediaIssue3509VbrMinSection101Percent) { + // Initialize libaom encoder. + aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 1920; + cfg.g_h = 1080; + cfg.rc_target_bitrate = 1000000; + // Set this to more than 100 percent to cause an error when vbr_min_bits is + // cast to `int` in av1_rc_update_framerate() if vbr_min_bits is not clamped + // to INT_MAX. + cfg.rc_2pass_vbr_minsection_pct = 101; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + // `duration` can go as high as 300, but the UBSan error is gone if + // `duration` is 301 or higher. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, /*duration=*/300, 0), + AOM_CODEC_OK); + + // Free resources. + aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} + class EncodeAPIParameterized : public testing::TestWithParam<std::tuple< /*usage=*/unsigned int, /*speed=*/int, /*aq_mode=*/unsigned int>> {}; -- GitLab From ea4e993f7be4fc956b8293b0ca258f873894466c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Wed, 22 May 2024 16:01:04 -0700 Subject: [PATCH 154/391] Fix a typo in CpuSpeedTest::TestTuneScreen() Change the second rc_2pass_vbr_minsection_pct to rc_2pass_vbr_maxsection_pct. This copy-and-paste error was introduced in https://chromium-review.googlesource.com/c/webm/libvpx/+/332653. Change-Id: I3469fa7d23562f0e2217c6be0d7e045f359f2022 --- test/cpu_speed_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index b5f5d2974d..972d800270 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc @@ -107,7 +107,7 @@ void CpuSpeedTest::TestTuneScreen() { ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3); cfg_.g_timebase = video.timebase(); cfg_.rc_2pass_vbr_minsection_pct = 5; - cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; cfg_.rc_target_bitrate = 2000; cfg_.rc_max_quantizer = 63; cfg_.rc_min_quantizer = 0; -- GitLab From c1c1a716fcbe67ab3bf9c8ad4b6ac647e84d32b0 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 23 May 2024 15:16:32 -0700 Subject: [PATCH 155/391] {aarch*,arm}_cpudetect: align define with comment ANDROID_USE_CPU_FEATURES_LIB -> AOM_USE_ANDROID_CPU_FEATURES Change-Id: I2d425cf3cd28219e570efb0c442b33f1a64447ae --- aom_ports/aarch32_cpudetect.c | 2 +- aom_ports/aarch64_cpudetect.c | 2 +- aom_ports/arm_cpudetect.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aom_ports/aarch32_cpudetect.c b/aom_ports/aarch32_cpudetect.c index 753f957112..809bae5920 100644 --- a/aom_ports/aarch32_cpudetect.c +++ b/aom_ports/aarch32_cpudetect.c @@ -44,7 +44,7 @@ static int arm_get_cpu_caps(void) { return flags; } -#elif defined(ANDROID_USE_CPU_FEATURES_LIB) +#elif defined(AOM_USE_ANDROID_CPU_FEATURES) static int arm_get_cpu_caps(void) { int flags = 0; diff --git a/aom_ports/aarch64_cpudetect.c b/aom_ports/aarch64_cpudetect.c index 159e5b1008..e356763901 100644 --- a/aom_ports/aarch64_cpudetect.c +++ b/aom_ports/aarch64_cpudetect.c @@ -89,7 +89,7 @@ static int arm_get_cpu_caps(void) { return flags; } -#elif 
defined(ANDROID_USE_CPU_FEATURES_LIB) +#elif defined(AOM_USE_ANDROID_CPU_FEATURES) static int arm_get_cpu_caps(void) { int flags = 0; diff --git a/aom_ports/arm_cpudetect.h b/aom_ports/arm_cpudetect.h index 33c2d1bb6a..2b63942424 100644 --- a/aom_ports/arm_cpudetect.h +++ b/aom_ports/arm_cpudetect.h @@ -32,7 +32,7 @@ #endif #if defined(__ANDROID__) && (__ANDROID_API__ < 18) -#define ANDROID_USE_CPU_FEATURES_LIB 1 +#define AOM_USE_ANDROID_CPU_FEATURES 1 // Use getauxval() when targeting (64-bit) Android with API level >= 18. // getauxval() is supported since Android API level 18 (Android 4.3.) // First Android version with 64-bit support was Android 5.x (API level 21). -- GitLab From 6f8189bb64bbba13c0c7dd69231a3f44b6ac581b Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 17 May 2024 19:30:16 -0700 Subject: [PATCH 156/391] x86/synonyms.h: include emmintrin.h not immintrin.h The latter is meant for AVX(2); this file only uses SSE2 intrinsics. + fix includes in some consumers of this header that were relying on immintrin.h to provide <= AVX2 intrinsics Change-Id: I784b89feb630a2bad1e727ec66ccc41526ebd357 --- aom_dsp/x86/masked_sad_intrin_avx2.c | 2 +- aom_dsp/x86/synonyms.h | 2 +- av1/encoder/x86/pickrst_sse4.c | 2 +- av1/encoder/x86/rdopt_sse4.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aom_dsp/x86/masked_sad_intrin_avx2.c b/aom_dsp/x86/masked_sad_intrin_avx2.c index 2c022555b5..f3751c7cb0 100644 --- a/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include <tmmintrin.h> +#include <immintrin.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h index ab13446b8d..0914b98b79 100644 --- a/aom_dsp/x86/synonyms.h +++ b/aom_dsp/x86/synonyms.h @@ -12,7 +12,7 @@ #ifndef AOM_AOM_DSP_X86_SYNONYMS_H_ #define AOM_AOM_DSP_X86_SYNONYMS_H_ -#include <immintrin.h> +#include <emmintrin.h> #include <string.h> #include "config/aom_config.h" diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c index 3617d33fef..af6706228a 100644 --- a/av1/encoder/x86/pickrst_sse4.c +++ b/av1/encoder/x86/pickrst_sse4.c @@ -10,7 +10,7 @@ */ #include <assert.h> -#include <emmintrin.h> +#include <smmintrin.h> #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c index 12ac146195..af61df102b 100644 --- a/av1/encoder/x86/rdopt_sse4.c +++ b/av1/encoder/x86/rdopt_sse4.c @@ -10,7 +10,7 @@ */ #include <assert.h> -#include <emmintrin.h> +#include <smmintrin.h> #include "aom_dsp/x86/synonyms.h" #include "config/av1_rtcd.h" -- GitLab From bdada5c710f882be00ad397da17733eeb4e81918 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Wed, 22 May 2024 14:30:55 -0700 Subject: [PATCH 157/391] Reduce encode_time on slide change for non_reference_frame For rtc-screen: for slide changes that occur on a non_reference frame (i.e., top temporal enhancement layer frames): add a speed feature to skip encoding the frame. This option is currently enabled by default. Regardless of this option: adjust some speed features and turn off palette to significantly reduce the encode_time spike for that frame, if it's not skip encoded.
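In outline, the new gate behaves as below (a minimal self-contained model with illustrative names; the real check lives in av1_choose_var_based_partitioning() and additionally fixes the superblock partitioning):

    #include <stdbool.h>
    #include <stdio.h>

    // Skip-encode decision for a superblock: a slide change (high source
    // SAD) on a frame that nothing else references can be force-skipped.
    static bool skip_encode_superblock(bool sf_enabled, bool high_source_sad,
                                       bool non_reference_frame) {
      return sf_enabled && high_source_sad && non_reference_frame;
    }

    int main(void) {
      printf("%d\n", skip_encode_superblock(true, true, true));   // 1: skip
      printf("%d\n", skip_encode_superblock(true, true, false));  // 0: encode
      return 0;
    }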
Change-Id: I70cefdfa7d158fd53e8c4720c7266fdd24fe9635 --- av1/encoder/nonrd_pickmode.c | 46 ++++++++++++++++++++++++++---------- av1/encoder/speed_features.c | 8 +++++++ av1/encoder/speed_features.h | 3 +++ av1/encoder/var_based_part.c | 12 ++++++++++ 4 files changed, 56 insertions(+), 13 deletions(-) diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 45d81aa9d5..317d5c7e66 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -2494,7 +2494,8 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( return true; // For screen content: skip mode testing based on source_sad. - if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + !x->force_zeromv_skip_for_blk) { // If source_sad is computed: skip non-zero motion // check for stationary (super)blocks. Otherwise if superblock // has motion skip the modes with zero motion on last reference @@ -3113,6 +3114,34 @@ static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd( } } +static AOM_INLINE bool enable_palette(AV1_COMP *cpi, bool is_mode_intra, + BLOCK_SIZE bsize, + unsigned int source_variance, + int force_zeromv_skip, + int skip_idtx_palette, + int force_palette_test) { + if (!cpi->oxcf.tool_cfg.enable_palette) return false; + if (!av1_allow_palette(cpi->common.features.allow_screen_content_tools, + bsize)) { + return false; + } + if (skip_idtx_palette) return false; + + if (cpi->sf.rt_sf.prune_palette_search_nonrd > 1 && + ((cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) || + bsize > BLOCK_16X16)) { + return false; + } + + if ((is_mode_intra || force_palette_test) && source_variance > 0 && + !force_zeromv_skip && + (cpi->rc.high_source_sad || source_variance > 300)) { + return true; + } else { + return false; + } +} + /*!\brief AV1 inter mode selection based on Non-RD optimized model. 
* * \ingroup nonrd_mode_search @@ -3468,18 +3497,9 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, x->content_state_sb.source_sad_nonrd != kZeroSad && !cpi->rc.high_source_sad; - int try_palette = - !skip_idtx_palette && cpi->oxcf.tool_cfg.enable_palette && - av1_allow_palette(cpi->common.features.allow_screen_content_tools, - mi->bsize); - try_palette = - try_palette && - (is_mode_intra(best_pickmode->best_mode) || force_palette_test) && - x->source_variance > 0 && !x->force_zeromv_skip_for_blk && - (cpi->rc.high_source_sad || x->source_variance > 300); - - if (rt_sf->prune_palette_search_nonrd > 1 && bsize > BLOCK_16X16) - try_palette = 0; + bool try_palette = enable_palette( + cpi, is_mode_intra(best_pickmode->best_mode), bsize, x->source_variance, + x->force_zeromv_skip_for_blk, skip_idtx_palette, force_palette_test); // Perform screen content mode evaluation for non-rd handle_screen_content_mode_nonrd( diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index a788af811b..a65ac3091b 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -1592,6 +1592,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->rt_sf.prune_h_pred_using_best_mode_so_far = true; sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true; } + sf->rt_sf.skip_encoding_non_reference_slide_change = 1; sf->rt_sf.skip_newmv_flat_blocks_screen = 1; sf->rt_sf.use_idtx_nonrd = 1; sf->rt_sf.higher_thresh_scene_detection = 0; @@ -1616,6 +1617,12 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->mv_sf.search_method = NSTEP; sf->rt_sf.fullpel_search_step_param = 2; } + if (cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) { + sf->rt_sf.use_idtx_nonrd = 0; + sf->rt_sf.prefer_large_partition_blocks = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + sf->rt_sf.fullpel_search_step_param = 10; + } sf->rt_sf.partition_direct_merging = 0; sf->hl_sf.accurate_bit_estimate = 0; // This feature is for nonrd_pickmode. @@ -2311,6 +2318,7 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { rt_sf->force_only_last_ref = 0; rt_sf->higher_thresh_scene_detection = 1; rt_sf->skip_newmv_flat_blocks_screen = 0; + rt_sf->skip_encoding_non_reference_slide_change = 0; } static fractional_mv_step_fp diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h index 77e66228a8..c768ff3944 100644 --- a/av1/encoder/speed_features.h +++ b/av1/encoder/speed_features.h @@ -1916,6 +1916,9 @@ typedef struct REAL_TIME_SPEED_FEATURES { // FLag to indicate skip testing of NEWMV for flat blocks. int skip_newmv_flat_blocks_screen; + + // Flag to force skip encoding for non_reference_frame on slide/scene changes. + int skip_encoding_non_reference_slide_change; } REAL_TIME_SPEED_FEATURES; /*!\endcond */ diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c index f664795153..2c9772dddb 100644 --- a/av1/encoder/var_based_part.c +++ b/av1/encoder/var_based_part.c @@ -27,6 +27,7 @@ #include "av1/common/blockd.h" #include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/var_based_part.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/rdopt_utils.h" @@ -1620,6 +1621,17 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, unsigned int y_sad_last = UINT_MAX; BLOCK_SIZE bsize = is_small_sb ? 
BLOCK_64X64 : BLOCK_128X128; + // Force skip encoding for all superblocks on slide change for + // non_reference_frames. + if (cpi->sf.rt_sf.skip_encoding_non_reference_slide_change && + cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) { + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + av1_set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); + x->force_zeromv_skip_for_sb = 1; + return 0; + } + // Ref frame used in partitioning. MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; -- GitLab From 249c4c48d3763cdf81413293461503533408830b Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Sat, 25 May 2024 09:33:19 -0700 Subject: [PATCH 158/391] synonyms.h: s/_mm_loadu_si64/_mm_loadl_epi64/ _mm_loadu_si64 is defined in immintrin.h. Use _mm_loadl_epi64() instead as the includes were changed in: 6f8189bb64 x86/synonyms.h: include emmintrin.h not immintrin.h Fixes Visual Studio build errors: aom_dsp/x86/synonyms.h(53,43): warning C4013: '_mm_loadu_si64' undefined; assuming extern returning int aom_dsp/x86/synonyms.h(53,43): warning C4024: '_mm_unpacklo_epi64': different types for formal and actual parameter 1 aom_dsp/x86/synonyms.h(53,63): warning C4024: '_mm_unpacklo_epi64': different types for formal and actual parameter 2 Change-Id: I655d0f968889d91815b6afbca869740d091032f6 --- aom_dsp/x86/synonyms.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h index 0914b98b79..ae889ad169 100644 --- a/aom_dsp/x86/synonyms.h +++ b/aom_dsp/x86/synonyms.h @@ -50,7 +50,8 @@ static INLINE __m128i xx_loadu_128(const void *a) { // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate // the strict aliasing rule, this takes a different approach static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { - return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi)); + return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo), + _mm_loadl_epi64((const __m128i *)hi)); } static INLINE void xx_storel_32(void *const a, const __m128i v) { -- GitLab From c5a71030d6adeb5f6cefc987134e335d0d79929a Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 17 May 2024 19:49:19 -0700 Subject: [PATCH 159/391] rdopt_sse4.c: use xx_loadu_2x64 for unaligned loads This quiets some undefined sanitizer warnings related to unaligned loads; register/code reordering with gcc-13 & clang-16. 
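For context (an illustrative fragment, not the library header itself): dereferencing an int16_t buffer through an int64_t lvalue is a misaligned, type-punned access that UBSan flags, whereas _mm_loadl_epi64() performs the same 8-byte load with no alignment requirement - the approach xx_loadu_2x64() takes after the synonyms.h change above.

    #include <emmintrin.h>
    #include <stdint.h>

    static __m128i load_2x64(const int16_t *lo, const int16_t *hi) {
      // Unsafe form being removed:
      //   _mm_set_epi64x(*(const int64_t *)hi, *(const int64_t *)lo);
      return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
                                _mm_loadl_epi64((const __m128i *)hi));
    }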
Bug: b:300649160 Change-Id: Icbea76594a70d6bd7b48c11d513f103764682fd7 --- av1/encoder/x86/rdopt_sse4.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c index af61df102b..76980d673a 100644 --- a/av1/encoder/x86/rdopt_sse4.c +++ b/av1/encoder/x86/rdopt_sse4.c @@ -29,10 +29,8 @@ INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, // [ i j k l ] // [ m n o p ] - const __m128i pixelsa = _mm_set_epi64x(*(int64_t *)&diff[0 * stride], - *(int64_t *)&diff[2 * stride]); - const __m128i pixelsb = _mm_set_epi64x(*(int64_t *)&diff[1 * stride], - *(int64_t *)&diff[3 * stride]); + const __m128i pixelsa = xx_loadu_2x64(&diff[0 * stride], &diff[2 * stride]); + const __m128i pixelsb = xx_loadu_2x64(&diff[1 * stride], &diff[3 * stride]); // pixelsa = [d c b a l k j i] as i16 // pixelsb = [h g f e p o n m] as i16 -- GitLab From d00a1befd9275278994f670bf2392aa189a649bb Mon Sep 17 00:00:00 2001 From: Gerda Zsejke More <gerdazsejke.more@arm.com> Date: Tue, 30 Apr 2024 13:52:22 +0200 Subject: [PATCH 160/391] Refactor ConvolveScaleTest unit tests The filter values used in the scaling algorithm are specified in the documentation of the inter-prediction process (chapter 7.11.3.4 [1]), which contains the scaling algorithm. An important characteristic of these filter values is that all of them are even, and we can leverage this information when optimising the av1_convolve_2d_scale function. However, the unit tests do not follow this pattern: they currently generate filter values by calculation instead. To enable us to take advantage of specific filter characteristics, adjust the unit tests to use the predefined filter values of av1_interp_filter_params_list defined in filter.h. Additionally, update convolution parameters when testing with different is_compound values. [1] https://aomediacodec.github.io/av1-spec/av1-spec.pdf Change-Id: I6ed1f7b269d139e221cbe35a573e06f0e2adaf59 --- test/av1_convolve_scale_test.cc | 232 ++++++++++---------------------- 1 file changed, 74 insertions(+), 158 deletions(-) diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc index 76cf77ab07..764ac2fac6 100644 --- a/test/av1_convolve_scale_test.cc +++ b/test/av1_convolve_scale_test.cc @@ -22,6 +22,7 @@ #include "test/util.h" #include "av1/common/common_data.h" +#include "av1/common/filter.h" namespace { const int kTestIters = 10; @@ -32,80 +33,12 @@ const int kHPad = 32; const int kXStepQn = 16; const int kYStepQn = 20; +const int kNumFilterBanks = SWITCHABLE_FILTERS; + using libaom_test::ACMRandom; using std::make_tuple; using std::tuple; -enum NTaps { EIGHT_TAP, TEN_TAP, TWELVE_TAP }; -int NTapsToInt(NTaps ntaps) { return 8 + static_cast<int>(ntaps) * 2; } - -// A 16-bit filter with a configurable number of taps. -class TestFilter { - public: - void set(NTaps ntaps, bool backwards); - - InterpFilterParams params_; - - private: - std::vector<int16_t> coeffs_; -}; - -void TestFilter::set(NTaps ntaps, bool backwards) { - const int n = NTapsToInt(ntaps); - assert(n >= 8 && n <= 12); - - // The filter has n * SUBPEL_SHIFTS proper elements and an extra 8 bogus - // elements at the end so that convolutions can read off the end safely. - coeffs_.resize(n * SUBPEL_SHIFTS + 8); - - // The coefficients are pretty much arbitrary, but convolutions shouldn't - // over or underflow. For the first filter (subpels = 0), we use an - // increasing or decreasing ramp (depending on the backwards parameter).
We - // don't want any zero coefficients, so we make it have an x-intercept at -1 - // or n. To ensure absence of under/overflow, we normalise the area under the - // ramp to be I = 1 << FILTER_BITS (so that convolving a constant function - // gives the identity). - // - // When increasing, the function has the form: - // - // f(x) = A * (x + 1) - // - // Summing and rearranging for A gives A = 2 * I / (n * (n + 1)). If the - // filter is reversed, we have the same A but with formula - // - // g(x) = A * (n - x) - const int I = 1 << FILTER_BITS; - const float A = 2.f * I / (n * (n + 1.f)); - for (int i = 0; i < n; ++i) { - coeffs_[i] = static_cast<int16_t>(A * (backwards ? (n - i) : (i + 1))); - } - - // For the other filters, make them slightly different by swapping two - // columns. Filter k will have the columns (k % n) and (7 * k) % n swapped. - const size_t filter_size = sizeof(coeffs_[0] * n); - int16_t *const filter0 = &coeffs_[0]; - for (int k = 1; k < SUBPEL_SHIFTS; ++k) { - int16_t *filterk = &coeffs_[k * n]; - memcpy(filterk, filter0, filter_size); - - const int idx0 = k % n; - const int idx1 = (7 * k) % n; - - const int16_t tmp = filterk[idx0]; - filterk[idx0] = filterk[idx1]; - filterk[idx1] = tmp; - } - - // Finally, write some rubbish at the end to make sure we don't use it. - for (int i = 0; i < 8; ++i) coeffs_[n * SUBPEL_SHIFTS + i] = 123 + i; - - // Fill in params - params_.filter_ptr = &coeffs_[0]; - params_.taps = n; - // These are ignored by the functions being tested. Set them to whatever. - params_.interp_filter = EIGHTTAP_REGULAR; -} - template <typename SrcPixel> class TestImage { public: @@ -244,14 +177,9 @@ void TestImage<SrcPixel>::Check() const { typedef tuple<int, int> BlockDimension; struct BaseParams { - BaseParams(BlockDimension dimensions, NTaps num_taps_x, NTaps num_taps_y, - bool average) - : dims(dimensions), ntaps_x(num_taps_x), ntaps_y(num_taps_y), - avg(average) {} + BaseParams(BlockDimension dimensions) : dims(dimensions) {} BlockDimension dims; - NTaps ntaps_x, ntaps_y; - bool avg; }; template <typename SrcPixel> @@ -271,54 +199,62 @@ class ConvolveScaleTestBase : public ::testing::Test { void SetParams(const BaseParams ¶ms, int bd) { width_ = std::get<0>(params.dims); height_ = std::get<1>(params.dims); - ntaps_x_ = params.ntaps_x; - ntaps_y_ = params.ntaps_y; bd_ = bd; - avg_ = params.avg; - - filter_x_.set(ntaps_x_, false); - filter_y_.set(ntaps_y_, true); - convolve_params_ = - get_conv_params_no_round(avg_ != false, 0, nullptr, 0, 1, bd); delete image_; image_ = new TestImage<SrcPixel>(width_, height_, bd_); ASSERT_NE(image_, nullptr); } - void SetConvParamOffset(int i, int j, int is_compound, int do_average, - int use_dist_wtd_comp_avg) { - if (i == -1 && j == -1) { - convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; - convolve_params_.is_compound = is_compound; - convolve_params_.do_average = do_average; - } else { - convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; - convolve_params_.fwd_offset = quant_dist_lookup_table[j][i]; - convolve_params_.bck_offset = quant_dist_lookup_table[j][1 - i]; - convolve_params_.is_compound = is_compound; - convolve_params_.do_average = do_average; + std::vector<ConvolveParams> GetConvParams() { + std::vector<ConvolveParams> convolve_params; + + ConvolveParams param_no_compound = + get_conv_params_no_round(0, 0, nullptr, 0, 0, bd_); + convolve_params.push_back(param_no_compound); + + ConvolveParams param_compound_avg = + get_conv_params_no_round(1, 0, nullptr, 0, 1, bd_); + 
convolve_params.push_back(param_compound_avg); + + ConvolveParams param_compound_avg_dist_wtd = param_compound_avg; + param_compound_avg_dist_wtd.use_dist_wtd_comp_avg = 1; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 4; ++j) { + param_compound_avg_dist_wtd.fwd_offset = quant_dist_lookup_table[j][i]; + param_compound_avg_dist_wtd.bck_offset = + quant_dist_lookup_table[j][1 - i]; + convolve_params.push_back(param_compound_avg_dist_wtd); + } } + + return convolve_params; } void Run() { ACMRandom rnd(ACMRandom::DeterministicSeed()); + std::vector<ConvolveParams> conv_params = GetConvParams(); + for (int i = 0; i < kTestIters; ++i) { - int is_compound = 0; - SetConvParamOffset(-1, -1, is_compound, 0, 0); - Prep(&rnd); - RunOne(true); - RunOne(false); - image_->Check(); - - is_compound = 1; - for (int do_average = 0; do_average < 2; do_average++) { - for (int use_dist_wtd_comp_avg = 0; use_dist_wtd_comp_avg < 2; - use_dist_wtd_comp_avg++) { - for (int j = 0; j < 2; ++j) { - for (int k = 0; k < 4; ++k) { - SetConvParamOffset(j, k, is_compound, do_average, - use_dist_wtd_comp_avg); + for (int subpel_search = USE_2_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) { + for (int filter_bank_y = 0; filter_bank_y < kNumFilterBanks; + ++filter_bank_y) { + const InterpFilter filter_y = + static_cast<InterpFilter>(filter_bank_y); + filter_y_ = + av1_get_interp_filter_params_with_block_size(filter_y, width_); + + for (int filter_bank_x = 0; filter_bank_x < kNumFilterBanks; + ++filter_bank_x) { + const InterpFilter filter_x = + static_cast<InterpFilter>(filter_bank_x); + filter_x_ = + av1_get_interp_filter_params_with_block_size(filter_x, width_); + + for (const auto c : conv_params) { + convolve_params_ = c; Prep(&rnd); RunOne(true); RunOne(false); @@ -329,7 +265,6 @@ class ConvolveScaleTestBase : public ::testing::Test { } } } - void SpeedTest() { ACMRandom rnd(ACMRandom::DeterministicSeed()); Prep(&rnd); @@ -370,8 +305,8 @@ class ConvolveScaleTestBase : public ::testing::Test { assert(rnd); // Choose subpel_x_ and subpel_y_. 
They should be less than - // SCALE_SUBPEL_SHIFTS; we also want to add extra weight to "interesting" - // values: 0 and SCALE_SUBPEL_SHIFTS - 1 + // SCALE_SUBPEL_SHIFTS; we also want to add extra weight to + // "interesting" values: 0 and SCALE_SUBPEL_SHIFTS - 1 subpel_x_ = RandomSubpel(rnd); subpel_y_ = RandomSubpel(rnd); @@ -379,10 +314,8 @@ class ConvolveScaleTestBase : public ::testing::Test { int width_, height_, bd_; - NTaps ntaps_x_, ntaps_y_; - bool avg_; int subpel_x_, subpel_y_; - TestFilter filter_x_, filter_y_; + const InterpFilterParams *filter_x_, *filter_y_; TestImage<SrcPixel> *image_; ConvolveParams convolve_params_; }; @@ -398,9 +331,8 @@ typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride, ConvolveParams *conv_params); // Test parameter list: -// <tst_fun, dims, ntaps_x, ntaps_y, avg> -typedef tuple<LowbdConvolveFunc, BlockDimension, NTaps, NTaps, bool> - LowBDParams; +// <tst_fun, dims> +typedef tuple<LowbdConvolveFunc, BlockDimension> LowBDParams; class LowBDConvolveScaleTest : public ConvolveScaleTestBase<uint8_t>, @@ -412,12 +344,9 @@ class LowBDConvolveScaleTest tst_fun_ = GET_PARAM(0); const BlockDimension &block = GET_PARAM(1); - const NTaps ntaps_x = GET_PARAM(2); - const NTaps ntaps_y = GET_PARAM(3); const int bd = 8; - const bool avg = GET_PARAM(4); - SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd); + SetParams(BaseParams(block), bd); } void RunOne(bool ref) override { @@ -428,12 +357,12 @@ class LowBDConvolveScaleTest const int dst_stride = image_->dst_stride(); if (ref) { av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, height_, - &filter_x_.params_, &filter_y_.params_, subpel_x_, - kXStepQn, subpel_y_, kYStepQn, &convolve_params_); + filter_x_, filter_y_, subpel_x_, kXStepQn, + subpel_y_, kYStepQn, &convolve_params_); } else { - tst_fun_(src, src_stride, dst, dst_stride, width_, height_, - &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn, - subpel_y_, kYStepQn, &convolve_params_); + tst_fun_(src, src_stride, dst, dst_stride, width_, height_, filter_x_, + filter_y_, subpel_x_, kXStepQn, subpel_y_, kYStepQn, + &convolve_params_); } } @@ -450,25 +379,19 @@ const BlockDimension kBlockDim[] = { make_tuple(64, 128), make_tuple(128, 64), make_tuple(128, 128), }; -const NTaps kNTaps[] = { EIGHT_TAP }; - TEST_P(LowBDConvolveScaleTest, Check) { Run(); } TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); } INSTANTIATE_TEST_SUITE_P( C, LowBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_convolve_2d_scale_c), - ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool())); + ::testing::ValuesIn(kBlockDim))); #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, LowBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_convolve_2d_scale_sse4_1), - ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool())); + ::testing::ValuesIn(kBlockDim))); #endif // HAVE_SSE4_1 #if CONFIG_AV1_HIGHBITDEPTH @@ -481,9 +404,8 @@ typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride, ConvolveParams *conv_params, int bd); // Test parameter list: -// <tst_fun, dims, ntaps_x, ntaps_y, avg, bd> -typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int> - HighBDParams; +// <tst_fun, dims, bd> +typedef tuple<HighbdConvolveFunc, BlockDimension, int> HighBDParams; class HighBDConvolveScaleTest : public ConvolveScaleTestBase<uint16_t>, @@ -495,12 +417,9 @@ class
HighBDConvolveScaleTest tst_fun_ = GET_PARAM(0); const BlockDimension &block = GET_PARAM(1); - const NTaps ntaps_x = GET_PARAM(2); - const NTaps ntaps_y = GET_PARAM(3); - const bool avg = GET_PARAM(4); - const int bd = GET_PARAM(5); + const int bd = GET_PARAM(2); - SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd); + SetParams(BaseParams(block), bd); } void RunOne(bool ref) override { @@ -511,14 +430,14 @@ class HighBDConvolveScaleTest const int dst_stride = image_->dst_stride(); if (ref) { - av1_highbd_convolve_2d_scale_c( - src, src_stride, dst, dst_stride, width_, height_, &filter_x_.params_, - &filter_y_.params_, subpel_x_, kXStepQn, subpel_y_, kYStepQn, - &convolve_params_, bd_); + av1_highbd_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, + height_, filter_x_, filter_y_, subpel_x_, + kXStepQn, subpel_y_, kYStepQn, + &convolve_params_, bd_); } else { - tst_fun_(src, src_stride, dst, dst_stride, width_, height_, - &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn, - subpel_y_, kYStepQn, &convolve_params_, bd_); + tst_fun_(src, src_stride, dst, dst_stride, width_, height_, filter_x_, + filter_y_, subpel_x_, kXStepQn, subpel_y_, kYStepQn, + &convolve_params_, bd_); } } @@ -535,16 +454,14 @@ INSTANTIATE_TEST_SUITE_P( C, HighBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_c), ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool(), ::testing::ValuesIn(kBDs))); + ::testing::ValuesIn(kBDs))); #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, HighBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1), ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool(), ::testing::ValuesIn(kBDs))); + ::testing::ValuesIn(kBDs))); #endif // HAVE_SSE4_1 #if HAVE_NEON @@ -552,8 +469,7 @@ INSTANTIATE_TEST_SUITE_P( NEON, HighBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_neon), ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool(), ::testing::ValuesIn(kBDs))); + ::testing::ValuesIn(kBDs))); #endif // HAVE_NEON -- GitLab From 81621c69dca2d5984e2c4999b2911e520c36bb85 Mon Sep 17 00:00:00 2001 From: Gerda Zsejke More <gerdazsejke.more@arm.com> Date: Thu, 9 May 2024 15:25:45 +0200 Subject: [PATCH 161/391] Add Neon implementation for av1_convolve_2d_scale Add Neon implementation for av1_convolve_2d_scale and the corresponding tests as well. 
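As orientation for the kernels below, a simplified scalar model of the stepping they vectorize (the source pointer is assumed pre-adjusted for the filter taps, and the bit-depth-dependent offset is elided; the constants are the real ones from the scale code): each output sample advances a Q10 source position by the step and derives an 8-tap filter phase from the fractional bits, exactly as the filter_offset computation in the Neon code does:

  // Scalar sketch of one horizontal output row (simplified).
  int x_qn = subpel_x_qn;
  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
    const uint8_t *s = &src[x_qn >> SCALE_SUBPEL_BITS];  // integer part
    const int phase = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    const int16_t *filter = &x_filter[phase * SUBPEL_TAPS];
    int32_t sum = 1 << (ROUND0_BITS - 1);  // rounding shim
    for (int k = 0; k < SUBPEL_TAPS; ++k) sum += filter[k] * s[k];
    im_block[x] = (int16_t)(sum >> ROUND0_BITS);
  }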
Change-Id: I05d56b100a44bd289979ccccd2a67138e74fe82e --- av1/av1.cmake | 1 + av1/common/arm/av1_convolve_scale_neon.c | 702 +++++++++++++++++++++++ av1/common/av1_rtcd_defs.pl | 2 +- test/av1_convolve_scale_test.cc | 7 + 4 files changed, 711 insertions(+), 1 deletion(-) create mode 100644 av1/common/arm/av1_convolve_scale_neon.c diff --git a/av1/av1.cmake b/av1/av1.cmake index f1e9bc8049..232e00ff27 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -384,6 +384,7 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 "${AOM_ROOT}/av1/encoder/arm/hash_arm_crc32.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON + "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h" "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c" diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c new file mode 100644 index 0000000000..b8d934e796 --- /dev/null +++ b/av1/common/arm/av1_convolve_scale_neon.c @@ -0,0 +1,702 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" + +static INLINE int16x4_t compound_convolve8_4_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE int16x8_t compound_convolve8_8_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t 
sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS); + int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS); + + return vcombine_s16(res0, res1); +} + +static INLINE void compound_convolve_vert_scale_neon( + const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. + const int32x4_t vert_offset = + vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); + + int y_qn = subpel_y_qn; + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + vst1_u16(dst, vreinterpret_u16_s16(d0)); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint16_t *d = dst; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + vst1q_u16(d, vreinterpretq_u16_s16(d0)); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE void compound_avg_convolve_vert_scale_neon( + const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, + uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, + int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts + // on modern CPUs. + const int32_t vert_offset_bits = + (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)); + // For the averaging code path subtract round offset and convolve round.
+ const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits); + const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits); + + int y_qn = subpel_y_qn; + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); + + int16x4_t avg = vhadd_s16(dd0, d0); + int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0)); + + uint8x8_t d0_u8 = vqrshrun_n_s16( + d0_s16, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS)); + + store_u8_4x1(dst8, d0_u8); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint8_t *dst8_ptr = dst8; + uint16_t *dst16_ptr = dst16; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); + + int16x8_t avg = vhaddq_s16(dd0, d0); + + uint8x8_t d0_u8 = vqrshrun_n_s16( + avg, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS)); + + vst1_u8(dst8_ptr, d0_u8); + + s += 8; + dst8_ptr += 8; + dst16_ptr += 8; + width -= 8; + } while (width != 0); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE void compound_dist_wtd_convolve_vert_scale_neon( + const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, + uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, + ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + int y_qn = subpel_y_qn; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. + const int32x4_t vert_offset = + vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); + // For the weighted averaging code path we have to subtract round offset and + // convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS - + // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The + // additional shift by DIST_PRECISION_BITS is needed in order to merge two + // shift calculations into one.
+ const int32x4_t dist_wtd_offset = vdupq_n_s32( + (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 + + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS))); + const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset); + const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset); + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); + + int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0); + dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0); + + int16x4_t d0_s16 = vshrn_n_s32( + dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0))); + + store_u8_4x1(dst8, d0_u8); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint8_t *dst8_ptr = dst8; + uint16_t *dst16_ptr = dst16; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); + + int32x4_t dst_wtd_avg0 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0)); + int32x4_t dst_wtd_avg1 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0)); + + dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0)); + dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0)); + + int16x4_t d0_s16_0 = vshrn_n_s32( + dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + int16x4_t d0_s16_1 = vshrn_n_s32( + dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1)); + + vst1_u8(dst8_ptr, d0_u8); + + s += 8; + dst8_ptr += 8; + dst16_ptr += 8; + width -= 8; + } while (width != 0); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + 
sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0))); +} + +static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS); + int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res0, res1)); +} + +static INLINE void convolve_vert_scale_neon(const int16_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, + int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int round_1 = 2 * FILTER_BITS - ROUND0_BITS; + // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts. 
+ int32x4_t vert_offset = + vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1))); + + int y_qn = subpel_y_qn; + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d = + convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); + + store_u8_4x1(dst, d); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else if (w == 8) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d = + convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); + + vst1_u8(dst, d); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + do { + int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + load_s16_8x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], + &s5[0], &s6[0], &s7[0]); + load_s16_8x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], + &s5[1], &s6[1], &s7[1]); + + uint8x8_t d0 = convolve8_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], + s6[0], s7[0], filter, vert_offset); + uint8x8_t d1 = convolve8_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], + s6[1], s7[1], filter, vert_offset); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter, + const int32x4_t horiz_const) { + int16x4_t filter_lo = vget_low_s16(filter); + int16x4_t filter_hi = vget_high_s16(filter); + + int32x4_t sum = horiz_const; + sum = vmlal_lane_s16(sum, s0, filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, filter_hi, 3); + + return vshrn_n_s32(sum, ROUND0_BITS); +} + +static INLINE int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, + const int16x8_t horiz_const) { + int16x4_t filter_lo = vget_low_s16(filter); + int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = 
vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + + return vshrq_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE void convolve_horiz_scale_neon(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride, + int w, int h, + const int16_t *x_filter, + const int subpel_x_qn, + const int x_step_qn) { + DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); + const int bd = 8; + + if (w == 4) { + // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. + const int32x4_t horiz_offset = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + + do { + int x_qn = subpel_x_qn; + + // Process a 4x4 tile. + for (int r = 0; r < 4; ++r) { + const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(x_filter + filter_offset); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = + convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); + + vst1_s16(&temp[r * 4], d0); + x_qn += x_step_qn; + } + + // Transpose the 4x4 result tile and store. + int16x4_t d0, d1, d2, d3; + load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. + // The additional -1 is needed because we are halving the filter values. + const int16x8_t horiz_offset = + vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); + + do { + int x_qn = subpel_x_qn; + int16_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; ++r) { + const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + int16x8_t filter = vld1q_s16(x_filter + filter_offset); + // Filter values are all even so halve them to allow convolution + // kernel computations to stay in 16-bit element types. 
+ filter = vshrq_n_s16(filter, 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, + &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + int16x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, + horiz_offset); + + vst1q_s16(&temp[r * 8], d0); + + x_qn += x_step_qn; + } + + // Transpose the 8x8 result tile and store. + int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} + +void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + if (w < 4 || h < 4) { + av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_qn, subpel_y_qn, y_step_qn, conv_params); + return; + } + + // For the interpolation 8-tap filters are used. + assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8); + + DECLARE_ALIGNED(32, int16_t, + im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + int im_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. 
+ const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1; + const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; + + // Horizontal filter + convolve_horiz_scale_neon( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + + // Vertical filter + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + compound_convolve_vert_scale_neon( + im_block, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_neon(im_block, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } +} diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index b597e9a1d9..7c4f539ee6 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -608,7 +608,7 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_convolve_x_sr_intrabc neon/; specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_convolve_y_sr_intrabc neon/; - specialize qw/av1_convolve_2d_scale sse4_1/; + specialize qw/av1_convolve_2d_scale sse4_1 neon/; specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/; specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/; diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc index 764ac2fac6..a428f41f53 100644 --- a/test/av1_convolve_scale_test.cc +++ b/test/av1_convolve_scale_test.cc @@ -387,6 +387,13 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine(::testing::Values(av1_convolve_2d_scale_c), ::testing::ValuesIn(kBlockDim))); +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, LowBDConvolveScaleTest, + ::testing::Combine(::testing::Values(av1_convolve_2d_scale_neon), + ::testing::ValuesIn(kBlockDim))); +#endif // HAVE_NEON + #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, LowBDConvolveScaleTest, -- GitLab From 4aefb9325a25dbc2d818d84c06a976ebd3fe5c7d Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 28 May 2024 14:24:57 -0700 Subject: [PATCH 162/391] Detect an invalid row offset get_ls_tile_buffer() row - offset is used as an array index, so it should not be negative. 
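Concretely (a sketch; the names and constants follow the diff below): the offset is decoded from untrusted bitstream bytes, so nothing ties it to the number of tile rows already parsed:

  // Top bit of `size` flags copy mode; 7 bits of the top byte give the
  // row offset, so it can be anything in 0..127.
  int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
  // tile_buffers[row - offset] under-indexes the array whenever
  // offset > row (e.g. row == 0 with any nonzero offset), hence the
  // bounds check added here.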
Bug: oss-fuzz:68774 Change-Id: I0c075202da0b5007887aafde4e1a55acdd866d08 --- av1/decoder/decodeframe.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index c027308ff3..d25651229f 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -2241,6 +2241,12 @@ static AOM_INLINE void get_ls_tile_buffer( if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) { // The remaining bits in the top byte signal the row offset int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f; + if (offset > row) { + aom_internal_error( + error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid row offset in tile copy mode: row=%d offset=%d", row, + offset); + } // Currently, only use tiles in same column as reference tiles. copy_data = tile_buffers[row - offset][col].data; -- GitLab From 03eb247ac4646ed720dc999544dfb86d9001e68c Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 28 May 2024 16:59:47 -0700 Subject: [PATCH 163/391] tools/obu_parser.cc: make some functions static Fixes some -Wmissing-prototypes warnings. Bug: aomedia:3416 Change-Id: Ia0f3b9cf38598b785c692f0a0c8c97cd2df71c0c --- tools/obu_parser.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/obu_parser.cc b/tools/obu_parser.cc index 5716b46218..4053615f11 100644 --- a/tools/obu_parser.cc +++ b/tools/obu_parser.cc @@ -20,6 +20,7 @@ #include "tools/obu_parser.h" namespace aom_tools { +namespace { // Basic OBU syntax // 8 bits: Header @@ -116,6 +117,8 @@ void PrintObuHeader(const ObuHeader *header) { } } +} // namespace + bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes) { const int kObuHeaderSizeBytes = 1; const int kMinimumBytesRequired = 1 + kObuHeaderSizeBytes; -- GitLab From 4e1a5d4d08a216d71ecee10c25736abb2ffc2cdc Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 28 May 2024 16:37:13 -0700 Subject: [PATCH 164/391] test/*.cc: make some functions static Fixes some -Wmissing-prototypes warnings. 
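For readers unfamiliar with the warning, a minimal illustration (not from the tree) of what -Wmissing-prototypes flags and the fix used throughout this series:

  /* warning: no previous prototype for 'leaky' [-Wmissing-prototypes] */
  int leaky(int x) { return x + 1; }

  /* Fixed: internal linkage, so no separate prototype is expected. */
  static int fixed(int x) { return x + 1; }

In the C++ files an unnamed namespace achieves the same internal linkage, as the obu_parser.cc hunk above and the test hunks below show.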
Bug: aomedia:3416 Change-Id: I680a7cfe59bb0f6a275942b05b857c053dc62360 --- test/av1_txfm_test.cc | 4 ++-- test/film_grain_table_test.cc | 4 ++++ test/scan_test.cc | 4 ++-- test/warp_filter_test_util.cc | 3 +++ test/wiener_test.cc | 21 +++++++++++---------- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/test/av1_txfm_test.cc b/test/av1_txfm_test.cc index 77c0ec1071..23e260b32d 100644 --- a/test/av1_txfm_test.cc +++ b/test/av1_txfm_test.cc @@ -116,7 +116,7 @@ void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) { double Sqrt2 = pow(2, 0.5); double invSqrt2 = 1 / pow(2, 0.5); -double dct_matrix(double n, double k, int size) { +static double dct_matrix(double n, double k, int size) { return cos(PI * (2 * n + 1) * k / (2 * size)); } @@ -207,7 +207,7 @@ void reference_adst_1d(const double *in, double *out, int size) { } } -void reference_idtx_1d(const double *in, double *out, int size) { +static void reference_idtx_1d(const double *in, double *out, int size) { double scale = 0; if (size == 4) scale = Sqrt2; diff --git a/test/film_grain_table_test.cc b/test/film_grain_table_test.cc index 808d966feb..2c6906f73b 100644 --- a/test/film_grain_table_test.cc +++ b/test/film_grain_table_test.cc @@ -20,6 +20,8 @@ #include "test/util.h" #include "test/video_source.h" +namespace { + void grain_equal(const aom_film_grain_t *expected, const aom_film_grain_t *actual) { EXPECT_EQ(expected->apply_grain, actual->apply_grain); @@ -73,6 +75,8 @@ void grain_equal(const aom_film_grain_t *expected, } } +} // namespace + TEST(FilmGrainTableTest, AddAndLookupSingleSegment) { aom_film_grain_table_t table; memset(&table, 0, sizeof(table)); diff --git a/test/scan_test.cc b/test/scan_test.cc index 571658ee0a..3ba39de3d5 100644 --- a/test/scan_test.cc +++ b/test/scan_test.cc @@ -25,8 +25,8 @@ static int scan_test(const int16_t *scan, const int16_t *iscan, int si, int r, } } -int scan_order_test(const SCAN_ORDER *scan_order, int w, int h, - SCAN_MODE mode) { +static int scan_order_test(const SCAN_ORDER *scan_order, int w, int h, + SCAN_MODE mode) { const int16_t *scan = scan_order->scan; const int16_t *iscan = scan_order->iscan; int dim = w + h - 1; diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc index 470c980777..b7c60c2fdb 100644 --- a/test/warp_filter_test_util.cc +++ b/test/warp_filter_test_util.cc @@ -18,6 +18,7 @@ using std::make_tuple; using std::tuple; namespace libaom_test { +namespace { int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits, int rnd_gen_zeros) { @@ -114,6 +115,8 @@ void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat, } } +} // namespace + namespace AV1WarpFilter { ::testing::internal::ParamGenerator<WarpTestParams> BuildParams( warp_affine_func filter) { diff --git a/test/wiener_test.cc b/test/wiener_test.cc index 4508af227f..77d2769aaa 100644 --- a/test/wiener_test.cc +++ b/test/wiener_test.cc @@ -158,11 +158,11 @@ static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd, } } -void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, - int16_t *d, int16_t *s, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, int src_stride, - int64_t *M, int64_t *H, - int use_downsampled_wiener_stats) { +static void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *d, int16_t *s, + int h_start, int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H, int use_downsampled_wiener_stats) { if 
(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) { compute_stats_win_opt_c(wiener_win, dgd, src, d, s, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, @@ -519,11 +519,12 @@ static void compute_stats_highbd_win_opt_c(int wiener_win, const uint8_t *dgd8, } } -void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int16_t *d, int16_t *s, - int h_start, int h_end, int v_start, int v_end, - int dgd_stride, int src_stride, int64_t *M, - int64_t *H, aom_bit_depth_t bit_depth) { +static void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *d, + int16_t *s, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) { compute_stats_highbd_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, -- GitLab From 7fc3fa6899600f2356c471632dc464992f965d78 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 28 May 2024 15:58:26 -0700 Subject: [PATCH 165/391] av1_block_error_lp_sve: fix block_size param type int -> intptr_t. This fixes a Control Flow Integrity (CFI) sanitizer failure. This also fixes a -Wmissing-prototypes warning. The _neon version was fixed in: b44333201b *_neon.c: add missing rtcd includes & CONFIG check Bug: aomedia:3416 Change-Id: Iecf4a0f450435d6afa481695e000bcc0c8f4079c --- av1/encoder/arm/av1_error_sve.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/av1/encoder/arm/av1_error_sve.c b/av1/encoder/arm/av1_error_sve.c index 52803a9838..5a1ad2f48e 100644 --- a/av1/encoder/arm/av1_error_sve.c +++ b/av1/encoder/arm/av1_error_sve.c @@ -12,6 +12,7 @@ #include <assert.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" @@ -49,7 +50,7 @@ int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, } int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { + intptr_t block_size) { if (block_size % 32 == 0) { int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; -- GitLab From ddb29c194cc24baf70e497eda74f9d1b1488cc4f Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 28 May 2024 16:33:45 -0700 Subject: [PATCH 166/391] highbd_convolve_sve2.c: make some functions static Fixes some -Wmissing-prototypes warnings. 
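Related background for the av1_block_error_lp_sve fix above (a sketch, not tree code): CFI verifies that an indirect call's target has exactly the function type of the pointer, so a definition taking int where the dispatch pointer expects intptr_t is a distinct type and trips the check even when the ABI happens to pass both arguments identically:

  #include <stdint.h>
  typedef int64_t (*block_error_fn)(const int16_t *, const int16_t *,
                                    intptr_t);
  int64_t impl_bad(const int16_t *c, const int16_t *d, int n);      // mismatch
  int64_t impl_ok(const int16_t *c, const int16_t *d, intptr_t n);  // matches
  // Calling through (block_error_fn)impl_bad fails under -fsanitize=cfi;
  // calling impl_ok through the pointer is fine.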
Bug: aomedia:3416 Change-Id: Ic4fe46f362df66de66cbd38a3f630ce7ebf6d141 --- av1/common/arm/highbd_convolve_sve2.c | 38 +++++++++++++-------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/av1/common/arm/highbd_convolve_sve2.c b/av1/common/arm/highbd_convolve_sve2.c index e6e27719b4..6ce9f36d9a 100644 --- a/av1/common/arm/highbd_convolve_sve2.c +++ b/av1/common/arm/highbd_convolve_sve2.c @@ -562,10 +562,11 @@ static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], return vminq_u16(res, max); } -void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, ptrdiff_t src_stride, - uint16_t *dst, ptrdiff_t dst_stride, - int width, int height, - const int16_t *filter_y, int bd) { +static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, + int height, const int16_t *filter_y, + int bd) { assert(width >= 4 && height >= 4); const int16x8_t y_filter = vld1q_s16(filter_y); @@ -731,10 +732,11 @@ static INLINE uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4], return vminq_u16(res, max); } -void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, ptrdiff_t src_stride, - uint16_t *dst, ptrdiff_t dst_stride, - int width, int height, - const int16_t *filter_y, int bd) { +static void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, + int height, const int16_t *filter_y, + int bd) { assert(width >= 4 && height >= 4); const int16x8_t y_filter = @@ -1346,12 +1348,10 @@ static INLINE uint16x8_t highbd_convolve8_8_2d_v( return vminq_u16(res, max); } -void highbd_convolve_2d_sr_vert_8tap_sve2(const uint16_t *src, - ptrdiff_t src_stride, uint16_t *dst, - ptrdiff_t dst_stride, int width, - int height, const int16_t *filter_y, - ConvolveParams *conv_params, int bd, - const int y_offset) { +static void highbd_convolve_2d_sr_vert_8tap_sve2( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, + ConvolveParams *conv_params, int bd, const int y_offset) { assert(width >= 4 && height >= 4); const int64x2_t offset = vdupq_n_s64(y_offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); @@ -1536,12 +1536,10 @@ static INLINE uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4], return vminq_u16(res, max); } -void highbd_convolve_2d_sr_vert_4tap_sve2(const uint16_t *src, - ptrdiff_t src_stride, uint16_t *dst, - ptrdiff_t dst_stride, int width, - int height, const int16_t *filter_y, - ConvolveParams *conv_params, int bd, - const int y_offset) { +static void highbd_convolve_2d_sr_vert_4tap_sve2( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, + ConvolveParams *conv_params, int bd, const int y_offset) { assert(width >= 4 && height >= 4); const int64x2_t offset = vdupq_n_s64(y_offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); -- GitLab From 666d9c31d92587590ed29e3f3a7ff76ef853e9e0 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Wed, 29 May 2024 13:52:53 -0700 Subject: [PATCH 167/391] av1/encoder/nonrd_opt.c: Include config/av1_rtcd.h av1/encoder/nonrd_opt.c calls av1_block_error_lp(), which is declared in config/av1_rtcd.h. 
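For context, a simplified sketch of the shape of such a generated rtcd entry (the real header is produced by av1_rtcd_defs.pl and uses a build-dependent RTCD_EXTERN macro); without the include, the call would rely on an implicit declaration, which modern C rejects and which cannot match this dispatch machinery:

  #include <stdint.h>
  int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff,
                               intptr_t block_size);
  /* Static-dispatch builds map the name directly to one implementation,
   * e.g. #define av1_block_error_lp av1_block_error_lp_neon, while
   * runtime-dispatch builds route it through a function pointer: */
  extern int64_t (*av1_block_error_lp)(const int16_t *coeff,
                                       const int16_t *dqcoeff,
                                       intptr_t block_size);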
Change-Id: I901e4041b5c49866fd683667a4a66520d63bae12 --- av1/encoder/nonrd_opt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/av1/encoder/nonrd_opt.c b/av1/encoder/nonrd_opt.c index 651ca43a2e..e3589dad6b 100644 --- a/av1/encoder/nonrd_opt.c +++ b/av1/encoder/nonrd_opt.c @@ -10,6 +10,7 @@ */ #include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/reconinter.h" -- GitLab From a7ef80c44bfb34b08254194b1ab72d4e93ff4b07 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Wed, 29 May 2024 13:59:53 -0700 Subject: [PATCH 168/391] error_intrin_avx2.c: Change num_coeff to block_size The last parameter of av1_block_error_lp_avx2() should be renamed `block_size` to match the name of the parameter in the declaration. Change all occurrences of "num_coeff" to "block_size" for consistency. Change-Id: I4a6bf42969a98e0bb29e7d72970c1038e8e86c99 --- av1/encoder/x86/error_intrin_avx2.c | 36 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c index 57725d1795..f180c94f4e 100644 --- a/av1/encoder/x86/error_intrin_avx2.c +++ b/av1/encoder/x86/error_intrin_avx2.c @@ -29,9 +29,9 @@ static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, } } -static INLINE void av1_block_error_num_coeff16_avx2(const int16_t *coeff, - const int16_t *dqcoeff, - __m256i *sse_256) { +static INLINE void av1_block_error_block_size16_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + __m256i *sse_256) { const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff); const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff); // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 @@ -44,9 +44,9 @@ static INLINE void av1_block_error_num_coeff16_avx2(const int16_t *coeff, *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256()); } -static INLINE void av1_block_error_num_coeff32_avx2(const int16_t *coeff, - const int16_t *dqcoeff, - __m256i *sse_256) { +static INLINE void av1_block_error_block_size32_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + __m256i *sse_256) { const __m256i zero = _mm256_setzero_si256(); const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff); const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff); @@ -71,12 +71,12 @@ static INLINE void av1_block_error_num_coeff32_avx2(const int16_t *coeff, *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0); } -static INLINE void av1_block_error_num_coeff64_avx2(const int16_t *coeff, - const int16_t *dqcoeff, - __m256i *sse_256, - intptr_t num_coeff) { +static INLINE void av1_block_error_block_size64_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + __m256i *sse_256, + intptr_t block_size) { const __m256i zero = _mm256_setzero_si256(); - for (int i = 0; i < num_coeff; i += 64) { + for (int i = 0; i < block_size; i += 64) { // Load 64 elements for coeff and dqcoeff.
const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff); const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff); @@ -126,17 +126,17 @@ static INLINE void av1_block_error_num_coeff64_avx2(const int16_t *coeff, } int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff, - intptr_t num_coeff) { - assert(num_coeff % 16 == 0); + intptr_t block_size) { + assert(block_size % 16 == 0); __m256i sse_256 = _mm256_setzero_si256(); int64_t sse; - if (num_coeff == 16) - av1_block_error_num_coeff16_avx2(coeff, dqcoeff, &sse_256); - else if (num_coeff == 32) - av1_block_error_num_coeff32_avx2(coeff, dqcoeff, &sse_256); + if (block_size == 16) + av1_block_error_block_size16_avx2(coeff, dqcoeff, &sse_256); + else if (block_size == 32) + av1_block_error_block_size32_avx2(coeff, dqcoeff, &sse_256); else - av1_block_error_num_coeff64_avx2(coeff, dqcoeff, &sse_256, num_coeff); + av1_block_error_block_size64_avx2(coeff, dqcoeff, &sse_256, block_size); // Save the higher 64 bit of each 128 bit lane. const __m256i sse_hi = _mm256_srli_si256(sse_256, 8); -- GitLab From 2481a3508ab04e9e5d4dc5de11817bc25a151996 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 29 May 2024 20:46:04 -0700 Subject: [PATCH 169/391] convolve_2d_sr_vert_12tap_sve2: fix assert x_filter_0_7 -> y_filter_0_7; the former is not defined in this scope Change-Id: Iffaf0ac42d11df37326bb77f23f1d3580b8c301b --- av1/common/arm/convolve_sve2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/common/arm/convolve_sve2.c b/av1/common/arm/convolve_sve2.c index 136abae43c..a274730548 100644 --- a/av1/common/arm/convolve_sve2.c +++ b/av1/common/arm/convolve_sve2.c @@ -47,7 +47,7 @@ static INLINE void convolve_2d_sr_vert_12tap_sve2( const int dst_stride, int w, int h, const int16x8_t y_filter_0_7, const int16x8_t y_filter_4_11) { // The no-op filter should never be used here. - assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); + assert(vgetq_lane_s16(y_filter_0_7, 5) != 128); const int bd = 8; const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); -- GitLab From d372aaf0b0605b3f6acaa62d552b4dd41aa9b06a Mon Sep 17 00:00:00 2001 From: Mudassir Galaganath <mudassir.galaganath@ittiam.com> Date: Fri, 17 May 2024 19:06:53 +0530 Subject: [PATCH 170/391] Fix memory over-read issue in av1_resize_horz_dir() SIMD This CL fixes the test failures under 32-bit valgrind due to a memory over-read issue reported in Bug: aomedia:3575. To fix this issue, pixel over-reading at the frame boundary is avoided. Also, av1_resize_horz_dir_sse2() is enabled. Bug: aomedia:3575 Change-Id: I50d87adb033c7e2cab036d66d49c11b5b81469ca --- av1/common/av1_rtcd_defs.pl | 4 +- av1/common/x86/resize_avx2.c | 104 ++++++++++++++++++++++------------- av1/common/x86/resize_sse2.c | 29 ++++++---- test/frame_resize_test.cc | 92 +++++++++++++++++-------------- 4 files changed, 137 insertions(+), 92 deletions(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 7c4f539ee6..7fa8b4e6d0 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -558,9 +558,7 @@ add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int o specialize qw/av1_resize_vert_dir sse2 avx2/; add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2"; -# TODO(https://crbug.com/aomedia/3575): Restore sse2 after SSE2/AV1ResizeXTest # passes under 32-bit valgrind.
-specialize qw/av1_resize_horz_dir avx2/; +specialize qw/av1_resize_horz_dir sse2 avx2/; add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c index 425c9f44e1..9c8958ed42 100644 --- a/av1/common/x86/resize_avx2.c +++ b/av1/common/x86/resize_avx2.c @@ -17,6 +17,7 @@ #include "aom_dsp/x86/synonyms.h" +#define ROW_OFFSET 5 #define CAST_HI(x) _mm256_castsi128_si256(x) #define CAST_LOW(x) _mm256_castsi256_si128(x) @@ -122,7 +123,7 @@ filter_offset = 3; \ \ /* Pad start pixels to the left, while processing the first pixels in the \ - row. */ \ + * row. */ \ if (j == 0) { \ /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \ row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \ @@ -131,21 +132,24 @@ r0 = row0; \ r1 = row1; \ } \ - \ + const int is_last_cols32 = (j + 32 == filtered_length); \ + /* Avoid loading extra pixels at frame boundary.*/ \ + if (is_last_cols32) row_offset = ROW_OFFSET; \ /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/ \ __m128i row0_0 = _mm_loadl_epi64( \ - (__m128i *)&input[i * in_stride + 32 + j - filter_offset]); \ + (__m128i *)&input[i * in_stride + 32 + j - filter_offset - row_offset]); \ /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */ \ - __m128i row1_0 = _mm_loadl_epi64( \ - (__m128i *)&input[(i + 1) * in_stride + 32 + j - filter_offset]); \ + __m128i row1_0 = \ + _mm_loadl_epi64((__m128i *)&input[(i + 1) * in_stride + 32 + j - \ + filter_offset - row_offset]); \ __m256i r2 = _mm256_permute2x128_si256( \ _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \ \ /* Pad end pixels to the right, while processing the last pixels in the \ - row. */ \ - const int is_last_cols32 = (j + 32 == filtered_length); \ + * row. */ \ if (is_last_cols32) { \ - r2 = _mm256_shuffle_epi8(r2, wd32_end_pad_mask); \ + r2 = _mm256_shuffle_epi8(_mm256_srli_si256(r2, ROW_OFFSET), \ + wd32_end_pad_mask); \ } \ \ /* Process even pixels of the first row */ \ @@ -169,7 +173,8 @@ s1[3] = _mm256_alignr_epi8(r2, r1, 6); \ \ /* The register res_out_0 stores the result of start-16 pixels corresponding \ -to the first and second rows whereas res_out_1 stores the end-16 pixels. */ \ + * to the first and second rows whereas res_out_1 stores the end-16 \ + * pixels. */ \ __m256i res_out_0[2], res_out_1[2]; \ res_out_1[0] = res_out_1[1] = zero; \ res_out_0[0] = res_out_0[1] = zero; \ @@ -184,7 +189,7 @@ to the first and second rows whereas res_out_1 stores the end-16 pixels. */ \ /* r00-r03 r08-r011 | r04-r07 r012-r015 */ \ __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \ \ - /* result of 32 pixels of row1 (b0 to b32) */ \ + /* Result of 32 pixels of row1 (b0 to b32) */ \ res_out_0[1] = _mm256_sra_epi32( \ _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \ res_out_1[1] = _mm256_sra_epi32( \ @@ -530,12 +535,10 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filtered_length, int width2) { assert(height % 2 == 0); - // Invoke C for width less than 32. - // TODO(https://crbug.com/aomedia/3575): Use sse2 after SSE2/AV1ResizeXTest - // passes under 32-bit valgrind. 
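  // (Sketch of the boundary-safe load pattern this patch introduces; an
  // illustrative fragment, not a line from the patch. When a load would run
  // past the last valid pixel of a row, the address is backed up by
  // ROW_OFFSET bytes so the load stays in bounds, and the unwanted leading
  // bytes are then shifted away:
  //
  //   __m128i v = _mm_loadl_epi64((const __m128i *)(p - ROW_OFFSET));
  //   v = _mm_srli_si128(v, ROW_OFFSET);  // drop the ROW_OFFSET extra bytes
  //
  // This leaves the valid pixels in the low lanes; the end-padding shuffle
  // that follows replicates the last pixel into the remaining lanes.)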
+ // Invoke SSE2 for width less than 32. if (filtered_length < 32) { - av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length, - width2); + av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length, + width2); return; } @@ -569,6 +572,7 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, if (filtered_length % 32 == 0) { for (int i = 0; i < height; i += 2) { int filter_offset = 0; + int row_offset = 0; for (int j = 0; j < filtered_length; j += 32) { PROCESS_RESIZE_X_WD32 } @@ -576,28 +580,50 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, } else { for (int i = 0; i < height; i += 2) { int filter_offset = 0; - int remain_col = filtered_length % 32; - for (int j = 0; j + 32 <= filtered_length; j += 32) { + int remain_col = filtered_length; + int row_offset = 0; + // To avoid pixel over-read at frame boundary, processing of 32 pixels + // is done using the core loop only if sufficient number of pixels + // required for the load are present. The remaining pixels are processed + // separately. + for (int j = 0; j <= filtered_length - 32; j += 32) { + if (remain_col == 34 || remain_col == 36) { + break; + } PROCESS_RESIZE_X_WD32 + remain_col -= 32; } int wd_processed = filtered_length - remain_col; - if (remain_col > 15) { - remain_col = filtered_length % 16; - const int in_idx = i * in_stride + wd_processed - filter_offset; + // To avoid pixel over-read at frame boundary, processing of 16 pixels + // is done only if sufficient number of pixels required for the + // load are present. The remaining pixels are processed separately. + if (remain_col > 15 && remain_col != 18 && remain_col != 20) { + remain_col = filtered_length - wd_processed - 16; + const int in_idx = i * in_stride + wd_processed; const int out_idx = (i * dst_stride) + wd_processed / 2; // a0 a1 --- a15 - __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]); + __m128i row0 = + _mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]); // b0 b1 --- b15 - __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]); + __m128i row1 = _mm_loadu_si128( + (__m128i *)&input[in_idx + in_stride - filter_offset]); // a0 a1 --- a15 || b0 b1 --- b15 __m256i r0 = _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); + if (filter_offset == 0) { + r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); + } + filter_offset = 3; + const int is_last_cols16 = wd_processed + 16 == filtered_length; + if (is_last_cols16) row_offset = ROW_OFFSET; // a16 a17 --- a23 - row0 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16]); + row0 = _mm_loadl_epi64( + (__m128i *)&input[in_idx + 16 - row_offset - filter_offset]); // b16 b17 --- b23 - row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride]); + row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride - + row_offset - filter_offset]); // a16-a23 x x x x| b16-b23 x x x x __m256i r1 = @@ -605,9 +631,9 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, // Pad end pixels to the right, while processing the last pixels in the // row. 
- const int is_last_cols16 = wd_processed + 16 == filtered_length; if (is_last_cols16) { - r1 = _mm256_shuffle_epi8(r1, wd32_end_pad_mask); + r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET), + wd32_end_pad_mask); } // a0 a1 --- a15 || b0 b1 --- b15 @@ -624,7 +650,7 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, res_out_0[0] = res_out_0[1] = zero; resize_convolve(s0, coeffs_x, res_out_0); - // r00 -r07 + // r00-r07 res_out_0[0] = _mm256_sra_epi32( _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); // r10-r17 @@ -647,23 +673,30 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, _mm_unpackhi_epi64(low_result, low_result)); } + // To avoid pixel over-read at frame boundary, processing of 8 pixels + // is done only if sufficient number of pixels required for the + // load are present. The remaining pixels are processed by C function. wd_processed = filtered_length - remain_col; - if (remain_col > 7) { - remain_col = filtered_length % 8; + if (remain_col > 7 && remain_col != 10 && remain_col != 12) { + remain_col = filtered_length - wd_processed - 8; const int in_idx = i * in_stride + wd_processed - filter_offset; const int out_idx = (i * dst_stride) + wd_processed / 2; + const int is_last_cols_8 = wd_processed + 8 == filtered_length; + if (is_last_cols_8) row_offset = ROW_OFFSET; // a0 a1 --- a15 - __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]); + __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]); // b0 b1 --- b15 - __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]); + __m128i row1 = + _mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]); // a0 a1 --- a15 || b0 b1 --- b15 __m256i r0 = _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); // Pad end pixels to the right, while processing the last pixels in the // row. - const int is_last_cols_8 = wd_processed + 8 == filtered_length; - if (is_last_cols_8) r0 = _mm256_shuffle_epi8(r0, wd8_end_pad_mask); + if (is_last_cols_8) + r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET), + wd8_end_pad_mask); // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7 s0[0] = r0; @@ -673,6 +706,7 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, s0[2] = _mm256_bsrli_epi128(r0, 4); // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10 s0[3] = _mm256_bsrli_epi128(r0, 6); + __m256i res_out_0[2]; res_out_0[0] = res_out_0[1] = zero; resize_convolve(s0, coeffs_x, res_out_0); @@ -696,10 +730,6 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, } wd_processed = filtered_length - remain_col; - // When the remaining width is 2, the above code would not have taken - // care of padding required for (filtered_length - 4)th pixel. Hence, - // process that pixel again with the C code. - wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed; if (remain_col) { const int in_idx = (in_stride * i); const int out_idx = (wd_processed / 2) + width2 * i; diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c index 6b34cebfe0..e2d84daaf4 100644 --- a/av1/common/x86/resize_sse2.c +++ b/av1/common/x86/resize_sse2.c @@ -16,6 +16,8 @@ #include "aom_dsp/x86/synonyms.h" +#define ROW_OFFSET 5 + #define PROCESS_RESIZE_Y_WD8 \ /* ah0 ah1 ... 
ah7 */ \ const __m128i AH = _mm_add_epi16(l0, l7); \ @@ -200,7 +202,6 @@ void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride, __m128i coeffs_x[2]; const int bits = FILTER_BITS; const int dst_stride = width2; - const int remain_col = filtered_length % 16; const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); @@ -215,15 +216,27 @@ void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride, for (int i = 0; i < height; ++i) { int filter_offset = 0; + int row01_offset = ROW_OFFSET; + int remain_col = filtered_length; + // To avoid pixel over-read at frame boundary, processing of 16 pixels + // is done using the core loop only if sufficient number of pixels required + // for the load are present.The remaining pixels are processed separately. for (int j = 0; j <= filtered_length - 16; j += 16) { + if (remain_col == 18 || remain_col == 20) { + break; + } + const int is_last_cols16 = (j == filtered_length - 16); + // While processing the last 16 pixels of the row, ensure that only valid + // pixels are loaded. + if (is_last_cols16) row01_offset = 0; const int in_idx = i * in_stride + j - filter_offset; const int out_idx = i * dst_stride + j / 2; - + remain_col -= 16; // a0 a1 a2 a3 .... a15 __m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]); // a8 a9 a10 a11 .... a23 - __m128i row01 = - _mm_loadu_si128((__m128i *)&input[in_idx + 5 + filter_offset]); + __m128i row01 = _mm_loadu_si128( + (__m128i *)&input[in_idx + row01_offset + filter_offset]); filter_offset = 3; // Pad start pixels to the left, while processing the first pixels in the @@ -237,11 +250,11 @@ void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride, // Pad end pixels to the right, while processing the last pixels in the // row. - const int is_last_cols16 = (j == filtered_length - 16); if (is_last_cols16) { const __m128i end_pixel_row0 = _mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]); - row01 = blend(row01, end_pixel_row0, end_pad_mask); + row01 = blend(_mm_srli_si128(row01, ROW_OFFSET), end_pixel_row0, + end_pad_mask); } // a2 a3 a4 a5 a6 a7 a8 a9 .... a17 @@ -318,10 +331,6 @@ void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride, } int wd_processed = filtered_length - remain_col; - // When the remaining width is 2, the above code would not have taken - // care of padding required for (filtered_length - 4)th pixel. Hence, - // process that pixel again with the C code. - wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed; if (remain_col) { const int in_idx = (in_stride * i); const int out_idx = (wd_processed / 2) + width2 * i; diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc index 83e56edefb..9145803891 100644 --- a/test/frame_resize_test.cc +++ b/test/frame_resize_test.cc @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#include <memory> +#include <new> + #include "config/av1_rtcd.h" #include "test/acm_random.h" #include "test/util.h" @@ -63,12 +66,18 @@ class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { height_ = std::get<1>(frame_dim_); const int msb = get_msb(AOMMIN(width_, height_)); n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + const int src_buf_size = (width_ / 2) * height_; + const int dest_buf_size = (width_ * height_) / 4; + src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]); + ASSERT_NE(src_, nullptr); - src_ = (uint8_t *)aom_malloc((width_ / 2) * height_ * sizeof(*src_)); ref_dest_ = - (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*ref_dest_)); + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(ref_dest_, nullptr); + test_dest_ = - (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*test_dest_)); + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(test_dest_, nullptr); } void RunTest() { @@ -76,11 +85,12 @@ class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { for (int level = 1; level < n_levels_; level++) { const int width2 = (width_ >> level); const int height2 = (height_ >> level); - av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, - width2, 0); - test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1, + height2, width2, 0); + test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2, + width2, 0); - AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2); + AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, height2); } } @@ -92,8 +102,8 @@ class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { aom_usec_timer ref_timer; aom_usec_timer_start(&ref_timer); for (int j = 0; j < kIters; j++) { - av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, - width2, 0); + av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1, + height2, width2, 0); } aom_usec_timer_mark(&ref_timer); const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); @@ -101,7 +111,8 @@ class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { aom_usec_timer tst_timer; aom_usec_timer_start(&tst_timer); for (int j = 0; j < kIters; j++) { - test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2, + width2, 0); } aom_usec_timer_mark(&tst_timer); const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); @@ -112,21 +123,15 @@ class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { } } - void TearDown() { - aom_free(src_); - aom_free(ref_dest_); - aom_free(test_dest_); - } - private: LowBDResizeFunc test_fun_; FrameDimension frame_dim_; int width_; int height_; int n_levels_; - uint8_t *src_; - uint8_t *ref_dest_; - uint8_t *test_dest_; + std::unique_ptr<uint8_t[]> src_; + std::unique_ptr<uint8_t[]> ref_dest_; + std::unique_ptr<uint8_t[]> test_dest_; libaom_test::ACMRandom rng_; }; @@ -141,7 +146,9 @@ TEST_P(AV1ResizeYTest, DISABLED_SpeedTest) { SpeedTest(); } const FrameDimension kFrameDim[] = { make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080), make_tuple(1280, 720), make_tuple(640, 480), make_tuple(640, 360), - make_tuple(256, 256), + make_tuple(286, 286), make_tuple(284, 284), make_tuple(282, 282), + make_tuple(280, 280), make_tuple(262, 
262), make_tuple(258, 258), + make_tuple(256, 256), make_tuple(34, 34), }; #endif @@ -174,11 +181,18 @@ class AV1ResizeXTest : public ::testing::TestWithParam<Resize_x_TestParams> { height_ = std::get<1>(frame_dim_); const int msb = get_msb(AOMMIN(width_, height_)); n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); - src_ = (uint8_t *)aom_malloc(width_ * height_ * sizeof(*src_)); + const int src_buf_size = width_ * height_; + const int dest_buf_size = (width_ * height_) / 2; + src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]); + ASSERT_NE(src_, nullptr); + ref_dest_ = - (uint8_t *)aom_calloc((width_ * height_) / 2, sizeof(*ref_dest_)); + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(ref_dest_, nullptr); + test_dest_ = - (uint8_t *)aom_calloc((width_ * height_) / 2, sizeof(*test_dest_)); + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(test_dest_, nullptr); } void RunTest() { @@ -186,10 +200,11 @@ class AV1ResizeXTest : public ::testing::TestWithParam<Resize_x_TestParams> { for (int level = 1; level < n_levels_; ++level) { const int width2 = (width_ >> level); - av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1, - width2); - test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2); - AssertOutputBufferEq(ref_dest_, test_dest_, width2, height_); + av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), height_, + width2 << 1, width2); + test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1, + width2); + AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, height_); } } @@ -201,8 +216,8 @@ class AV1ResizeXTest : public ::testing::TestWithParam<Resize_x_TestParams> { aom_usec_timer ref_timer; aom_usec_timer_start(&ref_timer); for (int j = 0; j < kIters; ++j) { - av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1, - width2); + av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), height_, + width2 << 1, width2); } aom_usec_timer_mark(&ref_timer); const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); @@ -210,7 +225,8 @@ class AV1ResizeXTest : public ::testing::TestWithParam<Resize_x_TestParams> { aom_usec_timer tst_timer; aom_usec_timer_start(&tst_timer); for (int j = 0; j < kIters; ++j) { - test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2); + test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1, + width2); } aom_usec_timer_mark(&tst_timer); const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); @@ -221,21 +237,15 @@ class AV1ResizeXTest : public ::testing::TestWithParam<Resize_x_TestParams> { } } - void TearDown() { - aom_free(src_); - aom_free(ref_dest_); - aom_free(test_dest_); - } - private: LowBDResize_x_Func test_fun_; FrameDimension frame_dim_; int width_; int height_; int n_levels_; - uint8_t *src_; - uint8_t *ref_dest_; - uint8_t *test_dest_; + std::unique_ptr<uint8_t[]> src_; + std::unique_ptr<uint8_t[]> ref_dest_; + std::unique_ptr<uint8_t[]> test_dest_; libaom_test::ACMRandom rng_; }; @@ -245,9 +255,7 @@ TEST_P(AV1ResizeXTest, RunTest) { RunTest(); } TEST_P(AV1ResizeXTest, DISABLED_SpeedTest) { SpeedTest(); } -// TODO(https://crbug.com/aomedia/3575): Reenable this after test passes under -// 32-bit valgrind. 
-#if 0  // HAVE_SSE2
+#if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(
     SSE2, AV1ResizeXTest,
     ::testing::Combine(::testing::Values(av1_resize_horz_dir_sse2),
-- 
GitLab


From 9ad7c3c84ea2cc4e0ed6696903e560c62455ffa0 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Wed, 29 May 2024 12:27:22 -0700
Subject: [PATCH 171/391] rtc: Remove logic to set sb_size for dynamic mode

For real-time and screen mode: when --sb-size=dynamic is used (which
is the default), the sb_size was being set to 64 under certain
conditions (based on resolution and the number of threads).

This selection of 64 causes a regression relative to the 1-thread
case, which uses 128, as shown in the bug attached below.

Remove this internal logic for the sb_size switch in dynamic mode for
now. A follow-up will investigate how 64 vs 128 caused such a
significant regression in those clips.

Bug: b/343429036
Change-Id: Id118f587c782d5e21ccaa7cb73355b86bb0267bb
---
 av1/encoder/encoder_utils.c | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index 1f81a530c9..1c04df7e0c 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -837,21 +837,7 @@ BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
                ? BLOCK_128X128
                : BLOCK_64X64;
   } else if (oxcf->mode == REALTIME) {
-    if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) {
-      const TileConfig *const tile_cfg = &oxcf->tile_cfg;
-      const int num_tiles =
-          (1 << tile_cfg->tile_columns) * (1 << tile_cfg->tile_rows);
-      // For multi-thread encode: if the number of (128x128) superblocks
-      // per tile is low use 64X64 superblock.
-      if (oxcf->row_mt == 1 && oxcf->max_threads >= 4 &&
-          oxcf->max_threads >= num_tiles && AOMMIN(width, height) > 720 &&
-          (width * height) / (128 * 128 * num_tiles) <= 38)
-        return BLOCK_64X64;
-      else
-        return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64;
-    } else {
-      return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
-    }
+    return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
   }
 
   // TODO(any): Possibly could improve this with a heuristic.
-- 
GitLab


From 0c7d0816892faaf6cc1fc07fdf2c5cdf56892ad5 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Tue, 28 May 2024 15:04:08 -0700
Subject: [PATCH 172/391] Build aom_av1_rc when BUILD_SHARED_LIBS is ON

Based on the requirements given in
https://aomedia-review.googlesource.com/c/aom/+/188606: build the
aom_av1_rc library in a shared fashion, as libaom_av1_rc.so, when
BUILD_SHARED_LIBS is ON.

Change-Id: Ic242113fe87511fe6660e52da8313d42b8fba7d3
---
 CMakeLists.txt  | 37 +++++++++++++++++++++++++------------
 test/test.cmake |  1 -
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f2e2e9f783..2a9c5d18a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -323,11 +323,28 @@ if(NOT WIN32 AND NOT APPLE)
   endif()
 endif()
 
-if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
+if(CONFIG_AV1_ENCODER)
   list(APPEND AOM_AV1_RC_SOURCES "${AOM_ROOT}/av1/ratectrl_rtc.h"
               "${AOM_ROOT}/av1/ratectrl_rtc.cc")
   add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES})
-  target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom)
+  # aom_av1_rc calls libaom's internal functions, so it must be linked with the
+  # libaom static library.
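  # (When BUILD_SHARED_LIBS is ON the build also creates an aom_static
  # archive target alongside the aom shared library, as the branch below
  # shows; linking aom_av1_rc against aom_static keeps the internal libaom
  # symbols it needs visible at link time.)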
+ if(BUILD_SHARED_LIBS) + target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom_static) + else() + target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom) + endif() + if(BUILD_SHARED_LIBS) + # On Windows, global symbols are not exported from a DLL by default. Enable + # the WINDOWS_EXPORT_ALL_SYMBOLS property to export all global symbols from + # the aom_av1_rc DLL on Windows, to match the default behavior on other + # platforms. + set_target_properties(aom_av1_rc PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + # The aom_av1_rc library and its header "av1/ratectrl_rtc.h" are not + # installed by the "install" command, so we don't need to worry about + # versioning the aom_av1_rc shared library. If we start to install the + # aom_av1_rc library, the library should be versioned. + endif() if(NOT WIN32 AND NOT APPLE) target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} m) endif() @@ -336,7 +353,7 @@ endif() # List of object and static library targets. set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom) -if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS) +if(CONFIG_AV1_ENCODER) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_rc) endif() if(BUILD_SHARED_LIBS) @@ -487,19 +504,15 @@ if(CONFIG_AV1_ENCODER) add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c" $<TARGET_OBJECTS:aom_common_app_util> $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc) # Maintain a list of encoder example targets. list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model photon_noise_table set_maps simple_encoder scalable_encoder - twopass_encoder) - - if(NOT BUILD_SHARED_LIBS) - add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc" - $<TARGET_OBJECTS:aom_common_app_util> - $<TARGET_OBJECTS:aom_encoder_app_util>) - target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc) - list(APPEND AOM_ENCODER_EXAMPLE_TARGETS svc_encoder_rtc) - endif() + svc_encoder_rtc twopass_encoder) endif() if(ENABLE_TOOLS) diff --git a/test/test.cmake b/test/test.cmake index da144683ae..02e85f82fc 100644 --- a/test/test.cmake +++ b/test/test.cmake @@ -635,7 +635,6 @@ function(setup_aom_test_targets) if(CONFIG_AV1_ENCODER AND ENABLE_TESTS AND CONFIG_WEBM_IO - AND NOT BUILD_SHARED_LIBS AND NOT CONFIG_REALTIME_ONLY) add_executable(test_aom_rc ${AOM_RC_TEST_SOURCES}) target_link_libraries(test_aom_rc ${AOM_LIB_LINK_TYPE} aom_av1_rc aom_gtest) -- GitLab From d9c15a14966daac458f270227c0ea6a9fa486f43 Mon Sep 17 00:00:00 2001 From: Gerda Zsejke More <gerdazsejke.more@arm.com> Date: Sat, 18 May 2024 19:57:03 +0200 Subject: [PATCH 173/391] Add Neon DotProd implementation for av1_convolve_2d_scale Add an Armv8.4 DotProd implementation for the horizontal filtering part of av1_convolve_2d_scale. The vertical pass operates on 16-bit types so the Armv8.0 implementation must still be used. Move functions associated with the vertical pass into a header file so they can be shared by both the Armv8.0 and Armv8.4 DotProd paths. Add the corresponding tests as well. 
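For illustration, a scalar model of the range transform the DotProd path
relies on (an assumed sketch, not code from this patch): the unsigned 8-bit
samples are shifted into [-128, 127] so the signed dot-product instruction
can be used, and the constant this introduces is folded back into the
accumulator offset. Since the 8-tap interpolation filters sum to
1 << FILTER_BITS, subtracting 128 from every sample changes the result by
exactly 128 << FILTER_BITS:

    // Scalar model of the DotProd horizontal kernel (illustrative only);
    // FILTER_BITS is 7 in libaom, so the filter taps sum to 128.
    int32_t convolve8_ref(const uint8_t *s, const int16_t *f) {
      int32_t sum = 128 << 7;  // 128 << FILTER_BITS, the compensation term
      for (int k = 0; k < 8; k++) {
        sum += (s[k] - 128) * f[k];  // samples transformed to [-128, 127]
      }
      return sum;  // equal to the plain sum of s[k] * f[k]
    }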
Change-Id: I4865953a1aff7bde9b786d65a140da2501894cc7 --- av1/av1.cmake | 1 + av1/common/arm/av1_convolve_scale_neon.c | 458 +---------------- .../arm/av1_convolve_scale_neon_dotprod.c | 232 +++++++++ av1/common/arm/convolve_scale_neon.h | 480 ++++++++++++++++++ av1/common/av1_rtcd_defs.pl | 2 +- test/av1_convolve_scale_test.cc | 7 + 6 files changed, 722 insertions(+), 458 deletions(-) create mode 100644 av1/common/arm/av1_convolve_scale_neon_dotprod.c create mode 100644 av1/common/arm/convolve_scale_neon.h diff --git a/av1/av1.cmake b/av1/av1.cmake index 232e00ff27..f67778a202 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -404,6 +404,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON_DOTPROD + "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon_dotprod.c" "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_dotprod.c" "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c") diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c index b8d934e796..d9c2967d36 100644 --- a/av1/common/arm/av1_convolve_scale_neon.c +++ b/av1/common/arm/av1_convolve_scale_neon.c @@ -17,463 +17,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" - -static INLINE int16x4_t compound_convolve8_4_v( - const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, - const int32x4_t offset_const) { - const int16x4_t filter_0_3 = vget_low_s16(filter); - const int16x4_t filter_4_7 = vget_high_s16(filter); - - int32x4_t sum = offset_const; - sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); - sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); - sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); - sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); - sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); - sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); - sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); - sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); - - return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS); -} - -static INLINE int16x8_t compound_convolve8_8_v( - const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, - const int32x4_t offset_const) { - const int16x4_t filter_0_3 = vget_low_s16(filter); - const int16x4_t filter_4_7 = vget_high_s16(filter); - - int32x4_t sum0 = offset_const; - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); - - int32x4_t sum1 = offset_const; - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), 
filter_4_7, 2); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); - - int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS); - int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS); - - return vcombine_s16(res0, res1); -} - -static INLINE void compound_convolve_vert_scale_neon( - const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { - const int bd = 8; - const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; - // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use - // non-rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. - const int32x4_t vert_offset = - vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); - - int y_qn = subpel_y_qn; - - if (w == 4) { - do { - const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - - const ptrdiff_t filter_offset = - SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); - const int16x8_t filter = vld1q_s16(y_filter + filter_offset); - - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; - load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, - filter, vert_offset); - - vst1_u16(dst, vreinterpret_u16_s16(d0)); - - dst += dst_stride; - y_qn += y_step_qn; - } while (--h != 0); - } else { - do { - const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - - const ptrdiff_t filter_offset = - SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); - const int16x8_t filter = vld1q_s16(y_filter + filter_offset); - - int width = w; - uint16_t *d = dst; - - do { - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; - load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, - filter, vert_offset); - - vst1q_u16(d, vreinterpretq_u16_s16(d0)); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - - dst += dst_stride; - y_qn += y_step_qn; - } while (--h != 0); - } -} - -static INLINE void compound_avg_convolve_vert_scale_neon( - const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, - uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, - int subpel_y_qn, int y_step_qn) { - const int bd = 8; - const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; - // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use - // non-rounding shifts - which are generally faster than rounding shifts - // on modern CPUs. - const int32_t vert_offset_bits = - (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)); - // For the averaging code path substract round offset and convolve round. 
- const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits); - const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits); - - int y_qn = subpel_y_qn; - - if (w == 4) { - do { - const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - - const ptrdiff_t filter_offset = - SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); - const int16x8_t filter = vld1q_s16(y_filter + filter_offset); - - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; - load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, - filter, vert_offset); - - int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); - - int16x4_t avg = vhadd_s16(dd0, d0); - int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0)); - - uint8x8_t d0_u8 = vqrshrun_n_s16( - d0_s16, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS)); - - store_u8_4x1(dst8, d0_u8); - - dst16 += dst16_stride; - dst8 += dst8_stride; - y_qn += y_step_qn; - } while (--h != 0); - } else { - do { - const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - - const ptrdiff_t filter_offset = - SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); - const int16x8_t filter = vld1q_s16(y_filter + filter_offset); - - int width = w; - uint8_t *dst8_ptr = dst8; - uint16_t *dst16_ptr = dst16; - - do { - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; - load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, - filter, vert_offset); - - int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); - - int16x8_t avg = vhaddq_s16(dd0, d0); - - uint8x8_t d0_u8 = vqrshrun_n_s16( - avg, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS)); - - vst1_u8(dst8_ptr, d0_u8); - - s += 8; - dst8_ptr += 8; - dst16_ptr += 8; - width -= 8; - } while (width != 0); - - dst16 += dst16_stride; - dst8 += dst8_stride; - y_qn += y_step_qn; - } while (--h != 0); - } -} - -static INLINE void compound_dist_wtd_convolve_vert_scale_neon( - const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, - uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, - ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) { - const int bd = 8; - const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; - int y_qn = subpel_y_qn; - // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use - // non-rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. - const int32x4_t vert_offset = - vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); - // For the weighted averaging code path we have to substract round offset and - // convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS - - // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The - // additional shift by DIST_PRECISION_BITS is needed in order to merge two - // shift calculations into one. 
- const int32x4_t dist_wtd_offset = vdupq_n_s32( - (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 + - DIST_PRECISION_BITS)) - - (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) - - (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS))); - const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset); - const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset); - - if (w == 4) { - do { - const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - - const ptrdiff_t filter_offset = - SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); - const int16x8_t filter = vld1q_s16(y_filter + filter_offset); - - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; - load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, - filter, vert_offset); - - int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); - - int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0); - dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0); - - int16x4_t d0_s16 = vshrn_n_s32( - dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + - DIST_PRECISION_BITS); - - uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0))); - - store_u8_4x1(dst8, d0_u8); - - dst16 += dst16_stride; - dst8 += dst8_stride; - y_qn += y_step_qn; - } while (--h != 0); - } else { - do { - const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - - const ptrdiff_t filter_offset = - SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); - const int16x8_t filter = vld1q_s16(y_filter + filter_offset); - - int width = w; - uint8_t *dst8_ptr = dst8; - uint16_t *dst16_ptr = dst16; - - do { - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; - load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, - filter, vert_offset); - - int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); - - int32x4_t dst_wtd_avg0 = - vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0)); - int32x4_t dst_wtd_avg1 = - vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0)); - - dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0)); - dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0)); - - int16x4_t d0_s16_0 = vshrn_n_s32( - dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + - DIST_PRECISION_BITS); - int16x4_t d0_s16_1 = vshrn_n_s32( - dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + - DIST_PRECISION_BITS); - - uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1)); - - vst1_u8(dst8_ptr, d0_u8); - - s += 8; - dst8_ptr += 8; - dst16_ptr += 8; - width -= 8; - } while (width != 0); - - dst16 += dst16_stride; - dst8 += dst8_stride; - y_qn += y_step_qn; - } while (--h != 0); - } -} - -static INLINE uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, - const int16x8_t filter, - const int32x4_t offset_const) { - const int16x4_t filter_0_3 = vget_low_s16(filter); - const int16x4_t filter_4_7 = vget_high_s16(filter); - - int32x4_t sum = offset_const; - sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); - sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); - sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); - sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); - sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); - 
sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); - sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); - sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); - - int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); - - return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0))); -} - -static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - const int16x8_t filter, - const int32x4_t offset_const) { - const int16x4_t filter_0_3 = vget_low_s16(filter); - const int16x4_t filter_4_7 = vget_high_s16(filter); - - int32x4_t sum0 = offset_const; - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); - - int32x4_t sum1 = offset_const; - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); - - int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS); - int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS); - - return vqmovun_s16(vcombine_s16(res0, res1)); -} - -static INLINE void convolve_vert_scale_neon(const int16_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, - int h, const int16_t *y_filter, - int subpel_y_qn, int y_step_qn) { - const int bd = 8; - const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; - const int round_1 = 2 * FILTER_BITS - ROUND0_BITS; - // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts. 
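  // (The shim trick, spelled out: a rounding shift
  //      (x + (1 << (n - 1))) >> n
  //  is equivalent to seeding the accumulator with 1 << (n - 1) up front
  //  and applying a plain arithmetic shift at the end, and non-rounding
  //  shifts are generally cheaper on modern CPUs.)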
- int32x4_t vert_offset = - vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1))); - - int y_qn = subpel_y_qn; - if (w == 4) { - do { - const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - - const ptrdiff_t filter_offset = - SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); - const int16x8_t filter = vld1q_s16(y_filter + filter_offset); - - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; - load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - uint8x8_t d = - convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); - - store_u8_4x1(dst, d); - - dst += dst_stride; - y_qn += y_step_qn; - } while (--h != 0); - } else if (w == 8) { - do { - const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - - const ptrdiff_t filter_offset = - SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); - const int16x8_t filter = vld1q_s16(y_filter + filter_offset); - - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; - load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - uint8x8_t d = - convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); - - vst1_u8(dst, d); - - dst += dst_stride; - y_qn += y_step_qn; - } while (--h != 0); - } else { - do { - const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - uint8_t *d = dst; - int width = w; - - const ptrdiff_t filter_offset = - SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); - const int16x8_t filter = vld1q_s16(y_filter + filter_offset); - - do { - int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; - load_s16_8x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], - &s5[0], &s6[0], &s7[0]); - load_s16_8x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], - &s5[1], &s6[1], &s7[1]); - - uint8x8_t d0 = convolve8_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], - s6[0], s7[0], filter, vert_offset); - uint8x8_t d1 = convolve8_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], - s6[1], s7[1], filter, vert_offset); - - vst1q_u8(d, vcombine_u8(d0, d1)); - - s += 16; - d += 16; - width -= 16; - } while (width != 0); - - dst += dst_stride; - y_qn += y_step_qn; - } while (--h != 0); - } -} +#include "av1/common/arm/convolve_scale_neon.h" static INLINE int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, diff --git a/av1/common/arm/av1_convolve_scale_neon_dotprod.c b/av1/common/arm/av1_convolve_scale_neon_dotprod.c new file mode 100644 index 0000000000..619efa65ed --- /dev/null +++ b/av1/common/arm/av1_convolve_scale_neon_dotprod.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/arm/convolve_scale_neon.h" + +static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const int8x8_t filter, + const int32x4_t horiz_const) { + const int8x16_t filters = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); + int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); + + int32x4_t sum01 = vdotq_s32(horiz_const, s01_128, filters); + int32x4_t sum23 = vdotq_s32(horiz_const, s23_128, filters); + + int32x4_t sum = vpaddq_s32(sum01, sum23); + + // We halved the filter values so -1 from right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7, + const int8x8_t filter, + const int32x4_t horiz_const) { + const int8x16_t filters = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + uint8x16_t s45 = vcombine_u8(s4, s5); + uint8x16_t s67 = vcombine_u8(s6, s7); + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); + int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); + int8x16_t s45_128 = vreinterpretq_s8_u8(vsubq_u8(s45, vdupq_n_u8(128))); + int8x16_t s67_128 = vreinterpretq_s8_u8(vsubq_u8(s67, vdupq_n_u8(128))); + + int32x4_t sum01 = vdotq_s32(horiz_const, s01_128, filters); + int32x4_t sum23 = vdotq_s32(horiz_const, s23_128, filters); + int32x4_t sum45 = vdotq_s32(horiz_const, s45_128, filters); + int32x4_t sum67 = vdotq_s32(horiz_const, s67_128, filters); + + int32x4_t sum0123 = vpaddq_s32(sum01, sum23); + int32x4_t sum4567 = vpaddq_s32(sum45, sum67); + + // We halved the filter values so -1 from right shift. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_horiz_scale_neon_dotprod( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter, const int subpel_x_qn, + const int x_step_qn) { + DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); + const int bd = 8; + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_offset = + (1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)); + // The shim of 128 << FILTER_BITS is needed because we are subtracting 128 + // from every source value. + const int32_t dotprod_offset = 128 << FILTER_BITS; + // Divide the total by 4: we halved the filter values and will use a pairwise + // add in the convolution kernel. + const int32x4_t horiz_offset_vec = + vdupq_n_s32((horiz_offset + dotprod_offset) >> 2); + + if (w == 4) { + do { + int x_qn = subpel_x_qn; + + // Process a 4x4 tile. 
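      // (Each pass of the loop below filters the same fractional x position
      // across four consecutive source rows, producing one column of the
      // 4x4 output tile; the tile is transposed afterwards so it can be
      // stored row-by-row.)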
+ for (int r = 0; r < 4; r++) { + const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + // Filter values are all even so halve them to fit in int8_t. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + int16x4_t d0 = convolve8_4_h(t0, t1, t2, t3, filter, horiz_offset_vec); + + vst1_s16(&temp[r * 4], d0); + + x_qn += x_step_qn; + } + + // Transpose the 4x4 result tile and store. + int16x4_t d0, d1, d2, d3; + load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + do { + int x_qn = subpel_x_qn; + int16_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; r++) { + const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + // Filter values are all even so halve them to fit in int8_t. + int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t d0 = convolve8_8_h(t0, t1, t2, t3, t4, t5, t6, t7, filter, + horiz_offset_vec); + + vst1q_s16(&temp[r * 8], d0); + + x_qn += x_step_qn; + } + + // Transpose the 8x8 result tile and store. + int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} + +void av1_convolve_2d_scale_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + if (w < 4 || h < 4) { + av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_qn, subpel_y_qn, y_step_qn, conv_params); + return; + } + + // For the interpolation 8-tap filters are used. + assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8); + + DECLARE_ALIGNED(32, int16_t, + im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + int im_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. 
+ const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1; + const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; + + // Horizontal filter + convolve_horiz_scale_neon_dotprod( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + + // Vertical filter + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + compound_convolve_vert_scale_neon( + im_block, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_neon(im_block, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } +} diff --git a/av1/common/arm/convolve_scale_neon.h b/av1/common/arm/convolve_scale_neon.h new file mode 100644 index 0000000000..c000e44f36 --- /dev/null +++ b/av1/common/arm/convolve_scale_neon.h @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_ + +#include <assert.h> +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" + +static INLINE int16x4_t compound_convolve8_4_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE int16x8_t compound_convolve8_8_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = 
vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS); + int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS); + + return vcombine_s16(res0, res1); +} + +static INLINE void compound_convolve_vert_scale_neon( + const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. + const int32x4_t vert_offset = + vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); + + int y_qn = subpel_y_qn; + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + vst1_u16(dst, vreinterpret_u16_s16(d0)); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint16_t *d = dst; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + vst1q_u16(d, vreinterpretq_u16_s16(d0)); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE void compound_avg_convolve_vert_scale_neon( + const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, + uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, + int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts + // on modern CPUs. + const int32_t vert_offset_bits = + (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)); + // For the averaging code path substract round offset and convolve round. 
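  // (Conceptually the basic compound average is (ref + res) >> 1; the
  //  vhadd halving add below computes it directly, and the offset formed
  //  here pre-subtracts the compound round offsets carried by both operands
  //  so only the final rounding shift back to 8 bits remains.)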
+  const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits);
+  const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits);
+
+  int y_qn = subpel_y_qn;
+
+  if (w == 4) {
+    do {
+      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+      const ptrdiff_t filter_offset =
+          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
+
+      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+      load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+      int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                            filter, vert_offset);
+
+      int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));
+
+      int16x4_t avg = vhadd_s16(dd0, d0);
+      int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0));
+
+      uint8x8_t d0_u8 = vqrshrun_n_s16(
+          d0_s16, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+
+      store_u8_4x1(dst8, d0_u8);
+
+      dst16 += dst16_stride;
+      dst8 += dst8_stride;
+      y_qn += y_step_qn;
+    } while (--h != 0);
+  } else {
+    do {
+      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+      const ptrdiff_t filter_offset =
+          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
+
+      int width = w;
+      uint8_t *dst8_ptr = dst8;
+      uint16_t *dst16_ptr = dst16;
+
+      do {
+        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+        load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+        int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                              filter, vert_offset);
+
+        int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));
+
+        int16x8_t avg = vhaddq_s16(dd0, d0);
+
+        uint8x8_t d0_u8 = vqrshrun_n_s16(
+            avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+
+        vst1_u8(dst8_ptr, d0_u8);
+
+        s += 8;
+        dst8_ptr += 8;
+        dst16_ptr += 8;
+        width -= 8;
+      } while (width != 0);
+
+      dst16 += dst16_stride;
+      dst8 += dst8_stride;
+      y_qn += y_step_qn;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void compound_dist_wtd_convolve_vert_scale_neon(
+    const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
+    uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
+    ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  int y_qn = subpel_y_qn;
+  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
+  // non-rounding shifts - which are generally faster than rounding shifts on
+  // modern CPUs.
+  const int32x4_t vert_offset =
+      vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
+  // For the weighted averaging code path we have to subtract the round offset
+  // and the convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS -
+  // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The
+  // additional shift by DIST_PRECISION_BITS is needed in order to merge two
+  // shift calculations into one.
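+  // Note that fwd_offset + bck_offset == 1 << DIST_PRECISION_BITS, so the
+  // weighted sum below is larger than a plain average by a factor of
+  // 1 << DIST_PRECISION_BITS; folding DIST_PRECISION_BITS into the single
+  // narrowing shift removes that factor without a second shift instruction.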
+ const int32x4_t dist_wtd_offset = vdupq_n_s32( + (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 + + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS))); + const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset); + const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset); + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); + + int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0); + dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0); + + int16x4_t d0_s16 = vshrn_n_s32( + dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0))); + + store_u8_4x1(dst8, d0_u8); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint8_t *dst8_ptr = dst8; + uint16_t *dst16_ptr = dst16; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); + + int32x4_t dst_wtd_avg0 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0)); + int32x4_t dst_wtd_avg1 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0)); + + dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0)); + dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0)); + + int16x4_t d0_s16_0 = vshrn_n_s32( + dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + int16x4_t d0_s16_1 = vshrn_n_s32( + dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1)); + + vst1_u8(dst8_ptr, d0_u8); + + s += 8; + dst8_ptr += 8; + dst16_ptr += 8; + width -= 8; + } while (width != 0); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + 
sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0))); +} + +static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS); + int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res0, res1)); +} + +static INLINE void convolve_vert_scale_neon(const int16_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, + int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int round_1 = 2 * FILTER_BITS - ROUND0_BITS; + // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts. 
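+  // The term 1 << (offset_bits - 1) backs out the offset added by the
+  // horizontal pass: that pass added 1 << (bd + FILTER_BITS - 1) ahead of
+  // its ROUND0_BITS shift, and the vertical taps (which sum to
+  // 1 << FILTER_BITS) scale the remainder back up to 1 << (offset_bits - 1).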
+ int32x4_t vert_offset = + vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1))); + + int y_qn = subpel_y_qn; + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d = + convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); + + store_u8_4x1(dst, d); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else if (w == 8) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d = + convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); + + vst1_u8(dst, d); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + do { + int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + load_s16_8x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], + &s5[0], &s6[0], &s7[0]); + load_s16_8x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], + &s5[1], &s6[1], &s7[1]); + + uint8x8_t d0 = convolve8_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], + s6[0], s7[0], filter, vert_offset); + uint8x8_t d1 = convolve8_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], + s6[1], s7[1], filter, vert_offset); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_ diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 7fa8b4e6d0..f70dce4442 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -606,7 +606,7 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_convolve_x_sr_intrabc neon/; specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_convolve_y_sr_intrabc neon/; - specialize qw/av1_convolve_2d_scale sse4_1 neon/; + specialize qw/av1_convolve_2d_scale sse4_1 neon neon_dotprod/; specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/; specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/; diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc index a428f41f53..2a4bae48ba 100644 --- a/test/av1_convolve_scale_test.cc +++ b/test/av1_convolve_scale_test.cc @@ -394,6 +394,13 @@ INSTANTIATE_TEST_SUITE_P( ::testing::ValuesIn(kBlockDim))); #endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, LowBDConvolveScaleTest, + ::testing::Combine(::testing::Values(av1_convolve_2d_scale_neon_dotprod), + ::testing::ValuesIn(kBlockDim))); +#endif // HAVE_NEON_DOTPROD + #if HAVE_SSE4_1 
INSTANTIATE_TEST_SUITE_P( SSE4_1, LowBDConvolveScaleTest, -- GitLab From 1d0ab49189c19a756015503b529919f145f75f7f Mon Sep 17 00:00:00 2001 From: Gerda Zsejke More <gerdazsejke.more@arm.com> Date: Sat, 25 May 2024 17:04:32 +0200 Subject: [PATCH 174/391] Add Neon I8MM implementation for av1_convolve_2d_scale Add an Armv8.6 I8MM implementation for the horizontal filtering part of av1_convolve_2d_scale. Add the corresponding tests as well. Change-Id: Id27aad4e4ce24266cfc75ac8073c38be3fc75eb9 --- av1/av1.cmake | 1 + av1/common/arm/av1_convolve_scale_neon_i8mm.c | 220 ++++++++++++++++++ av1/common/av1_rtcd_defs.pl | 2 +- test/av1_convolve_scale_test.cc | 7 + 4 files changed, 229 insertions(+), 1 deletion(-) create mode 100644 av1/common/arm/av1_convolve_scale_neon_i8mm.c diff --git a/av1/av1.cmake b/av1/av1.cmake index f67778a202..99ce3fb68b 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -409,6 +409,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON_DOTPROD "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM + "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/convolve_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c") diff --git a/av1/common/arm/av1_convolve_scale_neon_i8mm.c b/av1/common/arm/av1_convolve_scale_neon_i8mm.c new file mode 100644 index 0000000000..d1d0ae0c91 --- /dev/null +++ b/av1/common/arm/av1_convolve_scale_neon_i8mm.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/arm/convolve_scale_neon.h" + +static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const int8x8_t filter, + const int32x4_t horiz_const) { + const int8x16_t filters = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + + int32x4_t sum01 = vusdotq_s32(horiz_const, s01, filters); + int32x4_t sum23 = vusdotq_s32(horiz_const, s23, filters); + + int32x4_t sum = vpaddq_s32(sum01, sum23); + + // We halved the filter values so -1 from right shift. 
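+  // Each lane of sum now holds (conv + offset) / 2: the filter values were
+  // halved and horiz_const was pre-divided by 4, with the pairwise add
+  // doubling it back to offset / 2. Shifting by ROUND0_BITS - 1 therefore
+  // matches shifting the full-precision sum by ROUND0_BITS.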
+ return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7, + const int8x8_t filter, + const int32x4_t horiz_const) { + const int8x16_t filters = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + uint8x16_t s45 = vcombine_u8(s4, s5); + uint8x16_t s67 = vcombine_u8(s6, s7); + + int32x4_t sum01 = vusdotq_s32(horiz_const, s01, filters); + int32x4_t sum23 = vusdotq_s32(horiz_const, s23, filters); + int32x4_t sum45 = vusdotq_s32(horiz_const, s45, filters); + int32x4_t sum67 = vusdotq_s32(horiz_const, s67, filters); + + int32x4_t sum0123 = vpaddq_s32(sum01, sum23); + int32x4_t sum4567 = vpaddq_s32(sum45, sum67); + + // We halved the filter values so -1 from right shift. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_horiz_scale_neon_i8mm(const uint8_t *src, + int src_stride, int16_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter, + const int subpel_x_qn, + const int x_step_qn) { + DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); + const int bd = 8; + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Divide the total by 4: we halved the filter values and will use a pairwise + // add in the convolution kernel. + const int32x4_t horiz_offset = vdupq_n_s32( + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) >> 2); + + if (w == 4) { + do { + int x_qn = subpel_x_qn; + + // Process a 4x4 tile. + for (int r = 0; r < 4; r++) { + const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + // Filter values are all even so halve them to fit in int8_t. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + int16x4_t d0 = convolve8_4_h(t0, t1, t2, t3, filter, horiz_offset); + + vst1_s16(&temp[r * 4], d0); + x_qn += x_step_qn; + } + + // Transpose the 4x4 result tile and store. + int16x4_t d0, d1, d2, d3; + load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + do { + int x_qn = subpel_x_qn; + int16_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; r++) { + const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + // Filter values are all even so halve them to fit in int8_t. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t d0 = convolve8_8_h(t0, t1, t2, t3, t4, t5, t6, t7, filter, + horiz_offset); + + vst1q_s16(&temp[r * 8], d0); + + x_qn += x_step_qn; + } + + // Transpose the 8x8 result tile and store. 
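+        // The r loop above walks output x positions, so each row of temp is
+        // one output column; transposing restores row-major order.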
+ int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} + +void av1_convolve_2d_scale_neon_i8mm(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + if (w < 4 || h < 4) { + av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_qn, subpel_y_qn, y_step_qn, conv_params); + return; + } + + // For the interpolation 8-tap filters are used. + assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8); + + DECLARE_ALIGNED(32, int16_t, + im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + int im_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1; + const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; + + // Horizontal filter + convolve_horiz_scale_neon_i8mm( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + + // Vertical filter + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + compound_convolve_vert_scale_neon( + im_block, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_neon(im_block, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } +} diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index f70dce4442..e27613a19f 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -606,7 +606,7 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_convolve_x_sr_intrabc neon/; specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_convolve_y_sr_intrabc neon/; - specialize qw/av1_convolve_2d_scale sse4_1 neon neon_dotprod/; + specialize qw/av1_convolve_2d_scale sse4_1 neon neon_dotprod neon_i8mm/; specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/; specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/; diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc index 2a4bae48ba..c7debe27ab 100644 --- 
a/test/av1_convolve_scale_test.cc
+++ b/test/av1_convolve_scale_test.cc
@@ -401,6 +401,13 @@ INSTANTIATE_TEST_SUITE_P(
                            ::testing::ValuesIn(kBlockDim)));
 #endif  // HAVE_NEON_DOTPROD
 
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+    NEON_I8MM, LowBDConvolveScaleTest,
+    ::testing::Combine(::testing::Values(av1_convolve_2d_scale_neon_i8mm),
+                       ::testing::ValuesIn(kBlockDim)));
+#endif  // HAVE_NEON_I8MM
+
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, LowBDConvolveScaleTest,
--
GitLab


From cd4d6d1d2aebb495891e47b4efa7f178e337a2d8 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Thu, 30 May 2024 16:31:30 -0700
Subject: [PATCH 175/391] Fix ClangTidy misc-include-cleaner warnings

Fix the following warnings:
no header providing "int16_t" is directly included
no header providing "ROUND0_BITS" is directly included
no header providing "SCALE_EXTRA_BITS" is directly included
no header providing "InterpFilterParams" is directly included
no header providing "MAX_FILTER_TAP" is directly included
no header providing "UNLIKELY" is directly included

Change-Id: I94bdb705a460f63a3566ef1f0ad7539af0f6dafc
---
 av1/common/arm/av1_convolve_scale_neon.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c
index d9c2967d36..f1e6732abe 100644
--- a/av1/common/arm/av1_convolve_scale_neon.c
+++ b/av1/common/arm/av1_convolve_scale_neon.c
@@ -11,13 +11,18 @@
 
 #include <arm_neon.h>
 #include <assert.h>
+#include <stdint.h>
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
 #include "av1/common/arm/convolve_scale_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
 
 static INLINE int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
--
GitLab


From 18925daf07b4b7781f8cef73424d1bed734d8ec1 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Thu, 30 May 2024 20:32:48 -0700
Subject: [PATCH 176/391] rtc: Condition QP adjustment on rc->q_1/2_frame > 0

This fix only affects the initial QPs after the very first frame for
TL enhancement frames in CBR mode.

Move the check that rc->q_1/2_frame is set to the top of the
adjustment logic; otherwise, if there are dropped frames after the
very first frame, the QP can oscillate on temporal enhancement
frames. Because of some further logic/adjustments, this behavior
would likely only happen when the TL enhancement frame_bandwidth is
not smaller than the TL0 frame_bandwidth.

Bug: b/343429192
Change-Id: I121934c2c7e7522e4e40762191ed3005437841c2
---
 av1/encoder/ratectrl.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index af707be5b5..86dbdcaf7f 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -588,18 +588,19 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality,
   // Apply some control/clamp to QP under certain conditions.
   // Delay the use of the clamping for svc until after num_temporal_layers,
   // to make they have been set for each temporal layer.
+  // Check for rc->q_1/2_frame > 0 in case they have not been set due to
+  // dropped frames.
if (!frame_is_intra_only(cm) && rc->frames_since_key > 1 && + rc->q_1_frame > 0 && rc->q_2_frame > 0 && (!cpi->ppi->use_svc || svc->current_superframe > (unsigned int)svc->number_temporal_layers) && !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl && (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct || !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) { // If in the previous two frames we have seen both overshoot and undershoot - // clamp Q between the two. Check for rc->q_1/2_frame > 0 in case they have - // not been set due to dropped frames. + // clamp Q between the two. if (rc->rc_1_frame * rc->rc_2_frame == -1 && - rc->q_1_frame != rc->q_2_frame && rc->q_1_frame > 0 && - rc->q_2_frame > 0 && !overshoot_buffer_low) { + rc->q_1_frame != rc->q_2_frame && !overshoot_buffer_low) { int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame), AOMMAX(rc->q_1_frame, rc->q_2_frame)); // If the previous frame had overshoot and the current q needs to -- GitLab From 5a44b2f044f22a64685f5e26a492c0b90268c4a3 Mon Sep 17 00:00:00 2001 From: George Steed <george.steed@arm.com> Date: Thu, 23 May 2024 14:39:59 +0100 Subject: [PATCH 177/391] Improve AArch64 performance of highbd warp affine The common (non-boundary) code path for warp affine currently loads two vectors of data and then uses EXT repeatedly to set multiple vectors starting at each possible offset. We can avoid the need for the EXT instructions by instead loading at the multiple offsets directly, trading more loads for fewer vector instructions. In the uncommon boundary code path we simply keep the EXT instructions as they were before. On a Neoverse V2 machine with LLVM 17, this reduces the times reported by the highbd warp affine speed tests by a geomean of ~4.2%. Change-Id: Ic0e66b76daa44c3d31d7022e57734cbec5da1af4 --- av1/common/arm/highbd_warp_plane_neon.c | 68 ++------- av1/common/arm/highbd_warp_plane_neon.h | 175 +++++++++++++++++------- av1/common/arm/highbd_warp_plane_sve.c | 68 ++------- 3 files changed, 144 insertions(+), 167 deletions(-) diff --git a/av1/common/arm/highbd_warp_plane_neon.c b/av1/common/arm/highbd_warp_plane_neon.c index 89647bc921..51bf142fed 100644 --- a/av1/common/arm/highbd_warp_plane_neon.c +++ b/av1/common/arm/highbd_warp_plane_neon.c @@ -24,19 +24,11 @@ #include "highbd_warp_plane_neon.h" static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, int sx, int alpha) { +highbd_horizontal_filter_4x1_f4(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx, int alpha) { int16x8_t f[4]; load_filters_4(f, sx, alpha); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); - int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); @@ -57,28 +49,12 @@ highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, int sx, int alpha) { return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } -static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, int sx, int alpha) { +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f8( + int16x8_t rv0, 
int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx, int alpha) { int16x8_t f[8]; load_filters_8(f, sx, alpha); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); - int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 4); - int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 5); - int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 6); - int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 7); - int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); @@ -112,18 +88,10 @@ highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, int sx, int alpha) { } static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, int sx) { +highbd_horizontal_filter_4x1_f1(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx) { int16x8_t f = load_filters_1(sx); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); - int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); @@ -144,27 +112,11 @@ highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, int sx) { return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } -static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, int sx) { +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f1( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx) { int16x8_t f = load_filters_1(sx); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); - int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 4); - int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 5); - int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 6); - int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 7); - int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f), 
vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); diff --git a/av1/common/arm/highbd_warp_plane_neon.h b/av1/common/arm/highbd_warp_plane_neon.h index 48af4a707b..2ec45d1e0d 100644 --- a/av1/common/arm/highbd_warp_plane_neon.h +++ b/av1/common/arm/highbd_warp_plane_neon.h @@ -24,16 +24,19 @@ #include "config/av1_rtcd.h" static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, int sx, int alpha); +highbd_horizontal_filter_4x1_f4(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx, int alpha); -static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, int sx, int alpha); +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f8( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx, int alpha); -static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, int sx); +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_4x1_f1( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int bd, int sx); -static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, int sx); +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f1( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx); static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy); @@ -99,6 +102,29 @@ static AOM_FORCE_INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit))); } +static AOM_FORCE_INLINE uint16x8x2_t clamp_horizontal( + uint16x8x2_t src_1, int out_of_boundary_left, int out_of_boundary_right, + const uint16_t *ref, int iy, int stride, int width, const uint16x8_t indx0, + const uint16x8_t indx1) { + if (out_of_boundary_left >= 0) { + uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); + uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); + uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); + } + if (out_of_boundary_right >= 0) { + uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); + uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); + uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); + } + return src_1; +} + static AOM_FORCE_INLINE void warp_affine_horizontal(const uint16_t *ref, int width, int height, int stride, int p_width, @@ -134,73 +160,120 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(const uint16_t *ref, const int out_of_boundary_left = -(ix4 - 6); const int out_of_boundary_right = (ix4 + 8) - width; -#define APPLY_HORIZONTAL_SHIFT(fn, ...) 
\ - do { \ - if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ - for (int k = 0; k < 15; ++k) { \ - const int iy = clamp(iy4 + k - 7, 0, height - 1); \ - uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ - \ - if (out_of_boundary_left >= 0) { \ - uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); \ - uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); \ - uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); \ - uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); \ - src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \ - src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \ - } \ - if (out_of_boundary_right >= 0) { \ - uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); \ - uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); \ - uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); \ - uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); \ - src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \ - src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \ - } \ - tmp[k] = (fn)(src_1, __VA_ARGS__); \ - } \ - } else { \ - for (int k = 0; k < 15; ++k) { \ - const int iy = clamp(iy4 + k - 7, 0, height - 1); \ - uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ - tmp[k] = (fn)(src_1, __VA_ARGS__); \ - } \ - } \ +#define APPLY_HORIZONTAL_SHIFT_4X1(fn, ...) \ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + src_1 = clamp_horizontal(src_1, out_of_boundary_left, \ + out_of_boundary_right, ref, iy, stride, \ + width, indx0, indx1); \ + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 0); \ + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 1); \ + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 2); \ + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 3); \ + tmp[k] = (fn)(rv0, rv1, rv2, rv3, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + const uint16_t *src = ref + iy * stride + ix4; \ + int16x8_t rv0 = vreinterpretq_s16_u16(vld1q_u16(src - 7)); \ + int16x8_t rv1 = vreinterpretq_s16_u16(vld1q_u16(src - 6)); \ + int16x8_t rv2 = vreinterpretq_s16_u16(vld1q_u16(src - 5)); \ + int16x8_t rv3 = vreinterpretq_s16_u16(vld1q_u16(src - 4)); \ + tmp[k] = (fn)(rv0, rv1, rv2, rv3, __VA_ARGS__); \ + } \ + } \ + } while (0) + +#define APPLY_HORIZONTAL_SHIFT_8X1(fn, ...) 
\ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + src_1 = clamp_horizontal(src_1, out_of_boundary_left, \ + out_of_boundary_right, ref, iy, stride, \ + width, indx0, indx1); \ + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 0); \ + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 1); \ + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 2); \ + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 3); \ + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 4); \ + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 5); \ + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 6); \ + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 7); \ + tmp[k] = (fn)(rv0, rv1, rv2, rv3, rv4, rv5, rv6, rv7, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + const uint16_t *src = ref + iy * stride + ix4; \ + int16x8_t rv0 = vreinterpretq_s16_u16(vld1q_u16(src - 7)); \ + int16x8_t rv1 = vreinterpretq_s16_u16(vld1q_u16(src - 6)); \ + int16x8_t rv2 = vreinterpretq_s16_u16(vld1q_u16(src - 5)); \ + int16x8_t rv3 = vreinterpretq_s16_u16(vld1q_u16(src - 4)); \ + int16x8_t rv4 = vreinterpretq_s16_u16(vld1q_u16(src - 3)); \ + int16x8_t rv5 = vreinterpretq_s16_u16(vld1q_u16(src - 2)); \ + int16x8_t rv6 = vreinterpretq_s16_u16(vld1q_u16(src - 1)); \ + int16x8_t rv7 = vreinterpretq_s16_u16(vld1q_u16(src - 0)); \ + tmp[k] = (fn)(rv0, rv1, rv2, rv3, rv4, rv5, rv6, rv7, __VA_ARGS__); \ + } \ + } \ } while (0) if (p_width == 4) { if (beta == 0) { if (alpha == 0) { - APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd, sx4); + APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f1, bd, sx4); } else { - APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd, sx4, alpha); + APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f4, bd, sx4, + alpha); } } else { if (alpha == 0) { - APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd, - (sx4 + beta * (k - 3))); + APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f1, bd, + (sx4 + beta * (k - 3))); } else { - APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd, - (sx4 + beta * (k - 3)), alpha); + APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f4, bd, + (sx4 + beta * (k - 3)), alpha); } } } else { if (beta == 0) { if (alpha == 0) { - APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd, sx4); + APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f1, bd, sx4); } else { - APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd, sx4, alpha); + APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f8, bd, sx4, + alpha); } } else { if (alpha == 0) { - APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd, - (sx4 + beta * (k - 3))); + APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f1, bd, + (sx4 + beta * (k - 3))); } else { - APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd, - (sx4 + beta * (k - 3)), alpha); + 
APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f8, bd, + (sx4 + beta * (k - 3)), alpha); } } } + +#undef APPLY_HORIZONTAL_SHIFT_4X1 +#undef APPLY_HORIZONTAL_SHIFT_8X1 } static AOM_FORCE_INLINE void highbd_vertical_filter_4x1_f4( diff --git a/av1/common/arm/highbd_warp_plane_sve.c b/av1/common/arm/highbd_warp_plane_sve.c index 87e033fd00..c2e1e995bd 100644 --- a/av1/common/arm/highbd_warp_plane_sve.c +++ b/av1/common/arm/highbd_warp_plane_sve.c @@ -25,19 +25,11 @@ #include "highbd_warp_plane_neon.h" static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, int sx, int alpha) { +highbd_horizontal_filter_4x1_f4(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx, int alpha) { int16x8_t f[4]; load_filters_4(f, sx, alpha); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); - int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); @@ -55,28 +47,12 @@ highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, int sx, int alpha) { return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } -static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, int sx, int alpha) { +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f8( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx, int alpha) { int16x8_t f[8]; load_filters_8(f, sx, alpha); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); - int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 4); - int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 5); - int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 6); - int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 7); - int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); @@ -104,18 +80,10 @@ highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, int sx, int alpha) { } static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, int sx) { +highbd_horizontal_filter_4x1_f1(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx) { int16x8_t f = load_filters_1(sx); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); 
- int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); @@ -133,27 +101,11 @@ highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, int sx) { return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } -static AOM_FORCE_INLINE int16x8_t -highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, int sx) { +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f1( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx) { int16x8_t f = load_filters_1(sx); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); - int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 4); - int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 5); - int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 6); - int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 7); - int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); -- GitLab From 1c1f80b4ba7e28fbd42fed6a15a02e982973ed5d Mon Sep 17 00:00:00 2001 From: Paul Wilkins <paulwilkins@google.com> Date: Thu, 23 May 2024 17:56:57 +0100 Subject: [PATCH 178/391] Fix further overflow issue in VBR. This patch fixes some additional cases where under extreme conditions some of the VBR adjustment variables can wrap. As this happens on a per frame level the extra saturation checks should not be an issue for performance. Change-Id: If67a39ed92a7c0358f4006b946c6a2b4225db545 --- av1/encoder/pass2_strategy.c | 19 +++++++++++------- av1/encoder/ratectrl.c | 37 ++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c index 8618212f66..6b63afc399 100644 --- a/av1/encoder/pass2_strategy.c +++ b/av1/encoder/pass2_strategy.c @@ -3401,14 +3401,18 @@ static int get_section_target_bandwidth(AV1_COMP *cpi) { CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->ppi->twopass; - int section_target_bandwidth; + int64_t section_target_bandwidth; const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count - current_frame->frame_number); if (cpi->ppi->lap_enabled) - section_target_bandwidth = (int)rc->avg_frame_bandwidth; - else - section_target_bandwidth = (int)(twopass->bits_left / frames_left); - return section_target_bandwidth; + section_target_bandwidth = rc->avg_frame_bandwidth; + else { + section_target_bandwidth = twopass->bits_left / frames_left; + section_target_bandwidth = (section_target_bandwidth < INT_MAX) + ? 
section_target_bandwidth + : INT_MAX; + } + return (int)section_target_bandwidth; } static INLINE void set_twopass_params_based_on_fp_stats( @@ -4267,8 +4271,9 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) { if (rc->projected_frame_size < fast_extra_thresh) { p_rc->vbr_bits_off_target_fast += fast_extra_thresh - rc->projected_frame_size; - p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast, - (4 * rc->avg_frame_bandwidth)); + p_rc->vbr_bits_off_target_fast = + AOMMIN(p_rc->vbr_bits_off_target_fast, + (4 * (int64_t)rc->avg_frame_bandwidth)); } } diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 86dbdcaf7f..148334aa8a 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -2595,6 +2595,8 @@ static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { #else int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target; #endif + int64_t frame_target = *this_frame_target; + const int stats_count = cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count @@ -2603,13 +2605,13 @@ static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { 16, (int)(stats_count - (int)cpi->common.current_frame.frame_number)); assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100); if (frame_window > 0) { - const int max_delta = (int)AOMMIN( - abs((int)(vbr_bits_off_target / frame_window)), - ((int64_t)(*this_frame_target) * VBR_PCT_ADJUSTMENT_LIMIT) / 100); + const int64_t max_delta = + AOMMIN(llabs((vbr_bits_off_target / frame_window)), + (frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100); // vbr_bits_off_target > 0 means we have extra bits to spend // vbr_bits_off_target < 0 we are currently overshooting - *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta; + frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta; } #if CONFIG_FPMT_TEST @@ -2626,32 +2628,35 @@ static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { p_rc->vbr_bits_off_target_fast && #endif !rc->is_src_frame_alt_ref) { - int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target); - int fast_extra_bits; + int64_t one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, frame_target); + int64_t fast_extra_bits; #if CONFIG_FPMT_TEST - fast_extra_bits = (int)AOMMIN(vbr_bits_off_target_fast, one_frame_bits); + fast_extra_bits = AOMMIN(vbr_bits_off_target_fast, one_frame_bits); fast_extra_bits = - (int)AOMMIN(fast_extra_bits, - AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8)); + AOMMIN(fast_extra_bits, + AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8)); #else + fast_extra_bits = AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits); fast_extra_bits = - (int)AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits); - fast_extra_bits = (int)AOMMIN( - fast_extra_bits, - AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8)); + AOMMIN(fast_extra_bits, + AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8)); #endif + fast_extra_bits = (fast_extra_bits < INT_MAX) ? fast_extra_bits : INT_MAX; if (fast_extra_bits > 0) { - // Update this_frame_target only if additional bits are available from + // Update frame_target only if additional bits are available from // local undershoot. - *this_frame_target += (int)fast_extra_bits; + frame_target += fast_extra_bits; } // Store the fast_extra_bits of the frame and reduce it from // vbr_bits_off_target_fast during postencode stage. 
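+    // fast_extra_bits was saturated to INT_MAX above, so the narrowing cast
+    // below cannot overflow.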
- rc->frame_level_fast_extra_bits = fast_extra_bits; + rc->frame_level_fast_extra_bits = (int)fast_extra_bits; // Retaining the condition to udpate during postencode stage since // fast_extra_bits are calculated based on vbr_bits_off_target_fast. cpi->do_update_vbr_bits_off_target_fast = 1; } + + // Clamp the target for the frame to the maximum allowed for one frame. + *this_frame_target = (int)((frame_target < INT_MAX) ? frame_target : INT_MAX); } void av1_set_target_rate(AV1_COMP *cpi, int width, int height) { -- GitLab From a0f61021becd361837e07a0dc943f78da5cac39a Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 30 May 2024 16:28:54 -0700 Subject: [PATCH 179/391] av1_compute_stats_sve: add missing height check downsample_height may be 0; compute_stats_win7_sve and compute_stats_win5_sve assume it is at least 1. This fixes a crash with heights < downsample_factor as exhibited by SearchWienerTest.8bitSignedIntegerOverflowInLinsolveWiener. The checks align with those in av1_compute_stats_neon(). Change-Id: Ia3c8210816a71213640a79c84714c5ae8d2ee1a9 --- av1/encoder/arm/pickrst_sve.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/av1/encoder/arm/pickrst_sve.c b/av1/encoder/arm/pickrst_sve.c index e865dadd41..5d7370b5da 100644 --- a/av1/encoder/arm/pickrst_sve.c +++ b/av1/encoder/arm/pickrst_sve.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include <arm_sve.h> +#include <assert.h> #include <string.h> #include "config/aom_config.h" @@ -159,6 +160,7 @@ static INLINE void compute_stats_win7_sve(int16_t *dgd_avg, int dgd_avg_stride, int64_t H_tmp[49 * 49]; memset(H_tmp, 0, sizeof(H_tmp)); + assert(height > 0); do { // Cross-correlation (M). for (int row = 0; row < wiener_win; row++) { @@ -292,6 +294,7 @@ static INLINE void compute_stats_win5_sve(int16_t *dgd_avg, int dgd_avg_stride, int64_t H_tmp[25 * 25]; memset(H_tmp, 0, sizeof(H_tmp)); + assert(height > 0); do { // Cross-correlation (M). for (int row = 0; row < wiener_win; row++) { @@ -435,12 +438,14 @@ void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd, // the last line of src will be scaled according to how many rows remain. const int downsample_remainder = height % downsample_factor; - if (wiener_win == WIENER_WIN) { - compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, - width, downsample_height, M, H, downsample_factor); - } else { - compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, - width, downsample_height, M, H, downsample_factor); + if (downsample_height > 0) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, + width, downsample_height, M, H, downsample_factor); + } else { + compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, + width, downsample_height, M, H, downsample_factor); + } } if (downsample_remainder > 0) { -- GitLab From afb5237d908bd04ce6ea15eae85db4a54cba289c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 3 Jun 2024 11:13:53 -0700 Subject: [PATCH 180/391] Guard DECLS(ssse3), FNS(ssse3) with #if HAVE_SSSE3 The aom_dsp/x86/variance_sse2.c file is compiled if the ENABLE_SSE2 cmake option is enabled. The file also contains some SSSE3 code. That code should only be compiled if the ENABLE_SSSE3 cmake option is enabled. 
Bug: aomedia:3578 Change-Id: I86685862e7da9506bd551b76228cdc9920c68ab1 --- aom_dsp/x86/variance_sse2.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c index 81b30072a5..e71244f1c7 100644 --- a/aom_dsp/x86/variance_sse2.c +++ b/aom_dsp/x86/variance_sse2.c @@ -415,7 +415,9 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, DECL(8, opt); \ DECL(16, opt) +#if HAVE_SSSE3 DECLS(ssse3); +#endif #undef DECLS #undef DECL @@ -491,7 +493,9 @@ DECLS(ssse3); FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) #endif +#if HAVE_SSSE3 FNS(ssse3) +#endif #undef FNS #undef FN @@ -508,7 +512,9 @@ FNS(ssse3) DECL(8, opt); \ DECL(16, opt) +#if HAVE_SSSE3 DECLS(ssse3); +#endif #undef DECL #undef DECLS @@ -588,7 +594,9 @@ DECLS(ssse3); FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) #endif +#if HAVE_SSSE3 FNS(ssse3) +#endif #undef FNS #undef FN -- GitLab From 14cdbaf87dcf3dec1973a31657d5f475906ef131 Mon Sep 17 00:00:00 2001 From: Gerda Zsejke More <gerdazsejke.more@arm.com> Date: Thu, 30 May 2024 10:01:32 +0200 Subject: [PATCH 181/391] Add 6-tap filter path for convolve_vert_scale The filter values used in the scaling algorithm are specified in the documentation of the inter-prediction process (chapter 7.11.3.4. [1]). These filter values are defined in the av1_interp_filter_params_list in filter.h. An important characteristic of these filters, except the MULTITAP_SHARP filter, is that at indices 0 and 7 the values are 0. Add an implementation for vertical filtering that specialises on 6-tap filters. This way we avoid redundant work associated with 8-tap filters. [1]https://aomediacodec.github.io/av1-spec/av1-spec.pdf Change-Id: Ib2863cd29541a3d69d128cce9da0485de8ad182f --- av1/common/arm/av1_convolve_scale_neon.c | 54 ++- .../arm/av1_convolve_scale_neon_dotprod.c | 54 ++- av1/common/arm/av1_convolve_scale_neon_i8mm.c | 54 ++- av1/common/arm/convolve_scale_neon.h | 455 +++++++++++++++++- 4 files changed, 568 insertions(+), 49 deletions(-) diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c index f1e6732abe..a972a19ef5 100644 --- a/av1/common/arm/av1_convolve_scale_neon.c +++ b/av1/common/arm/av1_convolve_scale_neon.c @@ -227,25 +227,51 @@ void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); // Vertical filter - if (UNLIKELY(conv_params->is_compound)) { - if (conv_params->do_average) { - if (conv_params->use_dist_wtd_comp_avg) { - compound_dist_wtd_convolve_vert_scale_neon( - im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, - filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + if (filter_params_y->interp_filter == MULTITAP_SHARP) { + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } } else { - compound_avg_convolve_vert_scale_neon( - im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + compound_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { - 
compound_convolve_vert_scale_neon( - im_block, im_stride, dst16, dst16_stride, w, h, - filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); } } else { - convolve_vert_scale_neon(im_block, im_stride, dst, dst_stride, w, h, - filter_params_y->filter_ptr, subpel_y_qn, - y_step_qn); + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, + subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } + } else { + compound_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } } } diff --git a/av1/common/arm/av1_convolve_scale_neon_dotprod.c b/av1/common/arm/av1_convolve_scale_neon_dotprod.c index 619efa65ed..a6f87f5144 100644 --- a/av1/common/arm/av1_convolve_scale_neon_dotprod.c +++ b/av1/common/arm/av1_convolve_scale_neon_dotprod.c @@ -208,25 +208,51 @@ void av1_convolve_2d_scale_neon_dotprod( im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); // Vertical filter - if (UNLIKELY(conv_params->is_compound)) { - if (conv_params->do_average) { - if (conv_params->use_dist_wtd_comp_avg) { - compound_dist_wtd_convolve_vert_scale_neon( - im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, - filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + if (filter_params_y->interp_filter == MULTITAP_SHARP) { + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } } else { - compound_avg_convolve_vert_scale_neon( - im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + compound_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { - compound_convolve_vert_scale_neon( - im_block, im_stride, dst16, dst16_stride, w, h, - filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); } } else { - convolve_vert_scale_neon(im_block, im_stride, dst, dst_stride, w, h, - filter_params_y->filter_ptr, subpel_y_qn, - y_step_qn); + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, + subpel_y_qn, y_step_qn); + } else { 
+ compound_avg_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } + } else { + compound_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } } } diff --git a/av1/common/arm/av1_convolve_scale_neon_i8mm.c b/av1/common/arm/av1_convolve_scale_neon_i8mm.c index d1d0ae0c91..ab215ca637 100644 --- a/av1/common/arm/av1_convolve_scale_neon_i8mm.c +++ b/av1/common/arm/av1_convolve_scale_neon_i8mm.c @@ -196,25 +196,51 @@ void av1_convolve_2d_scale_neon_i8mm(const uint8_t *src, int src_stride, im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); // Vertical filter - if (UNLIKELY(conv_params->is_compound)) { - if (conv_params->do_average) { - if (conv_params->use_dist_wtd_comp_avg) { - compound_dist_wtd_convolve_vert_scale_neon( - im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, - filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + if (filter_params_y->interp_filter == MULTITAP_SHARP) { + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } } else { - compound_avg_convolve_vert_scale_neon( - im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + compound_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { - compound_convolve_vert_scale_neon( - im_block, im_stride, dst16, dst16_stride, w, h, - filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); } } else { - convolve_vert_scale_neon(im_block, im_stride, dst, dst_stride, w, h, - filter_params_y->filter_ptr, subpel_y_qn, - y_step_qn); + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, + subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } + } else { + compound_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } } } diff --git a/av1/common/arm/convolve_scale_neon.h b/av1/common/arm/convolve_scale_neon.h index c000e44f36..2253b54037 100644 --- a/av1/common/arm/convolve_scale_neon.h +++ 
b/av1/common/arm/convolve_scale_neon.h @@ -75,7 +75,7 @@ static INLINE int16x8_t compound_convolve8_8_v( return vcombine_s16(res0, res1); } -static INLINE void compound_convolve_vert_scale_neon( +static INLINE void compound_convolve_vert_scale_8tap_neon( const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { const int bd = 8; @@ -138,7 +138,7 @@ static INLINE void compound_convolve_vert_scale_neon( } } -static INLINE void compound_avg_convolve_vert_scale_neon( +static INLINE void compound_avg_convolve_vert_scale_8tap_neon( const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { @@ -224,7 +224,7 @@ static INLINE void compound_avg_convolve_vert_scale_neon( } } -static INLINE void compound_dist_wtd_convolve_vert_scale_neon( +static INLINE void compound_dist_wtd_convolve_vert_scale_8tap_neon( const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) { @@ -392,10 +392,9 @@ static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1, return vqmovun_s16(vcombine_s16(res0, res1)); } -static INLINE void convolve_vert_scale_neon(const int16_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, - int h, const int16_t *y_filter, - int subpel_y_qn, int y_step_qn) { +static INLINE void convolve_vert_scale_8tap_neon( + const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int round_1 = 2 * FILTER_BITS - ROUND0_BITS; @@ -477,4 +476,446 @@ static INLINE void convolve_vert_scale_neon(const int16_t *src, int src_stride, } } +static INLINE int16x4_t compound_convolve6_4_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + // Filter values at indices 0 and 7 are 0. + sum = vmlal_lane_s16(sum, s0, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); + + return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE int16x8_t compound_convolve6_8_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + // Filter values at indices 0 and 7 are 0. 
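+  // (The zero tap at index 0 is also why the 6-tap call sites advance the
+  // source pointer by one row, i.e. im_block + im_stride: the first source
+  // row never contributes.)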
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
+
+  int32x4_t sum1 = offset_const;
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
+
+  int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS);
+  int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS);
+
+  return vcombine_s16(res0, res1);
+}
+
+static INLINE void compound_convolve_vert_scale_6tap_neon(
+    const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
+  // non-rounding shifts - which are generally faster than rounding shifts on
+  // modern CPUs.
+  const int32x4_t vert_offset =
+      vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
+
+  int y_qn = subpel_y_qn;
+
+  if (w == 4) {
+    do {
+      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+      const ptrdiff_t filter_offset =
+          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
+
+      int16x4_t s0, s1, s2, s3, s4, s5;
+      load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);
+
+      int16x4_t d0 =
+          compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);
+
+      vst1_u16(dst, vreinterpret_u16_s16(d0));
+
+      dst += dst_stride;
+      y_qn += y_step_qn;
+    } while (--h != 0);
+  } else {
+    do {
+      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+      const ptrdiff_t filter_offset =
+          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
+
+      int width = w;
+      uint16_t *d = dst;
+
+      do {
+        int16x8_t s0, s1, s2, s3, s4, s5;
+        load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);
+
+        int16x8_t d0 =
+            compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);
+
+        vst1q_u16(d, vreinterpretq_u16_s16(d0));
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      dst += dst_stride;
+      y_qn += y_step_qn;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void compound_avg_convolve_vert_scale_6tap_neon(
+    const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
+    uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
+    int subpel_y_qn, int y_step_qn) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
+  // non-rounding shifts - which are generally faster than rounding shifts
+  // on modern CPUs.
+  const int32_t vert_offset_bits =
+      (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1));
+  // For the averaging code path subtract round offset and convolve round.
+  const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits);
+  const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits);
+
+  int y_qn = subpel_y_qn;
+
+  if (w == 4) {
+    do {
+      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+      const ptrdiff_t filter_offset =
+          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
+
+      int16x4_t s0, s1, s2, s3, s4, s5;
+      load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);
+
+      int16x4_t d0 =
+          compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);
+
+      int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));
+
+      int16x4_t avg = vhadd_s16(dd0, d0);
+      int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0));
+
+      uint8x8_t d0_u8 = vqrshrun_n_s16(
+          d0_s16, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+
+      store_u8_4x1(dst8, d0_u8);
+
+      dst16 += dst16_stride;
+      dst8 += dst8_stride;
+      y_qn += y_step_qn;
+    } while (--h != 0);
+  } else {
+    do {
+      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+      const ptrdiff_t filter_offset =
+          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
+
+      int width = w;
+      uint8_t *dst8_ptr = dst8;
+      uint16_t *dst16_ptr = dst16;
+
+      do {
+        int16x8_t s0, s1, s2, s3, s4, s5;
+        load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);
+
+        int16x8_t d0 =
+            compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);
+
+        int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));
+
+        int16x8_t avg = vhaddq_s16(dd0, d0);
+
+        uint8x8_t d0_u8 = vqrshrun_n_s16(
+            avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+
+        vst1_u8(dst8_ptr, d0_u8);
+
+        s += 8;
+        dst8_ptr += 8;
+        dst16_ptr += 8;
+        width -= 8;
+      } while (width != 0);
+
+      dst16 += dst16_stride;
+      dst8 += dst8_stride;
+      y_qn += y_step_qn;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void compound_dist_wtd_convolve_vert_scale_6tap_neon(
+    const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
+    uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
+    ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  int y_qn = subpel_y_qn;
+  // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
+  // non-rounding shifts - which are generally faster than rounding shifts on
+  // modern CPUs.
+  const int32x4_t vert_offset =
+      vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
+  // For the weighted averaging code path we have to subtract round offset and
+  // convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS -
+  // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The
+  // additional shift by DIST_PRECISION_BITS is needed in order to merge two
+  // shift calculations into one.
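+  // Each power-of-two term below carries the extra DIST_PRECISION_BITS
+  // scaling, so the final vshrn_n_s32 can apply the compound rounding and
+  // the distance-weighting precision shift as one narrowing shift.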
+ const int32x4_t dist_wtd_offset = vdupq_n_s32( + (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 + + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS))); + const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset); + const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset); + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5; + load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + int16x4_t d0 = + compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); + + int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0); + dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0); + + int16x4_t d0_s16 = vshrn_n_s32( + dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0))); + + store_u8_4x1(dst8, d0_u8); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint8_t *dst8_ptr = dst8; + uint16_t *dst16_ptr = dst16; + + do { + int16x8_t s0, s1, s2, s3, s4, s5; + load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + int16x8_t d0 = + compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); + + int32x4_t dst_wtd_avg0 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0)); + int32x4_t dst_wtd_avg1 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0)); + + dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0)); + dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0)); + + int16x4_t d0_s16_0 = vshrn_n_s32( + dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + int16x4_t d0_s16_1 = vshrn_n_s32( + dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1)); + + vst1_u8(dst8_ptr, d0_u8); + + s += 8; + dst8_ptr += 8; + dst16_ptr += 8; + width -= 8; + } while (width != 0); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE uint8x8_t convolve6_4_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + // Filter values at indices 0 and 7 are 0. 
+ sum = vmlal_lane_s16(sum, s0, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); + + int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0))); +} + +static INLINE uint8x8_t convolve6_8_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + // Filter values at indices 0 and 7 are 0. + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); + + int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS); + int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res0, res1)); +} + +static INLINE void convolve_vert_scale_6tap_neon( + const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int round_1 = 2 * FILTER_BITS - ROUND0_BITS; + // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts. 
+  int32x4_t vert_offset =
+      vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1)));
+
+  int y_qn = subpel_y_qn;
+  if (w == 4) {
+    do {
+      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+      const ptrdiff_t filter_offset =
+          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
+
+      int16x4_t s0, s1, s2, s3, s4, s5;
+      load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);
+
+      uint8x8_t d = convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);
+
+      store_u8_4x1(dst, d);
+
+      dst += dst_stride;
+      y_qn += y_step_qn;
+    } while (--h != 0);
+  } else if (w == 8) {
+    do {
+      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+      const ptrdiff_t filter_offset =
+          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
+
+      int16x8_t s0, s1, s2, s3, s4, s5;
+      load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5);
+
+      uint8x8_t d = convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset);
+
+      vst1_u8(dst, d);
+
+      dst += dst_stride;
+      y_qn += y_step_qn;
+    } while (--h != 0);
+  } else {
+    do {
+      const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+      uint8_t *d = dst;
+      int width = w;
+
+      const ptrdiff_t filter_offset =
+          SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+      const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
+
+      do {
+        int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2];
+        load_s16_8x6(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0],
+                     &s5[0]);
+        load_s16_8x6(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1],
+                     &s5[1]);
+
+        uint8x8_t d0 = convolve6_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
+                                     filter, vert_offset);
+        uint8x8_t d1 = convolve6_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
+                                     filter, vert_offset);
+
+        vst1q_u8(d, vcombine_u8(d0, d1));
+
+        s += 16;
+        d += 16;
+        width -= 16;
+      } while (width != 0);
+
+      dst += dst_stride;
+      y_qn += y_step_qn;
+    } while (--h != 0);
+  }
+}
+
 #endif  // AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_
--
GitLab


From a88627a130a7b455cc14201d23bd11a75e038b5f Mon Sep 17 00:00:00 2001
From: Gerda Zsejke More <gerdazsejke.more@arm.com>
Date: Thu, 30 May 2024 16:32:07 +0200
Subject: [PATCH 182/391] Add Armv8.0 Neon 6-tap filter path for convolve_horiz_scale

The filter values used in the scaling algorithm are specified in the
documentation of the inter-prediction process (chapter 7.11.3.4. [1]).
These filter values are defined in the av1_interp_filter_params_list in
filter.h. An important characteristic of these filters, with the
exception of the MULTITAP_SHARP filter, is that the values at indices 0
and 7 are 0.

Add an implementation for horizontal filtering that specialises on
6-tap filters. This way we avoid the redundant work associated with
8-tap filters.

This approach is not applicable to the DotProd and I8MM versions of
this horizontal filtering, given that the dot product instructions
accumulate the result of 4 multiplications.
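The shape of the specialisation, as a scalar sketch (an editor
illustration, not patch code; the Neon kernels vectorise this with
vmlal_lane_s16):

    // With f[0] == f[7] == 0, an 8-tap convolution needs only six
    // multiplies, over source samples s[1]..s[6].
    static int convolve6(const int16_t *s, const int16_t *f) {
      return f[1] * s[1] + f[2] * s[2] + f[3] * s[3] + f[4] * s[4] +
             f[5] * s[5] + f[6] * s[6];
    }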
[1]https://aomediacodec.github.io/av1-spec/av1-spec.pdf Change-Id: I09fcc138289c3b9ee39099b6cdab740e7049260c --- av1/common/arm/av1_convolve_scale_neon.c | 184 +++++++++++++++++++++-- 1 file changed, 175 insertions(+), 9 deletions(-) diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c index a972a19ef5..88d126ead3 100644 --- a/av1/common/arm/av1_convolve_scale_neon.c +++ b/av1/common/arm/av1_convolve_scale_neon.c @@ -68,12 +68,12 @@ static INLINE int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1, return vshrq_n_s16(sum, ROUND0_BITS - 1); } -static INLINE void convolve_horiz_scale_neon(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride, - int w, int h, - const int16_t *x_filter, - const int subpel_x_qn, - const int x_step_qn) { +static INLINE void convolve_horiz_scale_8tap_neon(const uint8_t *src, + int src_stride, int16_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter, + const int subpel_x_qn, + const int x_step_qn) { DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); const int bd = 8; @@ -191,6 +191,166 @@ static INLINE void convolve_horiz_scale_neon(const uint8_t *src, int src_stride, } } +static INLINE int16x4_t convolve6_4_h(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, + const int32x4_t horiz_const) { + int16x4_t filter_lo = vget_low_s16(filter); + int16x4_t filter_hi = vget_high_s16(filter); + + int32x4_t sum = horiz_const; + // Filter values at indices 0 and 7 are 0. + sum = vmlal_lane_s16(sum, s0, filter_lo, 1); + sum = vmlal_lane_s16(sum, s1, filter_lo, 2); + sum = vmlal_lane_s16(sum, s2, filter_lo, 3); + sum = vmlal_lane_s16(sum, s3, filter_hi, 0); + sum = vmlal_lane_s16(sum, s4, filter_hi, 1); + sum = vmlal_lane_s16(sum, s5, filter_hi, 2); + + return vshrn_n_s32(sum, ROUND0_BITS); +} + +static INLINE int16x8_t convolve6_8_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, + const int16x8_t horiz_const) { + int16x4_t filter_lo = vget_low_s16(filter); + int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = horiz_const; + // Filter values at indices 0 and 7 are 0. + sum = vmlaq_lane_s16(sum, s0, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s3, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 2); + + // We halved the filter values so -1 from right shift. + return vshrq_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE void convolve_horiz_scale_6tap_neon(const uint8_t *src, + int src_stride, int16_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter, + const int subpel_x_qn, + const int x_step_qn) { + DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); + const int bd = 8; + + if (w == 4) { + // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. + const int32x4_t horiz_offset = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + + do { + int x_qn = subpel_x_qn; + + // Process a 4x4 tile. 
+ for (int r = 0; r < 4; ++r) { + const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(x_filter + filter_offset); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s3 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + int16x4_t d0 = + convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); + + vst1_s16(&temp[r * 4], d0); + x_qn += x_step_qn; + } + + // Transpose the 4x4 result tile and store. + int16x4_t d0, d1, d2, d3; + load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. + // The additional -1 is needed because we are halving the filter values. + const int16x8_t horiz_offset = + vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); + + do { + int x_qn = subpel_x_qn; + int16_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; ++r) { + const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + int16x8_t filter = vld1q_s16(x_filter + filter_offset); + // Filter values are all even so halve them to allow convolution + // kernel computations to stay in 16-bit element types. + filter = vshrq_n_s16(filter, 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, + &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int16x8_t d0 = + convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); + + vst1q_s16(&temp[r * 8], d0); + + x_qn += x_step_qn; + } + + // Transpose the 8x8 result tile and store. 
+        int16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+        load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+        transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+        store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      dst += 8 * dst_stride;
+      src += 8 * src_stride;
+      h -= 8;
+    } while (h > 0);
+  }
+}
+
 void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
@@ -222,9 +382,15 @@ void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride,
   const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride;
 
   // Horizontal filter
-  convolve_horiz_scale_neon(
-      src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
-      im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+  if (filter_params_x->interp_filter == MULTITAP_SHARP) {
+    convolve_horiz_scale_8tap_neon(
+        src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+        im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+  } else {
+    convolve_horiz_scale_6tap_neon(
+        src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+        im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+  }
 
   // Vertical filter
   if (filter_params_y->interp_filter == MULTITAP_SHARP) {
--
GitLab


From 893978644f37a05d28968bf066c17e6219a8a55e Mon Sep 17 00:00:00 2001
From: Gerda Zsejke More <gerdazsejke.more@arm.com>
Date: Tue, 14 May 2024 17:04:26 +0200
Subject: [PATCH 183/391] Add Armv8.0 Neon horiz 2x1 scale spec. impl for convolve_2d_scale

AV1 has a limit on the scale ratio; specifically, the reference
resolution cannot be more than 2 times the source resolution in any
dimension. Given that the algorithm uses higher precision (1/1024-pel)
for the step size (chapter 7.11.3.4. [1]), the horizontal scaling
function can be easily optimised for this specific case.

The indices of the source pixels to be interpolated are calculated
using the (subpel_qn + x * step) / 1024 equation, which can be
simplified if step is a multiple of 1024.

Add an implementation that specialises on x_step_qn equal to 2048,
which gives an uplift of around 33% when 2x1 scaling is applied.
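The simplification can be checked directly (editor sketch; the constants
SCALE_SUBPEL_BITS == 10 and SCALE_SUBPEL_MASK == 1023 are assumed from
av1/common/scale.h):

    #include <assert.h>
    // With x_step_qn == 2048 and subpel_x_qn < 1024, output sample x reads
    // source position 2 * x, and all samples share one filter phase.
    static void check_scale2_indices(int subpel_x_qn, int w) {
      assert(subpel_x_qn < 1024);
      for (int x = 0; x < w; ++x) {
        const int qn = subpel_x_qn + x * 2048;
        assert(qn >> 10 == 2 * x);           // source index
        assert((qn & 1023) == subpel_x_qn);  // filter phase is constant
      }
    }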
[1]https://aomediacodec.github.io/av1-spec/av1-spec.pdf Change-Id: I9127ca4e6b4188a4dabe4cfd416efe4d762b2e9f --- aom_dsp/arm/mem_neon.h | 22 ++ aom_dsp/arm/transpose_neon.h | 69 +++++ av1/common/arm/av1_convolve_scale_neon.c | 319 ++++++++++++++++++++++- 3 files changed, 403 insertions(+), 7 deletions(-) diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index b5deb9ca34..fa571a68c3 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h @@ -642,6 +642,28 @@ static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride, vst1_s16(s, s3); } +static INLINE void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride, + const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7) { + vst1_s16(s, s0); + s += dst_stride; + vst1_s16(s, s1); + s += dst_stride; + vst1_s16(s, s2); + s += dst_stride; + vst1_s16(s, s3); + s += dst_stride; + vst1_s16(s, s4); + s += dst_stride; + vst1_s16(s, s5); + s += dst_stride; + vst1_s16(s, s6); + s += dst_stride; + vst1_s16(s, s7); +} + static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride, const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3) { diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h index 8027018235..9fc4fb075a 100644 --- a/aom_dsp/arm/transpose_neon.h +++ b/aom_dsp/arm/transpose_neon.h @@ -325,6 +325,41 @@ static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, *a3 = vreinterpret_u8_u16(c1.val[1]); } +static INLINE void transpose_elems_inplace_u8_16x4(uint8x16_t *a0, + uint8x16_t *a1, + uint8x16_t *a2, + uint8x16_t *a3) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015 + // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115 + // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215 + // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114 + // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115 + // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314 + // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315 + + const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1); + const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 012 112 212 312 + // c0.val[1]: 02 12 22 32 06 16 26 36 09 19 29 39 013 113 213 313 + // c1.val[0]: 01 11 21 31 05 15 25 35 010 110 210 310 014 114 214 314 + // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315 + + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + + *a0 = vreinterpretq_u8_u16(c0.val[0]); + *a1 = vreinterpretq_u8_u16(c1.val[0]); + *a2 = vreinterpretq_u8_u16(c0.val[1]); + *a3 = vreinterpretq_u8_u16(c1.val[1]); +} + static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { // Swap 16 bit elements. Goes from: @@ -885,6 +920,40 @@ static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a, out[7] = d3.val[1]; } +static INLINE void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, + int16x8_t *a3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); + const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 01 11 21 31 05 15 25 35 + // c1.val[0]: 02 12 22 32 06 16 26 36 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + + *a0 = vreinterpretq_s16_s32(c0.val[0]); + *a1 = vreinterpretq_s16_s32(c1.val[0]); + *a2 = vreinterpretq_s16_s32(c0.val[1]); + *a3 = vreinterpretq_s16_s32(c1.val[1]); +} + static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0, uint16x4_t *a1, uint16x4_t *a2, diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c index 88d126ead3..114232d50a 100644 --- a/av1/common/arm/av1_convolve_scale_neon.c +++ b/av1/common/arm/av1_convolve_scale_neon.c @@ -351,6 +351,284 @@ static INLINE void convolve_horiz_scale_6tap_neon(const uint8_t *src, } } +static INLINE void convolve_horiz_scale_2_8tap_neon( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter) { + const int bd = 8; + + if (w == 4) { + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32x4_t horiz_offset = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const int16x8_t filter = vld1q_s16(x_filter); + + do { + uint8x16_t t0, t1, t2, t3; + load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3); + + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); + int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); + int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); + int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); + int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); + int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); + + int16x4_t s0 = vget_low_s16(tt0); + int16x4_t s1 = vget_low_s16(tt1); + int16x4_t s2 = vget_low_s16(tt2); + int16x4_t s3 = vget_low_s16(tt3); + int16x4_t s4 = vget_high_s16(tt0); + int16x4_t s5 = vget_high_s16(tt1); + int16x4_t s6 = vget_high_s16(tt2); + int16x4_t s7 = vget_high_s16(tt3); + int16x4_t s8 = vget_low_s16(tt4); + int16x4_t s9 = vget_low_s16(tt5); + int16x4_t s10 = vget_low_s16(tt6); + int16x4_t s11 = vget_low_s16(tt7); + int16x4_t s12 = vget_high_s16(tt4); + int16x4_t s13 = vget_high_s16(tt5); + + int16x4_t d0 = + convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); + int16x4_t d1 = + convolve8_4_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset); + int16x4_t d2 = + convolve8_4_h(s4, s5, s6, s7, s8, s9, s10, s11, filter, horiz_offset); + int16x4_t d3 = convolve8_4_h(s6, s7, s8, s9, s10, s11, s12, s13, filter, + horiz_offset); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + 
+ store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The additional -1 is needed because we are halving the filter values. + const int16x8_t horiz_offset = + vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); + // Filter values are all even so halve them to allow convolution + // kernel computations to stay in 16-bit element types. + const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1); + + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3, + &t4, &t5, &t6, &t7); + + s += 8; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + do { + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9, + &t10, &t11, &t12, &t13, &t14, &t15); + + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s15 = vreinterpretq_s16_u16(vmovl_u8(t15)); + + int16x8_t d0 = + convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); + int16x8_t d1 = + convolve8_8_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset); + int16x8_t d2 = convolve8_8_h(s4, s5, s6, s7, s8, s9, s10, s11, filter, + horiz_offset); + int16x8_t d3 = convolve8_8_h(s6, s7, s8, s9, s10, s11, s12, s13, filter, + horiz_offset); + + transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3); + + store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1), + vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0), + vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3)); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s7 = s15; + + s += 8; + d += 4; + width -= 4; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} + +static INLINE void convolve_horiz_scale_2_6tap_neon( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter) { + const int bd = 8; + + if (w == 4) { + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. 
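+    // i.e. compute (sum + (1 << (ROUND0_BITS - 1))) >> ROUND0_BITS with
+    // the rounding constant folded into the accumulator up front.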
+ const int32x4_t horiz_offset = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const int16x8_t filter = vld1q_s16(x_filter); + + do { + uint8x16_t t0, t1, t2, t3; + load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3); + + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); + int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); + int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); + int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); + int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); + int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); + + int16x4_t s0 = vget_low_s16(tt0); + int16x4_t s1 = vget_low_s16(tt1); + int16x4_t s2 = vget_low_s16(tt2); + int16x4_t s3 = vget_high_s16(tt3); + int16x4_t s4 = vget_high_s16(tt0); + int16x4_t s5 = vget_high_s16(tt1); + int16x4_t s6 = vget_high_s16(tt2); + int16x4_t s7 = vget_low_s16(tt4); + int16x4_t s8 = vget_low_s16(tt5); + int16x4_t s9 = vget_low_s16(tt6); + int16x4_t s10 = vget_low_s16(tt7); + int16x4_t s11 = vget_high_s16(tt4); + + int16x4_t d0 = + convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); + int16x4_t d1 = + convolve6_4_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset); + int16x4_t d2 = + convolve6_4_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset); + int16x4_t d3 = + convolve6_4_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The additional -1 is needed because we are halving the filter values. + const int16x8_t horiz_offset = + vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); + // Filter values are all even so halve them to allow convolution + // kernel computations to stay in 16-bit element types. 
+ const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1); + + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3, + &t4, &t5, &t6, &t7); + + s += 8; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + do { + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9, + &t10, &t11, &t12, &t13, &t14, &t15); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15)); + + int16x8_t d0 = + convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); + int16x8_t d1 = + convolve6_8_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset); + int16x8_t d2 = + convolve6_8_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset); + int16x8_t d3 = + convolve6_8_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset); + + transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3); + + store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1), + vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0), + vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3)); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + + s += 8; + d += 4; + width -= 4; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} + void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -382,14 +660,41 @@ void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; // Horizontal filter - if (filter_params_x->interp_filter == MULTITAP_SHARP) { - convolve_horiz_scale_8tap_neon( - src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, - im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + + if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) { + if (filter_params_x->interp_filter == MULTITAP_SHARP) { + convolve_horiz_scale_8tap_neon( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + } else { + convolve_horiz_scale_6tap_neon( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + } } else { - convolve_horiz_scale_6tap_neon( - src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, - im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS)); + // The filter index is calculated using the + // 
((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS
+    // equation, where the values of x are from 0 to w. If x_step_qn is a
+    // multiple of (SCALE_SUBPEL_MASK + 1) we can leave it out of the
+    // equation.
+    const ptrdiff_t filter_offset =
+        SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+    const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset;
+
+    // The source index is calculated using the (subpel_x_qn + x * x_step_qn)
+    // >> SCALE_SUBPEL_BITS equation, where the values of x are from 0 to w.
+    // If subpel_x_qn < (1 << SCALE_SUBPEL_BITS) and x_step_qn % (1 <<
+    // SCALE_SUBPEL_BITS) == 0, the source index can be determined using the
+    // value x * (x_step_qn / (1 << SCALE_SUBPEL_BITS)).
+    if (filter_params_x->interp_filter == MULTITAP_SHARP) {
+      convolve_horiz_scale_2_8tap_neon(src - horiz_offset - vert_offset,
+                                       src_stride, im_block, im_stride, w, im_h,
+                                       x_filter);
+    } else {
+      convolve_horiz_scale_2_6tap_neon(src - horiz_offset - vert_offset,
+                                       src_stride, im_block, im_stride, w, im_h,
+                                       x_filter);
+    }
   }
 
   // Vertical filter
--
GitLab


From 11c031d7537c9def9f9bda5819264fa51081d23f Mon Sep 17 00:00:00 2001
From: Gerda Zsejke More <gerdazsejke.more@arm.com>
Date: Fri, 31 May 2024 12:13:05 +0200
Subject: [PATCH 184/391] Add Neon Dotprod horiz 2x1 scale spec. impl for convolve_2d_scale

AV1 has a limit on the scale ratio; specifically, the reference
resolution cannot be more than 2 times the source resolution in any
dimension. Given that the algorithm uses higher precision (1/1024-pel)
for the step size (chapter 7.11.3.4. [1]), the horizontal scaling
function can be easily optimised for this specific case.

The indices of the source pixels to be interpolated are calculated
using the (subpel_qn + x * step) / 1024 equation, which can be
simplified if step is a multiple of 1024.

Add an implementation that specialises on x_step_qn equal to 2048,
which gives an uplift of around 30% when 2x1 scaling is applied.

[1]https://aomediacodec.github.io/av1-spec/av1-spec.pdf

Change-Id: I66ed594d9ef6afb114de6a8a2ead8af025d84017
---
 .../arm/av1_convolve_scale_neon_dotprod.c     | 167 +++++++++++++++++-
 1 file changed, 164 insertions(+), 3 deletions(-)

diff --git a/av1/common/arm/av1_convolve_scale_neon_dotprod.c b/av1/common/arm/av1_convolve_scale_neon_dotprod.c
index a6f87f5144..7c8eaa7b44 100644
--- a/av1/common/arm/av1_convolve_scale_neon_dotprod.c
+++ b/av1/common/arm/av1_convolve_scale_neon_dotprod.c
@@ -19,6 +19,13 @@
 #include "aom_dsp/arm/transpose_neon.h"
 #include "av1/common/arm/convolve_scale_neon.h"
 
+// clang-format off
+DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = {
+  0, 1, 2, 3, 2, 3, 4, 5, 4, 5,  6,  7,  6,  7,  8,  9,
+  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+};
+// clang-format on
+
 static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1,
                                       const uint8x8_t s2, const uint8x8_t s3,
                                       const int8x8_t filter,
@@ -173,6 +180,140 @@ static INLINE void convolve_horiz_scale_neon_dotprod(
   }
 }
 
+static INLINE int16x4_t convolve8_4_h_scale_2(uint8x16_t samples,
+                                              const int8x8_t filters,
+                                              const int32x4_t horiz_const,
+                                              const uint8x16x2_t permute_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128 =
+      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+  // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + int32x4_t sum = vdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); + + // We halved the filter values so -1 from right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], + const int8x8_t filters, + const int32x4_t horiz_const, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples0_128 = + vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))); + int8x16_t samples1_128 = + vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } + int8x16_t perm_samples[4] = { vqtbl1q_s8(samples0_128, permute_tbl.val[0]), + vqtbl1q_s8(samples0_128, permute_tbl.val[1]), + vqtbl1q_s8(samples1_128, permute_tbl.val[0]), + vqtbl1q_s8(samples1_128, permute_tbl.val[1]) }; + + // First 4 output values. + int32x4_t sum0123 = vdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); + sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum4567 = vdotq_lane_s32(horiz_const, perm_samples[2], filters, 0); + sum4567 = vdotq_lane_s32(sum4567, perm_samples[3], filters, 1); + + // We halved the filter values so -1 from right shift. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_horiz_scale_2_neon_dotprod( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter) { + const int bd = 8; + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_offset = + (1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)); + // The shim of 128 << FILTER_BITS is needed because we are subtracting 128 + // from every source value. + const int32_t dotprod_offset = 128 << FILTER_BITS; + // Divide the total by 2 because we halved the filter values. + const int32x4_t horiz_offset_vec = + vdupq_n_s32((horiz_offset + dotprod_offset) >> 1); + + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl); + // Filter values are all even so halve them to fit in int8_t. 
+ const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter), 1); + + if (w == 4) { + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve8_4_h_scale_2(s0, filter, horiz_offset_vec, permute_tbl); + int16x4_t d1 = + convolve8_4_h_scale_2(s1, filter, horiz_offset_vec, permute_tbl); + int16x4_t d2 = + convolve8_4_h_scale_2(s2, filter, horiz_offset_vec, permute_tbl); + int16x4_t d3 = + convolve8_4_h_scale_2(s3, filter, horiz_offset_vec, permute_tbl); + + store_s16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 4; + width -= 4; + } while (width != 0); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = + convolve8_8_h_scale_2(s0, filter, horiz_offset_vec, permute_tbl); + int16x8_t d1 = + convolve8_8_h_scale_2(s1, filter, horiz_offset_vec, permute_tbl); + int16x8_t d2 = + convolve8_8_h_scale_2(s2, filter, horiz_offset_vec, permute_tbl); + int16x8_t d3 = + convolve8_8_h_scale_2(s3, filter, horiz_offset_vec, permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 16; + d += 8; + width -= 8; + } while (width != 0); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } +} + void av1_convolve_2d_scale_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -203,9 +344,29 @@ void av1_convolve_2d_scale_neon_dotprod( const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; // Horizontal filter - convolve_horiz_scale_neon_dotprod( - src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, - im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) { + convolve_horiz_scale_neon_dotprod( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + } else { + assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS)); + // The filter index is calculated using the + // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS + // equation, where the values of x are from 0 to w. If x_step_qn is a + // multiple of SCALE_SUBPEL_MASK we can leave it out of the equation. + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset; + + // The source index is calculated using the (subpel_x_qn + x * x_step_qn) >> + // SCALE_SUBPEL_BITS, where the values of x are from 0 to w. If subpel_x_qn + // < (1 << SCALE_SUBPEL_BITS) and x_step_qn % (1 << SCALE_SUBPEL_BITS) == 0, + // the source index can be determined using the value x * (x_step_qn / + // (1 << SCALE_SUBPEL_BITS)). 
+    convolve_horiz_scale_2_neon_dotprod(src - horiz_offset - vert_offset,
+                                        src_stride, im_block, im_stride, w,
+                                        im_h, x_filter);
+  }
 
   // Vertical filter
   if (filter_params_y->interp_filter == MULTITAP_SHARP) {
-- 
GitLab


From 96958d1ecc073f7deb84c99e675437a7a6457566 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Tue, 4 Jun 2024 11:30:47 -0700
Subject: [PATCH 185/391] Use round for RC calculations in cyclic_refresh

This fixes the mismatch found in test behavior between Mac Pro (Arm)
and Linux x86_64.

Bug: aomedia:3579
Change-Id: I9ad2fc0c43edae4b9505d7a46336820a72ff6442
---
 av1/encoder/aq_cyclicrefresh.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index 1aa8dde323..2ef6cba698 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -103,15 +103,15 @@
     weight_segment2 = 0;
   }
   // Take segment weighted average for estimated bits.
-  const int estimated_bits =
-      (int)((1.0 - weight_segment1 - weight_segment2) *
-                av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) +
-            weight_segment1 *
-                av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1],
-                                       correction_factor) +
-            weight_segment2 *
-                av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2],
-                                       correction_factor));
+  const int estimated_bits = (int)round(
+      (1.0 - weight_segment1 - weight_segment2) *
+          av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) +
+      weight_segment1 *
+          av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1],
+                                 correction_factor) +
+      weight_segment2 *
+          av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2],
+                                 correction_factor));
   return estimated_bits;
 }
 
@@ -139,13 +139,13 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
   int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
   const int accurate_estimate = cpi->sf.hl_sf.accurate_bit_estimate;
   // Take segment weighted average for bits per mb.
-  bits_per_mb =
-      (int)((1.0 - weight_segment) *
-                av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i,
-                                   correction_factor, accurate_estimate) +
-            weight_segment * av1_rc_bits_per_mb(
-                                 cpi, cm->current_frame.frame_type, i + deltaq,
-                                 correction_factor, accurate_estimate));
+  bits_per_mb = (int)round(
+      (1.0 - weight_segment) *
+          av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i,
+                             correction_factor, accurate_estimate) +
+      weight_segment * av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type,
+                                          i + deltaq, correction_factor,
+                                          accurate_estimate));
   return bits_per_mb;
 }
 
-- 
GitLab


From 065f9d377a532e514cd5027d9e12bcb8281758a4 Mon Sep 17 00:00:00 2001
From: Gerda Zsejke More <gerdazsejke.more@arm.com>
Date: Fri, 31 May 2024 12:26:26 +0200
Subject: [PATCH 186/391] Add Neon I8MM horiz 2x1 scale spec. impl for
 convolve_2d_scale

AV1 has a limit on the scale ratio; specifically, the reference
resolution cannot be more than 2 times the source resolution in any
dimension. Given that the algorithm uses higher precision (1/1024-pel)
for the step size (section 7.11.3.4 [1]), the horizontal scaling
function can be easily optimised for this specific case.

The indices of the source pixels to be interpolated are calculated
using the (subpel_qn + x * step) >> SCALE_SUBPEL_BITS equation (a
division by 1024), which can be simplified if step is a multiple of
1024.
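As a sketch of the arithmetic (all quantities in 1/1024-pel units, so
SCALE_SUBPEL_BITS == 10 and step == 2048 for the 2x1 case):

  src_index = (subpel_qn + x * step) >> SCALE_SUBPEL_BITS
            = (subpel_qn >> SCALE_SUBPEL_BITS) + 2 * x  // step == 2 * 1024
            = 2 * x                                     // subpel_qn < 1024

so the filter phase is constant across the row and the kernel only has
to read every second source pixel.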
Add an implementation specialised for the case where x_step_qn equals
2048, which gives an uplift of around 28% when 2x1 scaling is applied.

[1] https://aomediacodec.github.io/av1-spec/av1-spec.pdf

Change-Id: I87b0730d0a75c534813f154555cbdb473b445438
---
 av1/common/arm/av1_convolve_scale_neon_i8mm.c | 155 +++++++++++++++++-
 1 file changed, 152 insertions(+), 3 deletions(-)

diff --git a/av1/common/arm/av1_convolve_scale_neon_i8mm.c b/av1/common/arm/av1_convolve_scale_neon_i8mm.c
index ab215ca637..42de38f461 100644
--- a/av1/common/arm/av1_convolve_scale_neon_i8mm.c
+++ b/av1/common/arm/av1_convolve_scale_neon_i8mm.c
@@ -19,6 +19,13 @@
 #include "aom_dsp/arm/transpose_neon.h"
 #include "av1/common/arm/convolve_scale_neon.h"
 
+// clang-format off
+DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = {
+  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
+  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+};
+// clang-format on
+
 static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1,
                                       const uint8x8_t s2, const uint8x8_t s3,
                                       const int8x8_t filter,
@@ -160,6 +167,128 @@ static INLINE void convolve_horiz_scale_neon_i8mm(const uint8_t *src,
   }
 }
 
+static INLINE int16x4_t convolve8_4_h_scale_2(uint8x16_t samples,
+                                              const int8x8_t filters,
+                                              const int32x4_t horiz_const,
+                                              const uint8x16x2_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }
+  // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 }
+  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+  int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0);
+  sum = vusdotq_lane_s32(sum, perm_samples[1], filters, 1);
+
+  // We halved the filter values so -1 from right shift.
+  return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2],
+                                              const int8x8_t filters,
+                                              const int32x4_t horiz_const,
+                                              const uint8x16x2_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }
+  // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 }
+  uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]),
+                                 vqtbl1q_u8(samples[0], permute_tbl.val[1]),
+                                 vqtbl1q_u8(samples[1], permute_tbl.val[0]),
+                                 vqtbl1q_u8(samples[1], permute_tbl.val[1]) };
+
+  // First 4 output values.
+  int32x4_t sum0123 =
+      vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0);
+  sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filters, 1);
+
+  // Second 4 output values.
+  int32x4_t sum4567 =
+      vusdotq_lane_s32(horiz_const, perm_samples[2], filters, 0);
+  sum4567 = vusdotq_lane_s32(sum4567, perm_samples[3], filters, 1);
+
+  // We halved the filter values so -1 from right shift.
+  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
+                      vshrn_n_s32(sum4567, ROUND0_BITS - 1));
+}
+
+static INLINE void convolve_horiz_scale_2_neon_i8mm(
+    const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w,
+    int h, const int16_t *x_filter) {
+  const int bd = 8;
+  // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+  // shifts - which are generally faster than rounding shifts on modern CPUs.
+  // The additional -1 is needed because we are halving the filter values.
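+  // That is, the vector constant below equals
+  // ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) >> 1.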
+  const int32x4_t horiz_offset =
+      vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
+
+  const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl);
+  // Filter values are all even so halve them to fit in int8_t.
+  const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter), 1);
+
+  if (w == 4) {
+    do {
+      const uint8_t *s = src;
+      int16_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        int16x4_t d0 =
+            convolve8_4_h_scale_2(s0, filter, horiz_offset, permute_tbl);
+        int16x4_t d1 =
+            convolve8_4_h_scale_2(s1, filter, horiz_offset, permute_tbl);
+        int16x4_t d2 =
+            convolve8_4_h_scale_2(s2, filter, horiz_offset, permute_tbl);
+        int16x4_t d3 =
+            convolve8_4_h_scale_2(s3, filter, horiz_offset, permute_tbl);
+
+        store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 4;
+        width -= 4;
+      } while (width != 0);
+
+      dst += 4 * dst_stride;
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      int16_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0[2], s1[2], s2[2], s3[2];
+        load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+        load_u8_16x4(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+        int16x8_t d0 =
+            convolve8_8_h_scale_2(s0, filter, horiz_offset, permute_tbl);
+        int16x8_t d1 =
+            convolve8_8_h_scale_2(s1, filter, horiz_offset, permute_tbl);
+        int16x8_t d2 =
+            convolve8_8_h_scale_2(s2, filter, horiz_offset, permute_tbl);
+        int16x8_t d3 =
+            convolve8_8_h_scale_2(s3, filter, horiz_offset, permute_tbl);
+
+        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 16;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      dst += 4 * dst_stride;
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
 void av1_convolve_2d_scale_neon_i8mm(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_x,
@@ -191,9 +320,29 @@ void av1_convolve_2d_scale_neon_i8mm(const uint8_t *src, int src_stride,
   const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride;
 
   // Horizontal filter
-  convolve_horiz_scale_neon_i8mm(
-      src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
-      im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+  if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) {
+    convolve_horiz_scale_neon_i8mm(
+        src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+        im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+  } else {
+    assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS));
+    // The filter index is calculated using the
+    // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS
+    // equation, where the values of x are from 0 to w. If x_step_qn is a
+    // multiple of SCALE_SUBPEL_MASK + 1 we can leave it out of the equation.
+    const ptrdiff_t filter_offset =
+        SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+    const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset;
+
+    // The source index is calculated using the
+    // (subpel_x_qn + x * x_step_qn) >> SCALE_SUBPEL_BITS equation, where the
+    // values of x are from 0 to w. If subpel_x_qn < (1 << SCALE_SUBPEL_BITS)
+    // and x_step_qn % (1 << SCALE_SUBPEL_BITS) == 0, the source index can be
+    // determined using the value x * (x_step_qn / (1 << SCALE_SUBPEL_BITS)).
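+    // For the only remaining case, x_step_qn == 2048, the source index
+    // therefore collapses to 2 * x: the specialised kernel below reads every
+    // second source pixel and applies a single fixed filter phase.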
+    convolve_horiz_scale_2_neon_i8mm(src - horiz_offset - vert_offset,
+                                     src_stride, im_block, im_stride, w, im_h,
+                                     x_filter);
+  }
 
   // Vertical filter
   if (filter_params_y->interp_filter == MULTITAP_SHARP) {
-- 
GitLab


From 52f28ccd2eb4b11712cf1590058621f59961d0f3 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 17 May 2024 19:47:50 -0700
Subject: [PATCH 187/391] blend_a64_mask_sse4.c: use xx_loadu_2x64 for
 unaligned loads

This quiets some undefined behavior sanitizer warnings related to
unaligned loads; register/code reordering with gcc-13, no change with
clang-16.

Bug: b:300649160
Change-Id: Ibf6c41448756fcadaaeae9b26f7ca25e6cec0f58
---
 aom_dsp/x86/blend_a64_mask_sse4.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index 58a7345ec2..9a10e86ae5 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -1120,14 +1120,12 @@ static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1(
     const __m128i *clip_low, const __m128i *clip_high,
     const __m128i *mask_max) {
   // Load 4 pixels from each of 4 rows from each source
-  const __m128i s0a =
-      _mm_set_epi64x(*(int64_t *)src0, *(int64_t *)(src0 + src0_stride));
-  const __m128i s0b = _mm_set_epi64x(*(int64_t *)(src0 + 2 * src0_stride),
-                                     *(int64_t *)(src0 + 3 * src0_stride));
-  const __m128i s1a =
-      _mm_set_epi64x(*(int64_t *)(src1), *(int64_t *)(src1 + src1_stride));
-  const __m128i s1b = _mm_set_epi64x(*(int64_t *)(src1 + 2 * src1_stride),
-                                     *(int64_t *)(src1 + 3 * src1_stride));
+  const __m128i s0a = xx_loadu_2x64(src0, src0 + src0_stride);
+  const __m128i s0b =
+      xx_loadu_2x64(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
+  const __m128i s1a = xx_loadu_2x64(src1, src1 + src1_stride);
+  const __m128i s1b =
+      xx_loadu_2x64(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
 
   // Generate the inverse masks
   const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
-- 
GitLab


From 5ba009b7a4b44435aa3a44f98fbb11c91ed3aa8e Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Tue, 4 Jun 2024 12:36:13 -0700
Subject: [PATCH 188/391] av1_resize_horz_dir,cosmetics: normalize param name

filteredlength -> filtered_length

This matches the implementations and quiets a clang-tidy warning.
Change-Id: Ifb6721e827db14d389e0a1c35599317a55ee057a --- av1/common/av1_rtcd_defs.pl | 2 +- test/frame_resize_test.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index e27613a19f..a24d3a953f 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -557,7 +557,7 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; specialize qw/av1_resize_vert_dir sse2 avx2/; -add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2"; +add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filtered_length, int width2"; specialize qw/av1_resize_horz_dir sse2 avx2/; add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc index 9145803891..cff353a294 100644 --- a/test/frame_resize_test.cc +++ b/test/frame_resize_test.cc @@ -168,7 +168,7 @@ INSTANTIATE_TEST_SUITE_P( typedef void (*LowBDResize_x_Func)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, - int filteredlength, int width2); + int filtered_length, int width2); typedef tuple<LowBDResize_x_Func, FrameDimension> Resize_x_TestParams; -- GitLab From b930f29247cd68d1f4dca7789544a162b2ac358c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 4 Jun 2024 16:47:06 -0700 Subject: [PATCH 189/391] Include "config/aom_config.h" Fix the following ClangTidy misc-include-cleaner warnings: no header providing "HAVE_NEON_DOTPROD" is directly included no header providing "HAVE_NEON_I8MM" is directly included Change-Id: Id218cfa81117045d3606697929cf7224279c5c13 --- test/av1_convolve_scale_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc index c7debe27ab..b6458b0ef9 100644 --- a/test/av1_convolve_scale_test.cc +++ b/test/av1_convolve_scale_test.cc @@ -14,6 +14,7 @@ #include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_ports/aom_timer.h" -- GitLab From bdeca636bd53eb53f76e2db7c9e2664d4bd7b88d Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 4 Jun 2024 16:57:48 -0700 Subject: [PATCH 190/391] Fix ClangTidy misc-include-cleaner warnings Fix the following warnings: no header providing "ROUND0_BITS" is directly included no header providing "uint8_t" is directly included no header providing "int16_t" is directly included no header providing "DECLARE_ALIGNED" is directly included no header providing "int32_t" is directly included no header providing "FILTER_BITS" is directly included no header providing "SCALE_SUBPEL_BITS" is directly included no header providing "ptrdiff_t" is directly included no header providing "SUBPEL_TAPS" is directly included no header providing "SCALE_SUBPEL_MASK" is directly included no header providing "SCALE_EXTRA_BITS" is directly included no header providing "InterpFilterParams" is directly included no 
header providing "ConvolveParams" is directly included
no header providing "MAX_SB_SIZE" is directly included
no header providing "MAX_FILTER_TAP" is directly included
no header providing "CONV_BUF_TYPE" is directly included
no header providing "UNLIKELY" is directly included

Change-Id: I2e59ef429e629f5c72b44ace3aaa2761982e4ae7
---
 av1/common/arm/av1_convolve_scale_neon_dotprod.c | 8 ++++++++
 av1/common/arm/av1_convolve_scale_neon_i8mm.c    | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/av1/common/arm/av1_convolve_scale_neon_dotprod.c b/av1/common/arm/av1_convolve_scale_neon_dotprod.c
index 7c8eaa7b44..70ae88cf1f 100644
--- a/av1/common/arm/av1_convolve_scale_neon_dotprod.c
+++ b/av1/common/arm/av1_convolve_scale_neon_dotprod.c
@@ -11,13 +11,21 @@
 
 #include <assert.h>
 #include <arm_neon.h>
+#include <stddef.h>
+#include <stdint.h>
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
 #include "av1/common/arm/convolve_scale_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/enums.h"
+#include "av1/common/filter.h"
 
 // clang-format off
 DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = {
diff --git a/av1/common/arm/av1_convolve_scale_neon_i8mm.c b/av1/common/arm/av1_convolve_scale_neon_i8mm.c
index 42de38f461..fe94c84f3e 100644
--- a/av1/common/arm/av1_convolve_scale_neon_i8mm.c
+++ b/av1/common/arm/av1_convolve_scale_neon_i8mm.c
@@ -11,13 +11,21 @@
 
 #include <assert.h>
 #include <arm_neon.h>
+#include <stddef.h>
+#include <stdint.h>
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
 #include "av1/common/arm/convolve_scale_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/enums.h"
+#include "av1/common/filter.h"
 
 // clang-format off
 DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = {
-- 
GitLab


From 2badbae61d6ee0996152d640f1e1896c0a0548ab Mon Sep 17 00:00:00 2001
From: Vignesh Venkat <vigneshv@google.com>
Date: Tue, 4 Jun 2024 14:20:00 -0700
Subject: [PATCH 191/391] av1_cx_iface: Do not require timebase.num <
 timebase.den

This has been in libvpx for a while now.
VP9: https://chromium-review.googlesource.com/c/webm/libvpx/+/332449
VP8: https://chromium-review.googlesource.com/c/webm/libvpx/+/274107

Generally, we set the timebase to be the inverse of the desired frame
rate, so this change allows us to express that using the timebase
(e.g. a frame rate of 0.5 fps becomes a timebase of 2/1).

Tested with an input video of 0.5 fps: the output is as intended with
this change (and fails without it).
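A minimal sketch of the configuration this enables (using only public
fields of aom_codec_enc_cfg_t; error handling omitted):

  aom_codec_enc_cfg_t cfg;
  aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
                               AOM_USAGE_GOOD_QUALITY);
  // One frame every two seconds, i.e. a 0.5 fps frame rate. num > den
  // was previously rejected by validate_config().
  cfg.g_timebase.num = 2;
  cfg.g_timebase.den = 1;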
Bug: aomedia:3580 Change-Id: I296620f7625824ff460aa503388ed821c7e6ddd0 --- av1/av1_cx_iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index c26f2aafb0..690d95927a 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -671,7 +671,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, ERROR("max_frame_area out of range [..2^30]"); } RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); - RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); + RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1); RANGE_CHECK_HI(cfg, rc_target_bitrate, 2000000); -- GitLab From a04797c8990cfe10143708a92eac53d1d0407821 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Wed, 5 Jun 2024 13:50:25 -0700 Subject: [PATCH 192/391] Use the AOMMIN macro Change-Id: Ica4500ca06fcfd2f376973e9530649af58906c1c --- av1/encoder/pass2_strategy.c | 6 +++--- av1/encoder/ratectrl.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c index 6b63afc399..eca49c0621 100644 --- a/av1/encoder/pass2_strategy.c +++ b/av1/encoder/pass2_strategy.c @@ -18,8 +18,10 @@ /*! @} - end defgroup gf_group_algo */ #include <assert.h> +#include <limits.h> #include <stdint.h> +#include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" @@ -3408,9 +3410,7 @@ static int get_section_target_bandwidth(AV1_COMP *cpi) { section_target_bandwidth = rc->avg_frame_bandwidth; else { section_target_bandwidth = twopass->bits_left / frames_left; - section_target_bandwidth = (section_target_bandwidth < INT_MAX) - ? section_target_bandwidth - : INT_MAX; + section_target_bandwidth = AOMMIN(section_target_bandwidth, INT_MAX); } return (int)section_target_bandwidth; } diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 148334aa8a..8060a8ba2f 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -2641,7 +2641,7 @@ static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { AOMMIN(fast_extra_bits, AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8)); #endif - fast_extra_bits = (fast_extra_bits < INT_MAX) ? fast_extra_bits : INT_MAX; + fast_extra_bits = AOMMIN(fast_extra_bits, INT_MAX); if (fast_extra_bits > 0) { // Update frame_target only if additional bits are available from // local undershoot. @@ -2656,7 +2656,7 @@ static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { } // Clamp the target for the frame to the maximum allowed for one frame. - *this_frame_target = (int)((frame_target < INT_MAX) ? frame_target : INT_MAX); + *this_frame_target = (int)AOMMIN(frame_target, INT_MAX); } void av1_set_target_rate(AV1_COMP *cpi, int width, int height) { -- GitLab From 0078f3e0bf01e55ef5942192ce827cc941dbf78c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 3 Jun 2024 12:49:06 -0700 Subject: [PATCH 193/391] Move the SSSE3 code in variance_sse2.c to new file Name the new file variance_ssse3.c. variance_sse2.c is now SSE2 only. 
Bug: aomedia:3578 Change-Id: I9d75b6617dac011de7dcd0366fb9f8e5999ec133 --- aom_dsp/aom_dsp.cmake | 1 + aom_dsp/x86/variance_sse2.c | 198 --------------------------------- aom_dsp/x86/variance_ssse3.c | 208 +++++++++++++++++++++++++++++++++++ 3 files changed, 209 insertions(+), 198 deletions(-) create mode 100644 aom_dsp/x86/variance_ssse3.c diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index 6d8e5a961b..750df42641 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -267,6 +267,7 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/variance_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c index e71244f1c7..610695af97 100644 --- a/aom_dsp/x86/variance_sse2.c +++ b/aom_dsp/x86/variance_sse2.c @@ -403,204 +403,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, return *sse; } -// The 2 unused parameters are place holders for PIC enabled build. -// These definitions are for functions defined in subpel_variance.asm -#define DECL(w, opt) \ - int aom_sub_pixel_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ - void *unused0, void *unused) -#define DECLS(opt) \ - DECL(4, opt); \ - DECL(8, opt); \ - DECL(16, opt) - -#if HAVE_SSSE3 -DECLS(ssse3); -#endif -#undef DECLS -#undef DECL - -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - /*Avoid overflow in helper by capping height.*/ \ - const int hf = AOMMIN(h, 64); \ - unsigned int sse = 0; \ - int se = 0; \ - for (int i = 0; i < (w / wf); ++i) { \ - const uint8_t *src_ptr = src; \ - const uint8_t *dst_ptr = dst; \ - for (int j = 0; j < (h / hf); ++j) { \ - unsigned int sse2; \ - const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ - src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ - &sse2, NULL, NULL); \ - dst_ptr += hf * dst_stride; \ - src_ptr += hf * src_stride; \ - se += se2; \ - sse += sse2; \ - } \ - src += wf; \ - dst += wf; \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ - } - -#if !CONFIG_REALTIME_ONLY -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ - FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ - FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ - FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ - FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ - FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ - FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ - 
FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ - FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ - FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ - FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) -#else -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ - FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ - FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ - FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ - FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ - FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ - FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) -#endif - -#if HAVE_SSSE3 -FNS(ssse3) -#endif - -#undef FNS -#undef FN - -// The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ - int aom_sub_pixel_avg_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ - void *unused) -#define DECLS(opt) \ - DECL(4, opt); \ - DECL(8, opt); \ - DECL(16, opt) - -#if HAVE_SSSE3 -DECLS(ssse3); -#endif -#undef DECL -#undef DECLS - -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ - const uint8_t *sec) { \ - /*Avoid overflow in helper by capping height.*/ \ - const int hf = AOMMIN(h, 64); \ - unsigned int sse = 0; \ - int se = 0; \ - for (int i = 0; i < (w / wf); ++i) { \ - const uint8_t *src_ptr = src; \ - const uint8_t *dst_ptr = dst; \ - const uint8_t *sec_ptr = sec; \ - for (int j = 0; j < (h / hf); ++j) { \ - unsigned int sse2; \ - const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ - sec_ptr, w, hf, &sse2, NULL, NULL); \ - dst_ptr += hf * dst_stride; \ - src_ptr += hf * src_stride; \ - sec_ptr += hf * w; \ - se += se2; \ - sse += sse2; \ - } \ - src += wf; \ - dst += wf; \ - sec += wf; \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ - } - -#if !CONFIG_REALTIME_ONLY -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ - FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ - FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ - FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ - FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ - FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ - FN(4, 4, 4, 2, 2, opt, 
(uint32_t), (int32_t)) \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ - FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ - FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ - FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ - FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) -#else -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ - FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ - FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ - FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ - FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ - FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ - FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) -#endif - -#if HAVE_SSSE3 -FNS(ssse3) -#endif - -#undef FNS -#undef FN - static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, const __m128i s1, const __m128i a) { diff --git a/aom_dsp/x86/variance_ssse3.c b/aom_dsp/x86/variance_ssse3.c new file mode 100644 index 0000000000..d616f43fdf --- /dev/null +++ b/aom_dsp/x86/variance_ssse3.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stddef.h> +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" + +// The 2 unused parameters are place holders for PIC enabled build. 
+// These definitions are for functions defined in subpel_variance.asm +#define DECL(w, opt) \ + int aom_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ + void *unused0, void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#if !CONFIG_REALTIME_ONLY +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) +#else +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) +#endif + +FNS(ssse3) + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. 
+#define DECL(w, opt) \ + int aom_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#if !CONFIG_REALTIME_ONLY +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) +#else +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) +#endif + +FNS(ssse3) + +#undef FNS 
+#undef FN -- GitLab From 525f35443198f0ffde71ad6e1a447e33e52c7772 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 29 Apr 2024 13:29:14 -0700 Subject: [PATCH 194/391] Ensure thread stack size is at least 256 KB Fixes cases like musl where the default is lower: https://wiki.musl-libc.org/functional-differences-from-glibc.html#Thread-stack-size Bug: aomedia:2754, aomedia:3567 Change-Id: Ia6e211f9b87bc2efe376e7b9f4adb11741850b18 (cherry picked from commit ad5fd34ad9058384a55196f66e2001cc8c2c523f) --- aom_util/aom_thread.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/aom_util/aom_thread.c b/aom_util/aom_thread.c index fa3b0a25e4..14f19e59cd 100644 --- a/aom_util/aom_thread.c +++ b/aom_util/aom_thread.c @@ -152,16 +152,18 @@ static int reset(AVxWorker *const worker) { // See: https://crbug.com/aomedia/3379 #if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \ !defined(NDEBUG) + const size_t kMinStackSize = 1024 * 1024; +#else + const size_t kMinStackSize = 256 * 1024; +#endif size_t stacksize; if (!pthread_attr_getstacksize(&attr, &stacksize)) { - const size_t kMinStackSize = 1 << 20; // 1 MiB if (stacksize < kMinStackSize && pthread_attr_setstacksize(&attr, kMinStackSize)) { pthread_attr_destroy(&attr); goto Error2; } } -#endif pthread_mutex_lock(&worker->impl_->mutex_); ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker); if (ok) worker->status_ = OK; -- GitLab From aaccabe09284727775fb051b66505da04615d693 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 6 Jun 2024 15:12:59 -0700 Subject: [PATCH 195/391] Define pthread_attr_getstacksize/setstacksize Bug: aomedia:2754, aomedia:3567 Change-Id: I54d9e3e8c1253c9cd9624b3732f4ec0a69d76529 --- aom_util/aom_thread.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/aom_util/aom_thread.h b/aom_util/aom_thread.h index ec2ea43491..0e469c0bfd 100644 --- a/aom_util/aom_thread.h +++ b/aom_util/aom_thread.h @@ -71,6 +71,20 @@ static INLINE int pthread_attr_destroy(pthread_attr_t *attr) { return 0; } +static INLINE int pthread_attr_getstacksize(const pthread_attr_t *attr, + size_t *stacksize) { + (void)attr; + (void)stacksize; + return EINVAL; +} + +static INLINE int pthread_attr_setstacksize(pthread_attr_t *attr, + size_t stacksize) { + (void)attr; + (void)stacksize; + return EINVAL; +} + static INLINE int pthread_create(pthread_t *const thread, const pthread_attr_t *attr, unsigned int(__stdcall *start)(void *), -- GitLab From 17aff846ba539ea287ec2b3ad12a17ac126e715e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 6 Jun 2024 16:32:50 -0700 Subject: [PATCH 196/391] Remove the double-check in multiply_and_scale() Revert commit 72b1f1d. It has been three months. Bug: b:319140742 Bug: oss-fuzz:66474 Change-Id: I862715e70d47666333e36565e99447e073019856 --- av1/encoder/pickrst.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c index 0b0ca1c8e0..f60499418b 100644 --- a/av1/encoder/pickrst.c +++ b/av1/encoder/pickrst.c @@ -1127,15 +1127,6 @@ static INLINE int64_t multiply_and_scale(int64_t x, int32_t w1, int32_t w2) { // Let y = x * w / WIENER_TAP_SCALE_FACTOR // = x * (w1 * WIENER_TAP_SCALE_FACTOR + w2) / WIENER_TAP_SCALE_FACTOR const int64_t y = x * w1 + x * w2 / WIENER_TAP_SCALE_FACTOR; - // Double-check the calculation using __int128. - // TODO(wtc): Remove after 2024-04-30. 
-#if !defined(NDEBUG) && defined(__GNUC__) && defined(__LP64__) - const int32_t w = w1 * WIENER_TAP_SCALE_FACTOR + w2; - const __int128 z = (__int128)x * w / WIENER_TAP_SCALE_FACTOR; - assert(z >= INT64_MIN); - assert(z <= INT64_MAX); - assert(y == (int64_t)z); -#endif return y; } -- GitLab From 759713613a9205ffc200ddf2aca3b50e33f4a0c6 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 14 May 2024 17:54:10 -0700 Subject: [PATCH 197/391] update codec config after svc/scale controls This ensures the encoder state/allocations stay in sync with scaling and svc layer changes. In the SVC case, depending on the resolution, differences in the chosen superblock size among layers may have caused a crash. This was reproducible in WebRTC in screen content mode. The fix is based on a change by Yuan Tong (tongyuan200097) [1]. It refreshes the encoder config after AOME_SET_SCALEMODE, AOME_SET_NUMBER_SPATIAL_LAYERS and AV1E_SET_SVC_PARAMS if no frames have been encoded. AV1E_SET_SVC_PARAMS was missed in the original change. [1]: https://aomedia-review.googlesource.com/c/aom/+/171941/2 Bug: chromium:339877165 Change-Id: Ib3d2a123b159898d7c7e19c81e89ff148920e1f1 (cherry picked from commit e42f4b1980bbbc772aa886d8b43a885461d7b89e) --- av1/av1_cx_iface.c | 99 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 70 insertions(+), 29 deletions(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 1175a32ef6..4d5992460f 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -1598,22 +1598,27 @@ static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx, return AOM_CODEC_OK; } +static aom_codec_err_t update_encoder_cfg(aom_codec_alg_priv_t *ctx) { + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + av1_check_fpmt_config(ctx->ppi, &ctx->oxcf); + bool is_sb_size_changed = false; + av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); + for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { + av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, + is_sb_size_changed); + } + if (ctx->ppi->cpi_lap != NULL) { + av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); + } + return AOM_CODEC_OK; +} + static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx, const struct av1_extracfg *extra_cfg) { const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); if (res == AOM_CODEC_OK) { ctx->extra_cfg = *extra_cfg; - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - av1_check_fpmt_config(ctx->ppi, &ctx->oxcf); - bool is_sb_size_changed = false; - av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); - for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { - av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, - is_sb_size_changed); - } - if (ctx->ppi->cpi_lap != NULL) { - av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); - } + return update_encoder_cfg(ctx); } return res; } @@ -3533,11 +3538,23 @@ static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx, aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *); if (mode) { - const int res = av1_set_internal_size( - &ctx->ppi->cpi->oxcf, &ctx->ppi->cpi->resize_pending_params, - mode->h_scaling_mode, mode->v_scaling_mode); - av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); - return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM; + AV1EncoderConfig *const oxcf = + ctx->ppi->seq_params_locked ? 
&ctx->ppi->cpi->oxcf : &ctx->oxcf; + const int res = + av1_set_internal_size(oxcf, &ctx->ppi->cpi->resize_pending_params, + mode->h_scaling_mode, mode->v_scaling_mode); + if (res == 0) { + // update_encoder_cfg() is somewhat costly and this control may be called + // multiple times, so update_encoder_cfg() is only called to ensure frame + // and superblock sizes are updated before they're fixed by the first + // encode call. + if (ctx->ppi->seq_params_locked) { + av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); + return AOM_CODEC_OK; + } + return update_encoder_cfg(ctx); + } + return AOM_CODEC_INVALID_PARAM; } else { return AOM_CODEC_INVALID_PARAM; } @@ -3558,6 +3575,13 @@ static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx, if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS) return AOM_CODEC_INVALID_PARAM; ctx->ppi->number_spatial_layers = number_spatial_layers; + // update_encoder_cfg() is somewhat costly and this control may be called + // multiple times, so update_encoder_cfg() is only called to ensure frame and + // superblock sizes are updated before they're fixed by the first encode + // call. + if (!ctx->ppi->seq_params_locked) { + return update_encoder_cfg(ctx); + } return AOM_CODEC_OK; } @@ -3575,8 +3599,6 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, va_list args) { AV1_PRIMARY *const ppi = ctx->ppi; AV1_COMP *const cpi = ppi->cpi; - AV1_COMMON *const cm = &cpi->common; - AV1EncoderConfig *oxcf = &cpi->oxcf; aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *); int64_t target_bandwidth = 0; ppi->number_spatial_layers = params->number_spatial_layers; @@ -3616,19 +3638,38 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, target_bandwidth += lc->layer_target_bitrate; } } - if (cm->current_frame.frame_number == 0) { - if (!cpi->ppi->seq_params_locked) { - SequenceHeader *const seq_params = &ppi->seq_params; - seq_params->operating_points_cnt_minus_1 = - ppi->number_spatial_layers * ppi->number_temporal_layers - 1; - av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1); - } + + if (ppi->seq_params_locked) { + AV1EncoderConfig *const oxcf = &cpi->oxcf; + // Keep ctx->oxcf in sync in case further codec controls are made prior + // to encoding. + ctx->oxcf.rc_cfg.target_bandwidth = oxcf->rc_cfg.target_bandwidth = + target_bandwidth; + set_primary_rc_buffer_sizes(oxcf, ppi); + av1_update_layer_context_change_config(cpi, target_bandwidth); + check_reset_rc_flag(cpi); + } else { + // Note av1_init_layer_context() relies on cpi->oxcf. The order of that + // call and the ones in the other half of this block (which + // update_encoder_cfg() transitively makes) is important. So we keep + // ctx->oxcf and cpi->oxcf in sync here as update_encoder_cfg() will + // overwrite cpi->oxcf with ctx->oxcf. + ctx->oxcf.rc_cfg.target_bandwidth = cpi->oxcf.rc_cfg.target_bandwidth = + target_bandwidth; + SequenceHeader *const seq_params = &ppi->seq_params; + seq_params->operating_points_cnt_minus_1 = + ppi->number_spatial_layers * ppi->number_temporal_layers - 1; + av1_init_layer_context(cpi); + // update_encoder_cfg() is somewhat costly and this control may be called + // multiple times, so update_encoder_cfg() is only called to ensure frame + // and superblock sizes are updated before they're fixed by the first + // encode call. 
+ return update_encoder_cfg(ctx); } - oxcf->rc_cfg.target_bandwidth = target_bandwidth; - set_primary_rc_buffer_sizes(oxcf, cpi->ppi); - av1_update_layer_context_change_config(cpi, target_bandwidth); - check_reset_rc_flag(cpi); + } else if (!ppi->seq_params_locked) { + // Ensure frame and superblock sizes are updated. + return update_encoder_cfg(ctx); } av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); return AOM_CODEC_OK; -- GitLab From 3a15245e13e347e2016d1077f9d3cd9a62e76c4e Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 16 May 2024 13:44:52 -0700 Subject: [PATCH 198/391] encode_api_test: add repro for chromium 339877165 BUG=chromium:339877165 Change-Id: I69dcc2cda098ec96a34e1e5f7ef557ee8caf5521 (cherry picked from commit 01467cdbd524900eed283660836179fd1b2cd536) --- test/encode_api_test.cc | 141 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 9a447b55a9..b48c5a2482 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -556,6 +556,147 @@ TEST(EncodeAPI, Buganizer310457427) { encoder.Encode(false); } +// Reproduces https://crbug.com/339877165. +TEST(EncodeAPI, Buganizer339877165) { + // Initialize libaom encoder. + aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 2560; + cfg.g_h = 1600; + cfg.rc_target_bitrate = 231; + cfg.rc_end_usage = AOM_CBR; + cfg.g_threads = 8; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // From libaom_av1_encoder.cc in WebRTC. + ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 11), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_CDEF, 1), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_TPL_MODEL, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_DELTAQ_MODE, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_ORDER_HINT, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_AQ_MODE, 3), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AOME_SET_MAX_INTRA_BITRATE_PCT, 300), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_COEFF_COST_UPD_FREQ, 3), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_MODE_COST_UPD_FREQ, 3), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_MV_COST_UPD_FREQ, 3), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_PALETTE, 1), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_TILE_ROWS, 1), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_TILE_COLUMNS, 2), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_OBMC, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_NOISE_SENSITIVITY, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_WARPED_MOTION, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_GLOBAL_MOTION, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_REF_FRAME_MVS, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_SUPERBLOCK_SIZE, + AOM_SUPERBLOCK_SIZE_DYNAMIC), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_CFL_INTRA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_SMOOTH_INTRA, 0), + 
AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_ANGLE_DELTA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_FILTER_INTRA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_INTRA_DEFAULT_TX_ONLY, 1), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_DISABLE_TRELLIS_QUANT, 1), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_DIST_WTD_COMP, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_DIFF_WTD_COMP, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_DUAL_FILTER, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_INTERINTRA_COMP, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_INTERINTRA_WEDGE, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_INTRA_EDGE_FILTER, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_INTRABC, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_MASKED_COMP, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_PAETH_INTRA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_QM, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_RECT_PARTITIONS, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_RESTORATION, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_SMOOTH_INTERINTRA, 0), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_ENABLE_TX64, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_MAX_REFERENCE_FRAMES, 3), + AOM_CODEC_OK); + ASSERT_EQ(aom_codec_enc_config_set(&enc, &cfg), AOM_CODEC_OK); + + aom_svc_params_t svc_params = {}; + svc_params.number_spatial_layers = 2; + svc_params.number_temporal_layers = 1; + svc_params.max_quantizers[0] = svc_params.max_quantizers[1] = 56; + svc_params.min_quantizers[0] = svc_params.min_quantizers[1] = 10; + svc_params.scaling_factor_num[0] = svc_params.scaling_factor_num[1] = 1; + svc_params.scaling_factor_den[0] = 2; + svc_params.scaling_factor_den[1] = 1; + svc_params.layer_target_bitrate[0] = cfg.rc_target_bitrate; + svc_params.framerate_factor[0] = 1; + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_SVC_PARAMS, &svc_params), + AOM_CODEC_OK); + + aom_svc_layer_id_t layer_id = {}; + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id), + AOM_CODEC_OK); + + aom_svc_ref_frame_config_t ref_frame_config = {}; + ref_frame_config.refresh[0] = 1; + ASSERT_EQ( + aom_codec_control(&enc, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config), + AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode layer 0. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK); + + layer_id.spatial_layer_id = 1; + ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id), + AOM_CODEC_OK); + + ref_frame_config.refresh[0] = 0; + ASSERT_EQ( + aom_codec_control(&enc, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config), + AOM_CODEC_OK); + + // Encode layer 1. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK); + + // Free resources. 
+  aom_img_free(image);
+  aom_codec_destroy(&enc);
+}
+
 class EncodeAPIParameterized
     : public testing::TestWithParam<std::tuple<
           /*usage=*/unsigned int, /*speed=*/int, /*aq_mode=*/unsigned int>> {};
 
-- 
GitLab


From 91001f5f377230337e40df045dbeb7babd703629 Mon Sep 17 00:00:00 2001
From: Yuan Tong <tongyuan200097@gmail.com>
Date: Wed, 8 Mar 2023 16:29:52 +0800
Subject: [PATCH 199/391] Update progressive test to catch more crash cases

Update AVIFProgressiveTest.DimensionChangeLargeImageMultiThread
to also catch the crash fixed by
Ib3d2a123b159898d7c7e19c81e89ff148920e1f1.

Bug: aomedia:3382

Change-Id: I5a65578c7793fdac96c2d41cd71d63a75f7b0d1d
(cherry picked from commit 00392c6223ac9aefc29e0d67929a943836ad8daf)
---
 test/avif_progressive_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/avif_progressive_test.cc b/test/avif_progressive_test.cc
index 2a28ca368b..59aebd486f 100644
--- a/test/avif_progressive_test.cc
+++ b/test/avif_progressive_test.cc
@@ -225,8 +225,6 @@ TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) {
   aom_codec_ctx_t enc;
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 31));
-  EXPECT_EQ(AOM_CODEC_OK,
-            aom_codec_control(&enc, AOME_SET_NUMBER_SPATIAL_LAYERS, 2));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6));
   EXPECT_EQ(AOM_CODEC_OK,
             aom_codec_control(&enc, AV1E_SET_ROW_MT, 1));  // MultiThread
@@ -234,6 +232,8 @@ TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) {
             aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE));
   EXPECT_EQ(AOM_CODEC_OK,
             aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_NUMBER_SPATIAL_LAYERS, 2));
 
   // First frame (layer 0)
   EXPECT_EQ(AOM_CODEC_OK,
-- 
GitLab


From 9dd7563b4b1db7da44baaf4a94b8c8d1c7135aad Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Thu, 16 May 2024 14:19:09 -0700
Subject: [PATCH 200/391] Add the DimensionChangeBigImageMultiThread2 test

It is a variant of the DimensionChangeBigImageMultiThread test. The
only difference is that it doesn't have the spatial layers.

This test passes after James Zern's commit e42f4b1980:
https://aomedia-review.googlesource.com/c/aom/+/190181

Bug: aomedia:3382
Change-Id: Ic21e3a71645ac96ebda0b1f2bdcbf709b8f079d5
(cherry picked from commit 6e3e2227eae988a7639d251d042c4fabb7db54d1)
---
 test/avif_progressive_test.cc | 91 ++++++++++++++++++++++++++++++++---
 1 file changed, 85 insertions(+), 6 deletions(-)

diff --git a/test/avif_progressive_test.cc b/test/avif_progressive_test.cc
index 59aebd486f..f3e2ef2af9 100644
--- a/test/avif_progressive_test.cc
+++ b/test/avif_progressive_test.cc
@@ -25,7 +25,7 @@ namespace {
 TEST(AVIFProgressiveTest, QualityChange) {
   constexpr int kWidth = 256;
   constexpr int kHeight = 256;
-  // Dummy buffer of neutral gray samples.
+  // A buffer of neutral gray samples.
constexpr size_t kBufferSize = 3 * kWidth * kHeight; std::vector<unsigned char> buffer(kBufferSize, static_cast<unsigned char>(128)); @@ -151,7 +151,7 @@ TEST(AVIFProgressiveTest, DimensionChange) { // First frame (layer 0) EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 0)); - aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; + const aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_SCALEMODE, &scaling_mode)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0)); @@ -192,10 +192,10 @@ TEST(AVIFProgressiveTest, DimensionChange) { // This test reproduces bug aomedia:3382. Certain parameters such as width, // height, g_threads, usage, etc. were carefully chosen based on the // complicated logic of av1_select_sb_size() to cause an inconsistent sb_size. -TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) { +TEST(AVIFProgressiveTest, DimensionChangeBigImageMultiThread) { constexpr int kWidth = 1920; constexpr int kHeight = 1080; - // Dummy buffer of neutral gray samples. + // A buffer of neutral gray samples. constexpr size_t kBufferSize = 2 * kWidth * kHeight; std::vector<unsigned char> buffer(kBufferSize, static_cast<unsigned char>(128)); @@ -238,7 +238,7 @@ TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) { // First frame (layer 0) EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 0)); - aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; + const aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_SCALEMODE, &scaling_mode)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0)); @@ -276,4 +276,83 @@ TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +// A variant of the previous test, without the spatial layers. +TEST(AVIFProgressiveTest, DimensionChangeBigImageMultiThread2) { + constexpr int kWidth = 1920; + constexpr int kHeight = 1080; + // A buffer of neutral gray samples. 
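+  // An I420 frame needs only 3 * kWidth * kHeight / 2 bytes, so the
+  // 2 * kWidth * kHeight allocation below is a comfortable upper bound.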
+ constexpr size_t kBufferSize = 2 * kWidth * kHeight; + std::vector<unsigned char> buffer(kBufferSize, + static_cast<unsigned char>(128)); + + aom_image_t img; + EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1, + buffer.data())); + img.cp = AOM_CICP_CP_UNSPECIFIED; + img.tc = AOM_CICP_TC_UNSPECIFIED; + img.mc = AOM_CICP_MC_UNSPECIFIED; + img.range = AOM_CR_FULL_RANGE; + + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY)); + cfg.g_profile = 0; + cfg.g_w = img.w; + cfg.g_h = img.h; + cfg.g_bit_depth = AOM_BITS_8; + cfg.g_input_bit_depth = 8; + cfg.g_lag_in_frames = 0; + cfg.g_threads = 2; // MultiThread + cfg.rc_end_usage = AOM_Q; + cfg.rc_min_quantizer = 0; + cfg.rc_max_quantizer = 63; + aom_codec_ctx_t enc; + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 31)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AV1E_SET_ROW_MT, 1)); // MultiThread + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM)); + + // First frame + const aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO }; + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AOME_SET_SCALEMODE, &scaling_mode)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0)); + aom_codec_iter_t iter = nullptr; + const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x1f0011. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Second frame + aom_enc_frame_flags_t encode_flags = + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF; + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, encode_flags)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Flush encoder + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); +} + } // namespace -- GitLab From 43500614aad8a2e3d740ca5b3f1451f82046e238 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 29 May 2024 13:26:24 -0700 Subject: [PATCH 201/391] av1_block_error_lp_neon: fix block_size param type int -> intptr_t. This fixes a Control Flow Integrity (CFI) sanitizer failure. This also fixes a -Wmissing-prototypes warning. 
This is a port of a broader change that contained the fix: b44333201b *_neon.c: add missing rtcd includes & CONFIG check Bug: aomedia:3416 Change-Id: I2e6980fba33631f5bb612d40dfc83b6f2527fe4b (cherry picked from commit bfd5fa58ddf93f7a716472cddd4fdd9930bd2525) --- av1/encoder/arm/neon/av1_error_neon.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/av1/encoder/arm/neon/av1_error_neon.c b/av1/encoder/arm/neon/av1_error_neon.c index 7d24c7d7af..84c896736e 100644 --- a/av1/encoder/arm/neon/av1_error_neon.c +++ b/av1/encoder/arm/neon/av1_error_neon.c @@ -12,6 +12,7 @@ #include <assert.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" @@ -60,7 +61,7 @@ int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, } int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { + intptr_t block_size) { int64x2_t error = vdupq_n_s64(0); assert(block_size >= 8); -- GitLab From bc47c3707b520b25db4f13e14dae1f4692beeb77 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 30 Apr 2024 12:52:42 -0700 Subject: [PATCH 202/391] noise_model_test.cc: fix -Wc++20-extensions warning Add an empty fourth argument to INSTANTIATE_TYPED_TEST_SUITE_P(). Fixes: aom/test/noise_model_test.cc:536:49: warning: passing no argument for the '...' parameter of a variadic macro is a C++20 extension Change-Id: Id1457ad67a101502f6b811eacfaf483dacd27848 (cherry picked from commit b736e96c15b3efe643a82d394b20c6d44fd225b6) --- test/noise_model_test.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc index b3edcc218e..87f607c155 100644 --- a/test/noise_model_test.cc +++ b/test/noise_model_test.cc @@ -532,8 +532,10 @@ typedef ::testing::Types<BitDepthParams<uint8_t, 8, false>, // lowbd BitDepthParams<uint16_t, 10, true>, // highbd data BitDepthParams<uint16_t, 12, true> > AllBitDepthParams; +// Note the empty final argument can be removed if C++20 is made the minimum +// requirement. INSTANTIATE_TYPED_TEST_SUITE_P(FlatBlockInstatiation, FlatBlockEstimatorTest, - AllBitDepthParams); + AllBitDepthParams, ); template <typename T> class NoiseModelUpdateTest : public ::testing::Test, public T { @@ -968,8 +970,10 @@ REGISTER_TYPED_TEST_SUITE_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks, NoiseStrengthChangeSignalsDifferentNoiseType, NoiseCoeffsSignalsDifferentNoiseType); +// Note the empty final argument can be removed if C++20 is made the minimum +// requirement. INSTANTIATE_TYPED_TEST_SUITE_P(NoiseModelUpdateTestInstatiation, - NoiseModelUpdateTest, AllBitDepthParams); + NoiseModelUpdateTest, AllBitDepthParams, ); TEST(NoiseModelGetGrainParameters, TestLagSize) { aom_film_grain_t film_grain; @@ -1368,5 +1372,7 @@ TYPED_TEST_P(WienerDenoiseTest, GradientTest) { REGISTER_TYPED_TEST_SUITE_P(WienerDenoiseTest, InvalidBlockSize, InvalidChromaSubsampling, GradientTest); +// Note the empty final argument can be removed if C++20 is made the minimum +// requirement. 
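+// (INSTANTIATE_TYPED_TEST_SUITE_P accepts an optional name-generator type as
+// a variadic fourth parameter; the explicit trailing comma passes an empty
+// argument, so compilers in pre-C++20 modes do not warn about an omitted
+// variadic argument.)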
 INSTANTIATE_TYPED_TEST_SUITE_P(WienerDenoiseTestInstatiation, WienerDenoiseTest,
-                               AllBitDepthParams);
+                               AllBitDepthParams, );
-- GitLab


From d31978b8850078a31ec4fc21406020cd76bb6c40 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Tue, 30 Apr 2024 16:20:47 -0700
Subject: [PATCH 203/391] common/tools_common.h: port f{seek,tell}o fix from libvpx

https://chromium-review.googlesource.com/c/webm/libvpx/+/5074786
bf0755418 Add the needed Android API level predicates.

Bug: aomedia:3561
Change-Id: Ie5c4b3134f3842cd55e5b07e22dffa4ba2584ea8
(cherry picked from commit f4eaf8b55e58102c3f9d2bab7658b9f6063ad400)
---
 common/tools_common.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/common/tools_common.h b/common/tools_common.h
index b31371c670..9d891d1561 100644
--- a/common/tools_common.h
+++ b/common/tools_common.h
@@ -37,8 +37,13 @@ typedef int64_t FileOffset;
 #define fseeko fseeko64
 #define ftello ftello64
 typedef off64_t FileOffset;
-#elif CONFIG_OS_SUPPORT
-#include <sys/types.h> /* NOLINT*/
+#elif CONFIG_OS_SUPPORT && \
+    !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \
+      defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64)
+/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before
+ * Android API level 24. See
+ * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */
+#include <sys/types.h> /* NOLINT */
 typedef off_t FileOffset;
 /* Use 32-bit file operations in WebM file format when building ARM
  * executables (.axf) with RVCT. */
-- GitLab


From 8e58f5b7f421961b83dceb07c4051c6d6dd424c1 Mon Sep 17 00:00:00 2001
From: Vignesh Venkat <vigneshv@google.com>
Date: Tue, 4 Jun 2024 14:20:00 -0700
Subject: [PATCH 204/391] av1_cx_iface: Do not require timebase.num < timebase.den

This has been in libvpx for a while now.
VP9: https://chromium-review.googlesource.com/c/webm/libvpx/+/332449
VP8: https://chromium-review.googlesource.com/c/webm/libvpx/+/274107

Generally, we set the timebase to be the inverse of the desired frame
rate. So this change allows us to express that using the timebase
(e.g. a frame rate of 0.5 fps becomes a timebase of 2/1).

Tested with an input video of 0.5 fps and the output is as intended
with this change (it fails before this change).

Bug: aomedia:3580
Change-Id: I296620f7625824ff460aa503388ed821c7e6ddd0
(cherry picked from commit 2badbae61d6ee0996152d640f1e1896c0a0548ab)
---
 av1/av1_cx_iface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 4d5992460f..95f0ce827d 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -659,7 +659,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
     ERROR("max_frame_area out of range [..2^30]");
   }
   RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
-  RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
+  RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000);
   RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
   RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
-- GitLab


From 318d3ba463f200ecfc9c246b0aaeca3cfa6578d8 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Fri, 7 Jun 2024 10:06:46 -0700
Subject: [PATCH 205/391] Update CHANGELOG, etc.
for libaom v3.8.3 Bug: aomedia:3581 Change-Id: I8dcdbcab302f226b758433e7bae720f5d1b31d8e --- .mailmap | 1 + CHANGELOG | 16 ++++++++++++++++ CMakeLists.txt | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 6d6e6302bc..7ddc582d07 100644 --- a/.mailmap +++ b/.mailmap @@ -98,6 +98,7 @@ Tom Finegan <tomfinegan@google.com> Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org> Tristan Matthews <tmatth@videolan.org> <le.businessman@gmail.com> Venkat Sanampudi <sanampudi.venkatarao@ittiam.com> +Vignesh Venkatasubramanian <vigneshv@google.com> Vitalii Dziumenko <vdziumenko@luxoft.com> <vdziumenko@luxoft.corp-partner.google.com> Wei-Ting Lin <weitinglin@google.com> Wei-Ting Lin <weitinglin@google.com> <weitingco@gmail.com> diff --git a/CHANGELOG b/CHANGELOG index c95ac37460..35737c1efe 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,19 @@ +2024-06-07 v3.8.3 + This release includes several bug fixes. This release is ABI + compatible with the last release. See + https://aomedia.googlesource.com/aom/+log/v3.8.2..v3.8.3 for all the + commits in this release. + + - Bug Fixes + * aomedia:2754, aomedia:3567: Ensure thread stack size is at least + 256 KB + * aomedia:3382, chromium:339877165: update codec config after + svc/scale controls (CVE-2024-5493) + * aomedia:3561: libaom-3.8.2 armv7 Android build failed + * aomedia:3580: Allow g_timebase.num to be greater than + g_timebase.den + * av1_block_error_lp_neon: fix block_size param type + 2024-03-08 v3.8.2 This release includes several bug fixes. This release is ABI compatible with the last release. See diff --git a/CMakeLists.txt b/CMakeLists.txt index bf0776e3a3..4674396ed1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,7 +59,7 @@ endif() # # We set SO_FILE_VERSION = [c-a].a.r set(LT_CURRENT 11) -set(LT_REVISION 2) +set(LT_REVISION 3) set(LT_AGE 8) math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}") set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}") -- GitLab From 49b7dcd627a0c00f62b8db23320f382d296b4637 Mon Sep 17 00:00:00 2001 From: Martin Storsjo <martin@martin.st> Date: Wed, 1 May 2024 00:45:41 +0300 Subject: [PATCH 206/391] cpu.cmake: Do more elaborate test of whether SVE can be compiled For Windows targets, Clang will successfully compile simpler SVE functions, but if the function requires backing up and restoring SVE registers (as part of the AAPCS calling convention), Clang will fail to generate unwind data for this function, resulting in an error. This issue is tracked upstream in Clang in https://github.com/llvm/llvm-project/issues/80009. Check whether the compiler can compile such a function, and disable SVE if it is unable to handle that case. Change-Id: I307d7398cedd1942c39ef034431a51696264ff47 (cherry picked from commit 5ccdc66ab6eb8eb300eda854fab4ff250b2c2f92) --- build/cmake/cpu.cmake | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake index bd13d035d5..776f6af691 100644 --- a/build/cmake/cpu.cmake +++ b/build/cmake/cpu.cmake @@ -55,8 +55,18 @@ if("${AOM_TARGET_CPU}" STREQUAL "arm64") #endif #include <arm_sve.h> #include <arm_neon_sve_bridge.h>" HAVE_SVE_HEADERS) + # Check whether the compiler can compile SVE functions that require + # backup/restore of SVE registers according to AAPCS. Clang for Windows used + # to fail this, see https://github.com/llvm/llvm-project/issues/80009. 
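+# The function below passes an SVE vector across a call to an external
+# function, which forces the compiler to back up and restore SVE registers
+# per the AAPCS -- the exact case that used to break unwind-data generation.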
+ aom_check_source_compiles("arm_sve_preserve" " +#include <arm_sve.h> +void other(void); +svfloat32_t func(svfloat32_t a) { + other(); + return a; +}" CAN_COMPILE_SVE) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) - if(HAVE_SVE_HEADERS EQUAL 0) + if(HAVE_SVE_HEADERS EQUAL 0 OR CAN_COMPILE_SVE EQUAL 0) set(ENABLE_SVE 0) endif() endif() -- GitLab From 5f8424b269c11ee3b0cc777e53bcf6ecbca1e547 Mon Sep 17 00:00:00 2001 From: George Steed <george.steed@arm.com> Date: Sat, 4 May 2024 13:20:42 +0100 Subject: [PATCH 207/391] cpu.cmake: Address issues in SVE feature tests A test to check that SVE registers were correctly handled as function parameters was added in 5ccdc66ab6eb8eb300eda854fab4ff250b2c2f92, however this appears to have a couple of issues: * Semicolons need to be escaped, else the compiler fails to compile due to invalid syntax. We can fix this by prefixing each semicolon with a backslash. * The "other" function does not have a definition so the test program will always fail to link even if it compiles to an object file. We can work around this by instructing CMake to only try compiling up to a static library rather than a full executable. Change-Id: Ic37280d4b42b9031e68bed8a4b24c0eb51491827 (cherry picked from commit fb21617c1f3ef49795597e006b68adfba6e54be0) --- build/cmake/cpu.cmake | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake index 776f6af691..1fa934bbf8 100644 --- a/build/cmake/cpu.cmake +++ b/build/cmake/cpu.cmake @@ -48,7 +48,9 @@ if("${AOM_TARGET_CPU}" STREQUAL "arm64") # SVE requires that the Neon-SVE bridge header is also available. if(ENABLE_SVE) set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + set(OLD_CMAKE_TRY_COMPILE_TARGET_TYPE ${CMAKE_TRY_COMPILE_TARGET_TYPE}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_SVE_FLAG}") + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) aom_check_source_compiles("arm_neon_sve_bridge_available" " #ifndef __ARM_NEON_SVE_BRIDGE #error 1 @@ -60,12 +62,13 @@ if("${AOM_TARGET_CPU}" STREQUAL "arm64") # to fail this, see https://github.com/llvm/llvm-project/issues/80009. aom_check_source_compiles("arm_sve_preserve" " #include <arm_sve.h> -void other(void); +void other(void)\; svfloat32_t func(svfloat32_t a) { - other(); - return a; + other()\; + return a\; }" CAN_COMPILE_SVE) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) + set(CMAKE_TRY_COMPILE_TARGET_TYPE ${OLD_CMAKE_TRY_COMPILE_TARGET_TYPE}) if(HAVE_SVE_HEADERS EQUAL 0 OR CAN_COMPILE_SVE EQUAL 0) set(ENABLE_SVE 0) endif() -- GitLab From 2f571cd9fcc6ac8186564f2443f1b3461231ad79 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Fri, 7 Jun 2024 14:56:50 -0700 Subject: [PATCH 208/391] Update CHANGELOG again for libaom v3.8.3 Bug: aomedia:3581 Change-Id: I67eee1bfff0f779ec821408b5a7ca5839754e124 --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index 35737c1efe..84bcba3086 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -12,6 +12,7 @@ * aomedia:3561: libaom-3.8.2 armv7 Android build failed * aomedia:3580: Allow g_timebase.num to be greater than g_timebase.den + * Arm SVE build fixes. 
 * av1_block_error_lp_neon: fix block_size param type
 
 2024-03-08 v3.8.2
-- GitLab


From 5e3c5febea2225be437d74bc5413eec1893f3ff8 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Mon, 10 Jun 2024 15:26:10 -0700
Subject: [PATCH 209/391] Propagate return value of parse_operating_points()

In decoder_peek_si_internal(), if the parse_operating_points() call
fails, propagate its return value rather than return AOM_CODEC_ERROR.

Change-Id: I569c8ef3c2c0d09d442ea03c0c38e1b6c8672b25
---
 av1/av1_dx_iface.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index 1a2dea37b6..18dc980f6f 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -310,10 +310,8 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
     return AOM_CODEC_UNSUP_BITSTREAM;
   }
 
-  if (parse_operating_points(&rb, reduced_still_picture_hdr, si) !=
-      AOM_CODEC_OK) {
-    return AOM_CODEC_ERROR;
-  }
+  status = parse_operating_points(&rb, reduced_still_picture_hdr, si);
+  if (status != AOM_CODEC_OK) return status;
 
   int num_bits_width = aom_rb_read_literal(&rb, 4) + 1;
   int num_bits_height = aom_rb_read_literal(&rb, 4) + 1;
-- GitLab


From 569e9ec2a799f01355d5e94d1287d119cc024e42 Mon Sep 17 00:00:00 2001
From: Rachel Barker <rachelbarker@google.com>
Date: Wed, 6 Mar 2024 19:12:09 +0000
Subject: [PATCH 210/391] Enable global motion for speed 5 & speed 6

av1_resize_plane() optimizations have been completed recently. Now,
as the GM tool is turned on, a good coding gain (0.7% - 1.3%) is seen
with a moderate encoder time increase (~5%). The speed/quality
tradeoff is good. Here is the borg test result:

            avg_psnr:  ovr_psnr:   ssim:    vmaf:  encoding_time:
speed 5:
  hdres2:    -0.770     -0.761    -0.713   -1.075       5.791
  midres2:   -1.329     -1.365    -1.297   -1.613       4.858
  lowres2:   -1.286     -1.316    -1.686   -1.100       3.866
speed 6:
  hdres2:    -0.701     -0.719    -0.654   -0.922       5.327
  midres2:   -1.239     -1.306    -1.285   -1.622       4.922
  lowres2:   -1.277     -1.297    -1.634   -1.553       4.116

STATS_CHANGED

Change-Id: I9bb10ece68bcbacfc35b871a4a1162e5eaa8035b
---
 av1/encoder/speed_features.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index a65ac3091b..893749c88b 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1238,8 +1238,6 @@ static void set_good_speed_features_framesize_independent(
 
     sf->fp_sf.reduce_mv_step_param = 4;
 
-    sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
-
     sf->part_sf.simple_motion_search_prune_agg =
         allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3;
     sf->part_sf.ext_partition_eval_thresh =
-- GitLab


From d3d536e937f682abcfb30c7d3ce52d250abb419f Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Tue, 11 Jun 2024 10:52:44 -0700
Subject: [PATCH 211/391] rtc: Fix multi-threading settings for SVC datarate
 tests

Pass in tile_columns_ and tile_rows_ from each test with
multi-threading. This fixes the 4-thread test case, where the
intention was to use a 2x2 (tile_col x tile_row) config but the
current test was not doing that.

Also add a test for (1SL, 2TL) with 4 threads for more coverage.
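For reference, the AV1E_SET_TILE_COLUMNS / AV1E_SET_TILE_ROWS controls
take log2 values, so the 2x2 tile layout is requested as (illustrative
snippet, not part of this change):

  encoder->Control(AV1E_SET_TILE_COLUMNS, 1);  // 2^1 = 2 tile columns
  encoder->Control(AV1E_SET_TILE_ROWS, 1);     // 2^1 = 2 tile rows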
Change-Id: I910f290ad3705cfcee8b9a874e13e911243a3f54 --- test/datarate_test.cc | 4 +-- test/datarate_test.h | 9 ++++--- test/svc_datarate_test.cc | 57 ++++++++++++++++++++++++++++++++------- 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/test/datarate_test.cc b/test/datarate_test.cc index a75a72fab6..9b73f79aed 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -162,7 +162,7 @@ class DatarateTestLarge const int bitrate_array[2] = { 250, 650 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); - tile_column_ = 2; + tile_columns_ = 2; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate), effective_datarate_ * 0.85) @@ -354,7 +354,7 @@ class DatarateTestLarge const int bitrate_array[2] = { 250, 650 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); - tile_column_ = 1; + tile_columns_ = 1; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate), effective_datarate_ * 0.85) diff --git a/test/datarate_test.h b/test/datarate_test.h index accc1ad86b..869c22150a 100644 --- a/test/datarate_test.h +++ b/test/datarate_test.h @@ -42,7 +42,8 @@ class DatarateTest : public ::libaom_test::EncoderTest { bits_total_ = 0; denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; - tile_column_ = 0; + tile_columns_ = 0; + tile_rows_ = 0; screen_mode_ = false; max_perc_spike_ = 1.0; max_perc_spike_high_ = 1.0; @@ -62,7 +63,8 @@ class DatarateTest : public ::libaom_test::EncoderTest { if (video->frame() == 0) { encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); - encoder->Control(AV1E_SET_TILE_COLUMNS, tile_column_); + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); + encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); encoder->Control(AV1E_SET_ROW_MT, 1); if (cfg_.g_usage == AOM_USAGE_REALTIME) { encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); @@ -203,7 +205,8 @@ class DatarateTest : public ::libaom_test::EncoderTest { int denoiser_offon_period_; unsigned int aq_mode_; bool speed_change_test_; - int tile_column_; + int tile_columns_; + int tile_rows_; bool screen_mode_; double max_perc_spike_; double max_perc_spike_high_; diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 28f795cf2a..16fbb0bd3e 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -118,15 +118,8 @@ class DatarateTestSVC encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0); encoder->Control(AV1E_SET_DELTAQ_MODE, 0); if (cfg_.g_threads > 1) { - if (cfg_.g_threads == 4) { - encoder->Control(AV1E_SET_TILE_COLUMNS, 2); - encoder->Control(AV1E_SET_TILE_ROWS, 2); - } else if (cfg_.g_threads == 8) { - encoder->Control(AV1E_SET_TILE_COLUMNS, 4); - encoder->Control(AV1E_SET_TILE_ROWS, 2); - } else { - encoder->Control(AV1E_SET_TILE_COLUMNS, cfg_.g_threads >> 1); - } + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); + encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); encoder->Control(AV1E_SET_ROW_MT, 1); } if (screen_mode_) { @@ -1575,6 +1568,8 @@ class DatarateTestSVC const int bitrate_array[2] = { 600, 1200 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); + tile_columns_ = 1; + tile_rows_ = 0; set_speed_per_layer_ = true; number_temporal_layers_ = 3; number_spatial_layers_ = 3; @@ -1618,6 +1613,8 @@ class DatarateTestSVC const int bitrate_array[2] = { 600, 1200 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); + tile_columns_ = 1; + tile_rows_ = 0; 
number_temporal_layers_ = 3; number_spatial_layers_ = 3; // SL0 @@ -1644,6 +1641,37 @@ class DatarateTestSVC } } + virtual void BasicRateTargetingSVC2TL1SLHDMultiThread4Test() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + cfg_.g_threads = 4; + + ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + tile_columns_ = 1; + tile_rows_ = 1; + number_temporal_layers_ = 2; + number_spatial_layers_ = 1; + target_layer_bitrate_[0] = 60 * cfg_.rc_target_bitrate / 100; + target_layer_bitrate_[1] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45) + << " The datarate for the file is greater than target by too much!"; + } + } + virtual void BasicRateTargetingSVC3TL3SLHDMultiThread4Test() { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; @@ -1660,6 +1688,8 @@ class DatarateTestSVC const int bitrate_array[2] = { 600, 1200 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); + tile_columns_ = 1; + tile_rows_ = 1; number_temporal_layers_ = 3; number_spatial_layers_ = 3; // SL0 @@ -2504,8 +2534,15 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLMultiThreadSpeedPerLayer) { TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiThread2) { BasicRateTargetingSVC3TL3SLHDMultiThread2Test(); } + +// Check basic rate targeting for CBR, for 1 spatial, 2 temporal layers, +// for 4 threads, 2 tile_columns, 2 tiles_rows, row-mt enabled. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLHDMultiThread4) { + BasicRateTargetingSVC2TL1SLHDMultiThread4Test(); +} + // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers, -// for 4 threads, 4 tile_columns, row-mt enabled. +// for 4 threads, 2 tile_columns, 2 tiles_rows, row-mt enabled. TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiThread4) { BasicRateTargetingSVC3TL3SLHDMultiThread4Test(); } -- GitLab From 0f151b1b809e3cce940e88555b07d0e55a0fa0f1 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 11 Jun 2024 11:54:01 -0700 Subject: [PATCH 212/391] external_frame_buffer_test.cc: update copyright This file was deleted in the initial fork and restored in 2018 with the WebM copyright. Updating the copyright is acceptable as the author is a google employee and aomedia member. Bug: aomedia:3525 Change-Id: Ib0bcae3b72fad5ef10dc4a1c7c1a28b41d3bc87c --- test/external_frame_buffer_test.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index 8f16c4e2d5..0bf0f6bdc6 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc @@ -1,11 +1,12 @@ /* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <memory> -- GitLab From 779af95bba1ecb9fc3efc731351c53d62090c25f Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 11 Jun 2024 14:17:20 -0700 Subject: [PATCH 213/391] rtc: Reset x->nonrd_prune_ref_frame_search to frame value The superblock value of nonrd_prune_ref_frame_search should be reset to the frame level value (sf->rt_sf.nonrd_prune_ref_frame_search). This fixed a non-bitexactness for multi-threading at speed 11, observed in a manual run with this patch (not merged in): https://aomedia-review.googlesource.com/c/aom/+/190884 Change-Id: I692477c7d6fef82e92a3b6cff478560f62521728 --- av1/encoder/encodeframe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index 4c178b18c0..46e92f8ede 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -1212,6 +1212,8 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, x->sb_me_mv.as_int = 0; x->sb_force_fixed_part = 1; x->color_palette_thresh = 64; + x->nonrd_prune_ref_frame_search = + cpi->sf.rt_sf.nonrd_prune_ref_frame_search; if (cpi->oxcf.mode == ALLINTRA) { x->intra_sb_rdmult_modifier = 128; -- GitLab From d41d6652b66cad47d75e4c7ccbba45f72c35a38c Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 11 Jun 2024 16:13:19 -0700 Subject: [PATCH 214/391] av1_error_{neon,sve}.c: update copyright av1_error_neon.c was imported from libvpx. As the author and member of aomedia I can grant the change in license. av1_error_sve.c copied the copyright from av1_error_neon.c, though it was authored in this project. Bug: aomedia:3525 Change-Id: I4182055d6752a7a64ad069c318e9546c92a0abf8 --- av1/encoder/arm/av1_error_neon.c | 14 +++++++------- av1/encoder/arm/av1_error_sve.c | 13 +++++++------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/av1/encoder/arm/av1_error_neon.c b/av1/encoder/arm/av1_error_neon.c index 1d4299fec9..8311546a89 100644 --- a/av1/encoder/arm/av1_error_neon.c +++ b/av1/encoder/arm/av1_error_neon.c @@ -1,12 +1,12 @@ /* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * Copyright (c) 2019, Alliance for Open Media. All rights reserved * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <arm_neon.h> diff --git a/av1/encoder/arm/av1_error_sve.c b/av1/encoder/arm/av1_error_sve.c index 5a1ad2f48e..60d368c1b5 100644 --- a/av1/encoder/arm/av1_error_sve.c +++ b/av1/encoder/arm/av1_error_sve.c @@ -1,11 +1,12 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <arm_neon.h> -- GitLab From c08b9808c3c91d21c9c06320db832ecf0f5a7b4c Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 11 Jun 2024 23:30:03 -0700 Subject: [PATCH 215/391] rtc: Fix source_sad setting near boundary Avoid using the src_sad_blk_64x64[] near the boundary as this may not be set in the scene_detection under certain conditions (like dropped frame or temporal layers). So return curr_sb_sad = UINT64_MAX which triggers source_sad calculation for that boundary block. This prevents incorrectly setting the value x->content_state_sb.source_sad_rd near the boundary, which was causing the psnr regression for scroll clips, where new text was coming in from the bottom boundary. Bug: b/343429036 Change-Id: I0611b8f3e87fd05f6115570ab22bad8963f93d64 --- av1/encoder/encodeframe.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index 46e92f8ede..10fc611148 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -1008,12 +1008,16 @@ static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row, const int blk_64x64_col_index = mi_col / blk_64x64_in_mis; const int blk_64x64_row_index = mi_row / blk_64x64_in_mis; uint64_t curr_sb_sad = UINT64_MAX; + // Avoid the border as sad_blk_64x64 may not be set for the border + // in the scene detection. 
+ if ((blk_64x64_row_index >= num_blk_64x64_rows - 1) || + (blk_64x64_col_index >= num_blk_64x64_cols - 1)) { + return curr_sb_sad; + } const uint64_t *const src_sad_blk_64x64_data = &cpi->src_sad_blk_64x64[blk_64x64_col_index + blk_64x64_row_index * num_blk_64x64_cols]; - if (cm->seq_params->sb_size == BLOCK_128X128 && - blk_64x64_col_index + 1 < num_blk_64x64_cols && - blk_64x64_row_index + 1 < num_blk_64x64_rows) { + if (cm->seq_params->sb_size == BLOCK_128X128) { // Calculate SB source SAD by accumulating source SAD of 64x64 blocks in the // superblock curr_sb_sad = src_sad_blk_64x64_data[0] + src_sad_blk_64x64_data[1] + -- GitLab From 2518fbce8837412aaef92e9045eaefebe70cb468 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 11 Jun 2024 18:07:22 -0700 Subject: [PATCH 216/391] av1_inv_txfm_neon.c: add missing prototype For av1_lowbd_inv_txfm2d_add_neon(). This function is exposed to allow testing in av1_inv_txfm2d_test.cc. Bug: aomedia:3416 Change-Id: Ied05d3b6589c9c38ef14f204cea35294fb229cd7 --- av1/common/arm/av1_inv_txfm_neon.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c index f15d473560..06168cd3f1 100644 --- a/av1/common/arm/av1_inv_txfm_neon.c +++ b/av1/common/arm/av1_inv_txfm_neon.c @@ -4177,6 +4177,11 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon( } } +// This function is used by av1_inv_txfm2d_test.cc. +void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob); + void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { -- GitLab From 4d4c425fa802fc0bc71adf2935b886b0d0ed2bc2 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 10 May 2024 19:34:48 -0700 Subject: [PATCH 217/391] av1_rtcd_defs.pl: rm av1_highbd_inv_txfm_add_NxM These functions are (mostly) unused outside of the file they're defined in. Only the av1_highbd_inv_txfm_add() function is invoked directly. This matches the setup for av1_inv_txfm_add(). 
Bug: aomedia:3416 Change-Id: Iccfede49f339d6428feb17426d5ab0a1f3eee24a --- av1/common/arm/highbd_inv_txfm_neon.c | 265 ++------------------------ av1/common/av1_rtcd_defs.pl | 37 ---- av1/common/idct.c | 129 +++++++------ av1/common/x86/highbd_inv_txfm_avx2.c | 19 +- av1/common/x86/highbd_inv_txfm_sse4.c | 36 ++-- 5 files changed, 107 insertions(+), 379 deletions(-) diff --git a/av1/common/arm/highbd_inv_txfm_neon.c b/av1/common/arm/highbd_inv_txfm_neon.c index 84bc8fd963..70c09e7440 100644 --- a/av1/common/arm/highbd_inv_txfm_neon.c +++ b/av1/common/arm/highbd_inv_txfm_neon.c @@ -4965,120 +4965,6 @@ void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output, } } -static void highbd_inv_txfm2d_add_4x16_neon(const int32_t *input, - uint16_t *output, int stride, - TX_TYPE tx_type, int eob, - const int bd) { - (void)eob; - TX_SIZE tx_size = TX_4X16; - int32x4_t buf1[16]; - const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_h_div8 = txfm_size_row >> 2; - const transform_1d_neon row_txfm = - highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; - const transform_1d_neon col_txfm = - highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; - const int input_stride = AOMMIN(32, txfm_size_col); - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - - // 1st stage: column transform - int32x4_t buf0[16]; - const int32_t *input_row = input; - int32x4_t *buf0_cur = buf0; - load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); - for (int i = 0; i < (txfm_size_row >> 2); i++) { - row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]); - } - - if (lr_flip) { - for (int j = 0; j < buf_size_h_div8; ++j) { - TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], - buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], - buf1[4 * j + 3]); - } - } else { - for (int j = 0; j < buf_size_h_div8; ++j) { - TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], - buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], - buf1[4 * j + 2], buf1[4 * j + 3]); - } - } - - // 2nd stage: column transform - col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); - - round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]); - - // write to buffer - highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row, - bd); -} - -static void highbd_inv_txfm2d_add_16x4_neon(const int32_t *input, - uint16_t *output, int stride, - TX_TYPE tx_type, int eob, - const int bd) { - (void)eob; - TX_SIZE tx_size = TX_16X4; - int32x4_t buf1[16]; - const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_w_div8 = txfm_size_col >> 2; - const transform_1d_neon row_txfm = - highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; - const transform_1d_neon col_txfm = - highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - - // 1st stage: column transform - int32x4_t buf0[16]; - const int32_t *input_row = input; - 
load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); - - for (int j = 0; j < buf_size_w_div8; j++) { - TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j], - buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); - } - row_txfm(buf1, buf0, INV_COS_BIT, 0, bd, -shift[0]); - - int32x4_t *buf1_ptr; - if (lr_flip) { - flip_buf_neon(buf0, buf1, txfm_size_col); - buf1_ptr = buf1; - } else { - buf1_ptr = buf0; - } - - // 2nd stage: column transform - for (int i = 0; i < buf_size_w_div8; i++) { - col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, - INV_COS_BIT, 1, bd, 0); - } - round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); - - // write to buffer - for (int i = 0; i < (txfm_size_col >> 3); i++) { - highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2, - output + 8 * i, stride, ud_flip, txfm_size_row, - bd); - } -} - static const int lowbd_txfm_all_1d_zeros_idx[32] = { 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -5658,8 +5544,9 @@ static void inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output, } } -void av1_highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); @@ -5682,8 +5569,9 @@ void av1_highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest, } } -void av1_highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); int eob = txfm_param->eob; int bd = txfm_param->bd; @@ -5699,210 +5587,79 @@ void av1_highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest, bd); } -void av1_highbd_inv_txfm_add_4x8_neon(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); -} - -void av1_highbd_inv_txfm_add_8x4_neon(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); -} - void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X16, bd); } -void av1_highbd_inv_txfm_add_4x16_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - int eob = txfm_param->eob; - highbd_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, eob, bd); -} - -void av1_highbd_inv_txfm_add_16x4_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - int eob = txfm_param->eob; - highbd_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, eob, bd); -} - -void av1_highbd_inv_txfm_add_8x16_neon(const tran_low_t *input, uint8_t *dest, - int 
stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_8X16, txfm_param->eob, txfm_param->bd); -} - -void av1_highbd_inv_txfm_add_16x8_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_16X8, txfm_param->eob, txfm_param->bd); -} - void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X8, bd); } -void av1_highbd_inv_txfm_add_16x32_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_16X32, txfm_param->eob, - txfm_param->bd); -} - void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X32, bd); } -void av1_highbd_inv_txfm_add_32x16_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_32X16, txfm_param->eob, - txfm_param->bd); -} - void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X16, bd); } -void av1_highbd_inv_txfm_add_32x32_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_32X32, txfm_param->eob, - txfm_param->bd); -} - void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X32, bd); } -void av1_highbd_inv_txfm_add_64x64_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_64X64, txfm_param->eob, - txfm_param->bd); -} - void av1_inv_txfm2d_add_64x64_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_64X64, bd); } -void av1_highbd_inv_txfm_add_32x64_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_32X64, txfm_param->eob, - txfm_param->bd); -} - void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X64, bd); } -void av1_highbd_inv_txfm_add_64x32_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_64X32, txfm_param->eob, - txfm_param->bd); -} - void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_64X32, bd); } -void av1_highbd_inv_txfm_add_64x16_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const 
TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_64X16, txfm_param->eob, - txfm_param->bd); -} - void av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_64X16, bd); } -void av1_highbd_inv_txfm_add_16x64_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_16X64, txfm_param->eob, - txfm_param->bd); -} - void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X64, bd); } -void av1_highbd_inv_txfm_add_16x16_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_16X16, txfm_param->eob, - txfm_param->bd); -} - -void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input, uint16_t *dest, - int stride, TX_TYPE tx_type, const int bd) { +static void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input, + uint16_t *dest, int stride, + TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X16, bd); } -void av1_highbd_inv_txfm_add_32x8_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_32X8, txfm_param->eob, txfm_param->bd); -} - void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X8, bd); } -void av1_highbd_inv_txfm_add_8x32_neon(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, - TX_8X32, txfm_param->eob, txfm_param->bd); -} - void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X32, @@ -5917,7 +5674,7 @@ void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest, int bd = txfm_param->bd; switch (tx_size) { case TX_8X8: - av1_highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param); + highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param); break; case TX_4X8: av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride, @@ -5928,7 +5685,7 @@ void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest, txfm_param->tx_type, txfm_param->bd); break; case TX_4X4: - av1_highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param); + highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param); break; case TX_16X4: av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index a24d3a953f..4700098fe1 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -150,43 +150,6 @@ specialize qw/av1_inv_txfm_add ssse3 avx2 neon/; add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2 neon/; 
-add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_4x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_8x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_4x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_16x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_8x16 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x8 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_16x32/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x32 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_32x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x16 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x32 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_32x64/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x64 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_64x32/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_64x32 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_64x64/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_64x64 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_8x32/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x32 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_32x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x64 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_16x64/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_64x32 neon/; -add_proto qw/void av1_highbd_inv_txfm_add_64x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_64x64 neon/; - add_proto qw/void av1_inv_txfm2d_add_4x4/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd"; specialize qw/av1_inv_txfm2d_add_4x4 neon/; add_proto 
qw/void av1_inv_txfm2d_add_8x8/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd"; diff --git a/av1/common/idct.c b/av1/common/idct.c index bff438f3c6..d4b1c98831 100644 --- a/av1/common/idct.c +++ b/av1/common/idct.c @@ -39,8 +39,8 @@ void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, av1_highbd_iwht4x4_1_add(input, dest, stride, bd); } -void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); int eob = txfm_param->eob; int bd = txfm_param->bd; @@ -56,94 +56,104 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } -void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest, - 
int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); @@ -151,8 +161,9 @@ void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } -void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); @@ -161,22 +172,25 @@ void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, bd); } -void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest, + int stride, + const 
TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); @@ -185,8 +199,9 @@ void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, bd); } -void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +static void highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); @@ -215,64 +230,64 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { case TX_32X32: - av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param); break; case TX_16X16: - av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param); break; case TX_8X8: - av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); break; case TX_4X8: - av1_highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param); break; case TX_8X4: - av1_highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param); break; case TX_8X16: - av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); break; case TX_16X8: - av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); break; case TX_16X32: - av1_highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param); break; case TX_32X16: - av1_highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param); break; case TX_64X64: - av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); break; case TX_32X64: - av1_highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param); break; case TX_64X32: - av1_highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param); break; case TX_16X64: - av1_highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param); break; 
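/*
 * The hunks above turn each per-size av1_highbd_inv_txfm_add_NxM_c()
 * into an internal-linkage (static) helper, so av1_highbd_inv_txfm_add_c()
 * becomes the only public entry point and dispatches on TX_SIZE. A
 * minimal, self-contained sketch of that shape (illustrative only, with
 * hypothetical ex_* names rather than libaom's real signatures):
 */
#include <assert.h>
#include <stdio.h>

typedef enum { EX_TX_4X4, EX_TX_8X8 } ex_tx_size;

/* Internal helpers: static linkage makes them invisible outside this
   translation unit, matching the refactor above. */
static void ex_add_4x4(const int *input) { printf("4x4 add: %d\n", input[0]); }
static void ex_add_8x8(const int *input) { printf("8x8 add: %d\n", input[0]); }

/* Single public dispatcher, mirroring av1_highbd_inv_txfm_add_c(). */
void ex_inv_txfm_add(const int *input, ex_tx_size tx_size) {
  switch (tx_size) {
    case EX_TX_4X4: ex_add_4x4(input); break;
    case EX_TX_8X8: ex_add_8x8(input); break;
    default: assert(0 && "Invalid transform size"); break;
  }
}

int main(void) {
  const int coeffs[1] = { 7 };
  ex_inv_txfm_add(coeffs, EX_TX_8X8); /* prints "8x8 add: 7" */
  return 0;
}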
case TX_64X16: - av1_highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param); break; case TX_4X4: // this is like av1_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param); break; case TX_8X32: - av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param); break; case TX_32X8: - av1_highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param); + highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param); break; default: assert(0 && "Invalid transform size"); break; } diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c index cbfe5614c3..73e6911d0b 100644 --- a/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/av1/common/x86/highbd_inv_txfm_avx2.c @@ -4180,10 +4180,11 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, } } -void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, - uint8_t *output, int stride, - TX_TYPE tx_type, TX_SIZE tx_size, - int eob, const int bd) { +static void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { switch (tx_type) { case DCT_DCT: case ADST_DCT: @@ -4216,19 +4217,11 @@ void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { case TX_4X8: - av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); - break; case TX_8X4: - av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); - break; case TX_4X4: - av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); - break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); - break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_sse4_1(input, dest, stride, txfm_param); break; default: av1_highbd_inv_txfm2d_add_universe_avx2( diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c index 4ff6a90f95..c12022cff8 100644 --- a/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/av1/common/x86/highbd_inv_txfm_sse4.c @@ -5125,9 +5125,9 @@ static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, } } -void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { +static void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); @@ -5149,9 +5149,9 @@ void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, break; } } -void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { +static void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t 
*input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); int eob = txfm_param->eob; int bd = txfm_param->bd; @@ -5754,9 +5754,9 @@ void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, } } -void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { +static void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; @@ -5765,9 +5765,9 @@ void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, tx_type, tx_size, eob, bd); } -void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { +static void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; @@ -5776,9 +5776,9 @@ void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, tx_type, tx_size, eob, bd); } -void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { +static void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; @@ -5787,9 +5787,9 @@ void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, tx_type, tx_size, eob, bd); } -void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { +static void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; -- GitLab From b7c05bdcd536963338f093597e671a2669a36a8d Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 11 Jun 2024 19:15:10 -0700 Subject: [PATCH 218/391] *.{c,cc,h}: normalize AOMedia copyright Use a consistent copyright line and update instances of a WebM copyright body to contain the AOMedia one. 
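Concretely, the common case normalizes only the copyright line itself,
fixing casing and adding a trailing period. For example, from the
aom_convolve8_neon.c hunk below:

- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2014 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.

Headers that still carried the WebM license body (for example,
aom_convolve8_neon.h below) additionally have that body replaced with
the AOMedia BSD 2 Clause / Patent License 1.0 text.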
Bug: aomedia:3525 Change-Id: Icb2e5d228850a9ab81eb695d7abfcadeaca29481 --- aom/aom.h | 2 +- aom/aom_codec.h | 2 +- aom/aom_decoder.h | 2 +- aom/aom_encoder.h | 2 +- aom/aom_external_partition.h | 2 +- aom/aom_frame_buffer.h | 2 +- aom/aom_image.h | 2 +- aom/aom_integer.h | 2 +- aom/aomcx.h | 2 +- aom/aomdx.h | 2 +- aom/internal/aom_codec_internal.h | 2 +- aom/internal/aom_image_internal.h | 2 +- aom/src/aom_codec.c | 2 +- aom/src/aom_decoder.c | 2 +- aom/src/aom_encoder.c | 2 +- aom/src/aom_image.c | 2 +- aom/src/aom_integer.c | 2 +- aom_dsp/aom_convolve.c | 2 +- aom_dsp/aom_dsp_common.h | 2 +- aom_dsp/aom_dsp_rtcd.c | 2 +- aom_dsp/aom_filter.h | 2 +- aom_dsp/aom_simd.h | 2 +- aom_dsp/aom_simd_inline.h | 2 +- aom_dsp/arm/aom_convolve8_neon.c | 4 ++-- aom_dsp/arm/aom_convolve8_neon.h | 13 +++++++------ aom_dsp/arm/aom_convolve8_neon_dotprod.c | 4 ++-- aom_dsp/arm/aom_convolve8_neon_i8mm.c | 4 ++-- aom_dsp/arm/aom_convolve_copy_neon.c | 13 +++++++------ aom_dsp/arm/aom_filter.h | 2 +- aom_dsp/arm/aom_neon_sve2_bridge.h | 13 +++++++------ aom_dsp/arm/aom_neon_sve_bridge.h | 13 +++++++------ aom_dsp/arm/avg_neon.c | 13 +++++++------ aom_dsp/arm/avg_pred_neon.c | 2 +- aom_dsp/arm/avg_sve.c | 13 +++++++------ aom_dsp/arm/blend_a64_mask_neon.c | 2 +- aom_dsp/arm/blend_neon.h | 2 +- aom_dsp/arm/blk_sse_sum_neon.c | 2 +- aom_dsp/arm/blk_sse_sum_sve.c | 2 +- aom_dsp/arm/dist_wtd_avg_neon.h | 13 +++++++------ aom_dsp/arm/fwd_txfm_neon.c | 2 +- aom_dsp/arm/hadamard_neon.c | 13 +++++++------ aom_dsp/arm/highbd_avg_neon.c | 4 ++-- aom_dsp/arm/highbd_avg_pred_neon.c | 4 ++-- aom_dsp/arm/highbd_blend_a64_hmask_neon.c | 2 +- aom_dsp/arm/highbd_blend_a64_mask_neon.c | 2 +- aom_dsp/arm/highbd_blend_a64_vmask_neon.c | 2 +- aom_dsp/arm/highbd_convolve8_neon.c | 4 ++-- aom_dsp/arm/highbd_convolve8_neon.h | 13 +++++++------ aom_dsp/arm/highbd_convolve8_sve.c | 2 +- aom_dsp/arm/highbd_hadamard_neon.c | 4 ++-- aom_dsp/arm/highbd_intrapred_neon.c | 2 +- aom_dsp/arm/highbd_loopfilter_neon.c | 2 +- aom_dsp/arm/highbd_masked_sad_neon.c | 2 +- aom_dsp/arm/highbd_obmc_sad_neon.c | 2 +- aom_dsp/arm/highbd_obmc_variance_neon.c | 2 +- aom_dsp/arm/highbd_quantize_neon.c | 2 +- aom_dsp/arm/highbd_sad_neon.c | 4 ++-- aom_dsp/arm/highbd_sadxd_neon.c | 4 ++-- aom_dsp/arm/highbd_sse_neon.c | 13 +++++++------ aom_dsp/arm/highbd_sse_sve.c | 13 +++++++------ aom_dsp/arm/highbd_subpel_variance_neon.c | 4 ++-- aom_dsp/arm/highbd_variance_neon.c | 4 ++-- aom_dsp/arm/highbd_variance_neon_dotprod.c | 4 ++-- aom_dsp/arm/highbd_variance_sve.c | 2 +- aom_dsp/arm/intrapred_neon.c | 2 +- aom_dsp/arm/loopfilter_neon.c | 2 +- aom_dsp/arm/masked_sad4d_neon.c | 2 +- aom_dsp/arm/masked_sad_neon.c | 2 +- aom_dsp/arm/mem_neon.h | 13 +++++++------ aom_dsp/arm/obmc_sad_neon.c | 2 +- aom_dsp/arm/obmc_variance_neon.c | 2 +- aom_dsp/arm/reinterpret_neon.h | 13 +++++++------ aom_dsp/arm/sad_neon.c | 2 +- aom_dsp/arm/sad_neon_dotprod.c | 2 +- aom_dsp/arm/sadxd_neon.c | 2 +- aom_dsp/arm/sadxd_neon_dotprod.c | 2 +- aom_dsp/arm/sse_neon.c | 13 +++++++------ aom_dsp/arm/sse_neon_dotprod.c | 13 +++++++------ aom_dsp/arm/subpel_variance_neon.c | 2 +- aom_dsp/arm/subtract_neon.c | 2 +- aom_dsp/arm/sum_neon.h | 13 +++++++------ aom_dsp/arm/sum_squares_neon.c | 2 +- aom_dsp/arm/sum_squares_neon_dotprod.c | 2 +- aom_dsp/arm/sum_squares_sve.c | 2 +- aom_dsp/arm/transpose_neon.h | 13 +++++++------ aom_dsp/arm/variance_neon.c | 2 +- aom_dsp/arm/variance_neon_dotprod.c | 2 +- aom_dsp/avg.c | 2 +- aom_dsp/binary_codes_reader.c | 2 +- aom_dsp/binary_codes_reader.h | 2 +- 
aom_dsp/binary_codes_writer.c | 2 +- aom_dsp/binary_codes_writer.h | 2 +- aom_dsp/bitreader.c | 2 +- aom_dsp/bitreader.h | 2 +- aom_dsp/bitreader_buffer.c | 2 +- aom_dsp/bitreader_buffer.h | 2 +- aom_dsp/bitwriter.c | 2 +- aom_dsp/bitwriter.h | 2 +- aom_dsp/bitwriter_buffer.c | 2 +- aom_dsp/bitwriter_buffer.h | 2 +- aom_dsp/blend.h | 2 +- aom_dsp/blend_a64_hmask.c | 2 +- aom_dsp/blend_a64_mask.c | 2 +- aom_dsp/blend_a64_vmask.c | 2 +- aom_dsp/blk_sse_sum.c | 2 +- aom_dsp/butteraugli.c | 2 +- aom_dsp/butteraugli.h | 2 +- aom_dsp/entcode.c | 2 +- aom_dsp/entcode.h | 2 +- aom_dsp/entdec.c | 2 +- aom_dsp/entdec.h | 2 +- aom_dsp/entenc.c | 2 +- aom_dsp/entenc.h | 2 +- aom_dsp/fastssim.c | 2 +- aom_dsp/fft.c | 2 +- aom_dsp/fft_common.h | 2 +- aom_dsp/flow_estimation/arm/disflow_neon.c | 2 +- aom_dsp/flow_estimation/arm/disflow_neon.h | 2 +- aom_dsp/flow_estimation/arm/disflow_sve.c | 2 +- aom_dsp/flow_estimation/corner_detect.c | 2 +- aom_dsp/flow_estimation/corner_detect.h | 2 +- aom_dsp/flow_estimation/corner_match.c | 2 +- aom_dsp/flow_estimation/corner_match.h | 2 +- aom_dsp/flow_estimation/disflow.c | 2 +- aom_dsp/flow_estimation/disflow.h | 2 +- aom_dsp/flow_estimation/flow_estimation.c | 2 +- aom_dsp/flow_estimation/flow_estimation.h | 2 +- aom_dsp/flow_estimation/ransac.c | 2 +- aom_dsp/flow_estimation/ransac.h | 2 +- aom_dsp/flow_estimation/x86/corner_match_avx2.c | 2 +- aom_dsp/flow_estimation/x86/corner_match_sse4.c | 2 +- aom_dsp/flow_estimation/x86/disflow_avx2.c | 2 +- aom_dsp/flow_estimation/x86/disflow_sse4.c | 2 +- aom_dsp/fwd_txfm.c | 2 +- aom_dsp/grain_params.h | 2 +- aom_dsp/grain_table.c | 2 +- aom_dsp/grain_table.h | 2 +- aom_dsp/intrapred.c | 2 +- aom_dsp/intrapred_common.h | 2 +- aom_dsp/loopfilter.c | 2 +- aom_dsp/mathutils.h | 2 +- aom_dsp/noise_model.c | 2 +- aom_dsp/noise_model.h | 2 +- aom_dsp/noise_util.c | 2 +- aom_dsp/noise_util.h | 2 +- aom_dsp/odintrin.c | 2 +- aom_dsp/odintrin.h | 2 +- aom_dsp/prob.h | 2 +- aom_dsp/psnr.c | 2 +- aom_dsp/psnr.h | 2 +- aom_dsp/psnrhvs.c | 2 +- aom_dsp/pyramid.c | 2 +- aom_dsp/pyramid.h | 2 +- aom_dsp/quantize.c | 2 +- aom_dsp/quantize.h | 2 +- aom_dsp/recenter.h | 2 +- aom_dsp/sad.c | 2 +- aom_dsp/sad_av1.c | 2 +- aom_dsp/simd/v128_intrinsics.h | 2 +- aom_dsp/simd/v128_intrinsics_c.h | 2 +- aom_dsp/simd/v128_intrinsics_x86.h | 2 +- aom_dsp/simd/v256_intrinsics.h | 2 +- aom_dsp/simd/v256_intrinsics_c.h | 2 +- aom_dsp/simd/v256_intrinsics_v128.h | 2 +- aom_dsp/simd/v256_intrinsics_x86.h | 2 +- aom_dsp/simd/v64_intrinsics.h | 2 +- aom_dsp/simd/v64_intrinsics_c.h | 2 +- aom_dsp/simd/v64_intrinsics_x86.h | 2 +- aom_dsp/sse.c | 2 +- aom_dsp/ssim.c | 2 +- aom_dsp/ssim.h | 2 +- aom_dsp/subtract.c | 2 +- aom_dsp/sum_squares.c | 2 +- aom_dsp/txfm_common.h | 2 +- aom_dsp/variance.c | 2 +- aom_dsp/variance.h | 2 +- aom_dsp/vmaf.c | 2 +- aom_dsp/vmaf.h | 2 +- aom_dsp/x86/adaptive_quantize_avx2.c | 2 +- aom_dsp/x86/adaptive_quantize_sse2.c | 2 +- aom_dsp/x86/aom_convolve_copy_avx2.c | 13 +++++++------ aom_dsp/x86/aom_convolve_copy_sse2.c | 13 +++++++------ aom_dsp/x86/aom_quantize_avx.c | 2 +- aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 2 +- aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 2 +- aom_dsp/x86/avg_intrin_avx2.c | 2 +- aom_dsp/x86/avg_intrin_sse2.c | 2 +- aom_dsp/x86/avg_intrin_sse4.c | 2 +- aom_dsp/x86/bitdepth_conversion_avx2.h | 2 +- aom_dsp/x86/bitdepth_conversion_sse2.h | 2 +- aom_dsp/x86/blend_a64_hmask_sse4.c | 2 +- aom_dsp/x86/blend_a64_mask_avx2.c | 2 +- aom_dsp/x86/blend_a64_mask_sse4.c | 2 +- 
aom_dsp/x86/blend_a64_vmask_sse4.c | 2 +- aom_dsp/x86/blend_mask_sse4.h | 2 +- aom_dsp/x86/blend_sse4.h | 2 +- aom_dsp/x86/blk_sse_sum_avx2.c | 2 +- aom_dsp/x86/blk_sse_sum_sse2.c | 2 +- aom_dsp/x86/common_avx2.h | 2 +- aom_dsp/x86/convolve.h | 2 +- aom_dsp/x86/convolve_avx2.h | 2 +- aom_dsp/x86/convolve_common_intrin.h | 2 +- aom_dsp/x86/convolve_sse2.h | 2 +- aom_dsp/x86/convolve_sse4_1.h | 2 +- aom_dsp/x86/convolve_ssse3.h | 2 +- aom_dsp/x86/fft_avx2.c | 2 +- aom_dsp/x86/fft_sse2.c | 2 +- aom_dsp/x86/fwd_txfm_impl_sse2.h | 2 +- aom_dsp/x86/fwd_txfm_sse2.c | 2 +- aom_dsp/x86/fwd_txfm_sse2.h | 2 +- aom_dsp/x86/highbd_adaptive_quantize_avx2.c | 2 +- aom_dsp/x86/highbd_adaptive_quantize_sse2.c | 2 +- aom_dsp/x86/highbd_convolve_avx2.c | 2 +- aom_dsp/x86/highbd_convolve_sse2.c | 2 +- aom_dsp/x86/highbd_convolve_ssse3.c | 2 +- aom_dsp/x86/highbd_intrapred_sse2.c | 2 +- aom_dsp/x86/highbd_loopfilter_avx2.c | 2 +- aom_dsp/x86/highbd_loopfilter_sse2.c | 2 +- aom_dsp/x86/highbd_quantize_intrin_avx2.c | 2 +- aom_dsp/x86/highbd_quantize_intrin_sse2.c | 2 +- aom_dsp/x86/highbd_sad_avx2.c | 2 +- aom_dsp/x86/highbd_subtract_sse2.c | 2 +- aom_dsp/x86/highbd_variance_avx2.c | 2 +- aom_dsp/x86/highbd_variance_sse2.c | 2 +- aom_dsp/x86/highbd_variance_sse4.c | 2 +- aom_dsp/x86/intrapred_avx2.c | 2 +- aom_dsp/x86/intrapred_sse2.c | 2 +- aom_dsp/x86/intrapred_sse4.c | 2 +- aom_dsp/x86/intrapred_ssse3.c | 2 +- aom_dsp/x86/intrapred_utils.h | 2 +- aom_dsp/x86/intrapred_x86.h | 2 +- aom_dsp/x86/jnt_sad_sse2.c | 2 +- aom_dsp/x86/jnt_variance_ssse3.c | 2 +- aom_dsp/x86/loopfilter_avx2.c | 2 +- aom_dsp/x86/loopfilter_sse2.c | 2 +- aom_dsp/x86/lpf_common_sse2.h | 2 +- aom_dsp/x86/masked_sad4d_ssse3.c | 2 +- aom_dsp/x86/masked_sad_intrin_avx2.c | 2 +- aom_dsp/x86/masked_sad_intrin_ssse3.c | 2 +- aom_dsp/x86/masked_sad_intrin_ssse3.h | 2 +- aom_dsp/x86/masked_variance_intrin_ssse3.c | 2 +- aom_dsp/x86/masked_variance_intrin_ssse3.h | 2 +- aom_dsp/x86/mem_sse2.h | 2 +- aom_dsp/x86/obmc_intrinsic_sse4.h | 2 +- aom_dsp/x86/obmc_intrinsic_ssse3.h | 2 +- aom_dsp/x86/obmc_sad_avx2.c | 2 +- aom_dsp/x86/obmc_sad_sse4.c | 2 +- aom_dsp/x86/obmc_variance_avx2.c | 2 +- aom_dsp/x86/obmc_variance_sse4.c | 2 +- aom_dsp/x86/quantize_avx2.c | 2 +- aom_dsp/x86/quantize_sse2.c | 2 +- aom_dsp/x86/quantize_ssse3.c | 2 +- aom_dsp/x86/quantize_x86.h | 2 +- aom_dsp/x86/sad4d_avx2.c | 2 +- aom_dsp/x86/sad_avx2.c | 2 +- aom_dsp/x86/sad_impl_avx2.c | 2 +- aom_dsp/x86/sse_avx2.c | 2 +- aom_dsp/x86/sse_sse4.c | 2 +- aom_dsp/x86/subtract_avx2.c | 2 +- aom_dsp/x86/sum_squares_avx2.c | 2 +- aom_dsp/x86/sum_squares_sse2.c | 2 +- aom_dsp/x86/sum_squares_sse2.h | 2 +- aom_dsp/x86/synonyms.h | 2 +- aom_dsp/x86/synonyms_avx2.h | 2 +- aom_dsp/x86/transpose_sse2.h | 2 +- aom_dsp/x86/txfm_common_avx2.h | 2 +- aom_dsp/x86/txfm_common_sse2.h | 2 +- aom_dsp/x86/variance_avx2.c | 2 +- aom_dsp/x86/variance_impl_avx2.c | 2 +- aom_dsp/x86/variance_impl_ssse3.c | 2 +- aom_dsp/x86/variance_impl_ssse3.h | 2 +- aom_dsp/x86/variance_sse2.c | 2 +- aom_dsp/x86/variance_ssse3.c | 2 +- aom_mem/aom_mem.c | 2 +- aom_mem/aom_mem.h | 2 +- aom_mem/include/aom_mem_intrnl.h | 2 +- aom_ports/aarch32_cpudetect.c | 2 +- aom_ports/aarch64_cpudetect.c | 2 +- aom_ports/aom_once.h | 2 +- aom_ports/aom_timer.h | 2 +- aom_ports/arm.h | 2 +- aom_ports/arm_cpudetect.h | 2 +- aom_ports/bitops.h | 2 +- aom_ports/emmintrin_compat.h | 2 +- aom_ports/mem.h | 2 +- aom_ports/mem_ops.h | 2 +- aom_ports/mem_ops_aligned.h | 2 +- aom_ports/ppc.h | 2 +- aom_ports/ppc_cpudetect.c | 2 +- 
aom_ports/sanitizer.h | 2 +- aom_ports/x86.h | 2 +- aom_scale/aom_scale.h | 2 +- aom_scale/aom_scale_rtcd.c | 2 +- aom_scale/generic/aom_scale.c | 2 +- aom_scale/generic/gen_scalers.c | 2 +- aom_scale/generic/yv12config.c | 2 +- aom_scale/generic/yv12extend.c | 2 +- aom_scale/yv12config.h | 2 +- aom_util/aom_pthread.h | 2 +- aom_util/aom_thread.c | 2 +- aom_util/aom_thread.h | 2 +- aom_util/debug_util.c | 2 +- aom_util/debug_util.h | 2 +- aom_util/endian_inl.h | 2 +- apps/aomdec.c | 2 +- apps/aomenc.c | 2 +- apps/aomenc.h | 2 +- av1/arg_defs.c | 2 +- av1/arg_defs.h | 2 +- av1/av1_cx_iface.c | 2 +- av1/av1_cx_iface.h | 2 +- av1/av1_dx_iface.c | 2 +- av1/av1_iface_common.h | 2 +- av1/common/alloccommon.c | 2 +- av1/common/alloccommon.h | 2 +- av1/common/arm/av1_convolve_scale_neon.c | 2 +- av1/common/arm/av1_convolve_scale_neon_dotprod.c | 2 +- av1/common/arm/av1_convolve_scale_neon_i8mm.c | 2 +- av1/common/arm/av1_inv_txfm_neon.c | 2 +- av1/common/arm/av1_inv_txfm_neon.h | 2 +- av1/common/arm/av1_txfm_neon.c | 2 +- av1/common/arm/blend_a64_hmask_neon.c | 2 +- av1/common/arm/blend_a64_vmask_neon.c | 2 +- av1/common/arm/cdef_block_neon.c | 2 +- av1/common/arm/cfl_neon.c | 2 +- av1/common/arm/compound_convolve_neon.c | 2 +- av1/common/arm/compound_convolve_neon.h | 2 +- av1/common/arm/compound_convolve_neon_dotprod.c | 2 +- av1/common/arm/compound_convolve_neon_i8mm.c | 2 +- av1/common/arm/convolve_neon.c | 2 +- av1/common/arm/convolve_neon.h | 13 +++++++------ av1/common/arm/convolve_neon_dotprod.c | 2 +- av1/common/arm/convolve_neon_i8mm.c | 2 +- av1/common/arm/convolve_neon_i8mm.h | 2 +- av1/common/arm/convolve_scale_neon.h | 13 +++++++------ av1/common/arm/convolve_sve2.c | 2 +- av1/common/arm/highbd_compound_convolve_neon.c | 2 +- av1/common/arm/highbd_compound_convolve_neon.h | 2 +- av1/common/arm/highbd_compound_convolve_sve2.c | 2 +- av1/common/arm/highbd_convolve_horiz_rs_neon.c | 2 +- av1/common/arm/highbd_convolve_neon.c | 2 +- av1/common/arm/highbd_convolve_neon.h | 2 +- av1/common/arm/highbd_convolve_scale_neon.c | 2 +- av1/common/arm/highbd_convolve_sve2.c | 2 +- av1/common/arm/highbd_convolve_sve2.h | 2 +- av1/common/arm/highbd_inv_txfm_neon.c | 2 +- av1/common/arm/highbd_reconinter_neon.c | 2 +- av1/common/arm/highbd_reconintra_neon.c | 2 +- av1/common/arm/highbd_warp_plane_neon.c | 2 +- av1/common/arm/highbd_warp_plane_neon.h | 2 +- av1/common/arm/highbd_warp_plane_sve.c | 2 +- av1/common/arm/highbd_wiener_convolve_neon.c | 2 +- av1/common/arm/reconinter_neon.c | 2 +- av1/common/arm/reconintra_neon.c | 2 +- av1/common/arm/resize_neon.c | 2 +- av1/common/arm/selfguided_neon.c | 2 +- av1/common/arm/warp_plane_neon.c | 2 +- av1/common/arm/warp_plane_neon.h | 2 +- av1/common/arm/warp_plane_neon_i8mm.c | 2 +- av1/common/arm/warp_plane_sve.c | 2 +- av1/common/arm/wiener_convolve_neon.c | 2 +- av1/common/av1_common_int.h | 2 +- av1/common/av1_inv_txfm1d.c | 2 +- av1/common/av1_inv_txfm1d.h | 2 +- av1/common/av1_inv_txfm1d_cfg.h | 2 +- av1/common/av1_inv_txfm2d.c | 2 +- av1/common/av1_loopfilter.c | 2 +- av1/common/av1_loopfilter.h | 2 +- av1/common/av1_rtcd.c | 2 +- av1/common/av1_txfm.c | 2 +- av1/common/av1_txfm.h | 2 +- av1/common/blockd.c | 2 +- av1/common/blockd.h | 2 +- av1/common/cdef.c | 2 +- av1/common/cdef.h | 2 +- av1/common/cdef_block.c | 2 +- av1/common/cdef_block.h | 2 +- av1/common/cdef_block_simd.h | 2 +- av1/common/cfl.c | 2 +- av1/common/cfl.h | 2 +- av1/common/common.h | 2 +- av1/common/common_data.c | 2 +- av1/common/common_data.h | 2 +- av1/common/convolve.c | 2 
+- av1/common/convolve.h | 2 +- av1/common/debugmodes.c | 2 +- av1/common/debugmodes.h | 2 +- av1/common/entropy.c | 2 +- av1/common/entropy.h | 2 +- av1/common/entropymode.c | 2 +- av1/common/entropymode.h | 2 +- av1/common/entropymv.c | 2 +- av1/common/entropymv.h | 2 +- av1/common/enums.h | 2 +- av1/common/filter.h | 2 +- av1/common/frame_buffers.c | 2 +- av1/common/frame_buffers.h | 2 +- av1/common/idct.c | 2 +- av1/common/idct.h | 2 +- av1/common/mv.h | 2 +- av1/common/mvref_common.c | 2 +- av1/common/mvref_common.h | 2 +- av1/common/obmc.h | 2 +- av1/common/obu_util.c | 2 +- av1/common/obu_util.h | 2 +- av1/common/ppc/cfl_ppc.c | 2 +- av1/common/pred_common.c | 2 +- av1/common/pred_common.h | 2 +- av1/common/quant_common.c | 2 +- av1/common/quant_common.h | 2 +- av1/common/reconinter.c | 2 +- av1/common/reconinter.h | 2 +- av1/common/reconintra.c | 2 +- av1/common/reconintra.h | 2 +- av1/common/resize.c | 2 +- av1/common/resize.h | 2 +- av1/common/restoration.c | 2 +- av1/common/restoration.h | 2 +- av1/common/scale.c | 2 +- av1/common/scale.h | 2 +- av1/common/scan.c | 2 +- av1/common/scan.h | 2 +- av1/common/seg_common.c | 2 +- av1/common/seg_common.h | 2 +- av1/common/thread_common.c | 2 +- av1/common/thread_common.h | 2 +- av1/common/tile_common.c | 2 +- av1/common/tile_common.h | 2 +- av1/common/timing.c | 2 +- av1/common/timing.h | 2 +- av1/common/token_cdfs.h | 2 +- av1/common/txb_common.c | 2 +- av1/common/txb_common.h | 2 +- av1/common/warped_motion.c | 2 +- av1/common/warped_motion.h | 2 +- av1/common/x86/av1_convolve_horiz_rs_sse4.c | 2 +- av1/common/x86/av1_convolve_scale_sse4.c | 2 +- av1/common/x86/av1_inv_txfm_avx2.c | 2 +- av1/common/x86/av1_inv_txfm_avx2.h | 2 +- av1/common/x86/av1_inv_txfm_ssse3.c | 2 +- av1/common/x86/av1_inv_txfm_ssse3.h | 2 +- av1/common/x86/av1_txfm_sse2.h | 2 +- av1/common/x86/av1_txfm_sse4.c | 2 +- av1/common/x86/av1_txfm_sse4.h | 2 +- av1/common/x86/cdef_block_avx2.c | 2 +- av1/common/x86/cdef_block_sse4.c | 2 +- av1/common/x86/cdef_block_ssse3.c | 2 +- av1/common/x86/cfl_avx2.c | 2 +- av1/common/x86/cfl_simd.h | 2 +- av1/common/x86/cfl_sse2.c | 2 +- av1/common/x86/cfl_ssse3.c | 2 +- av1/common/x86/convolve_2d_avx2.c | 2 +- av1/common/x86/convolve_2d_sse2.c | 2 +- av1/common/x86/convolve_avx2.c | 2 +- av1/common/x86/convolve_sse2.c | 2 +- av1/common/x86/filterintra_sse4.c | 2 +- av1/common/x86/highbd_convolve_2d_avx2.c | 2 +- av1/common/x86/highbd_convolve_2d_sse4.c | 2 +- av1/common/x86/highbd_convolve_2d_ssse3.c | 2 +- av1/common/x86/highbd_inv_txfm_avx2.c | 2 +- av1/common/x86/highbd_inv_txfm_sse4.c | 2 +- av1/common/x86/highbd_jnt_convolve_avx2.c | 2 +- av1/common/x86/highbd_jnt_convolve_sse4.c | 2 +- av1/common/x86/highbd_txfm_utility_sse4.h | 2 +- av1/common/x86/highbd_warp_affine_avx2.c | 2 +- av1/common/x86/highbd_warp_plane_sse4.c | 2 +- av1/common/x86/highbd_wiener_convolve_avx2.c | 2 +- av1/common/x86/highbd_wiener_convolve_ssse3.c | 2 +- av1/common/x86/intra_edge_sse4.c | 2 +- av1/common/x86/jnt_convolve_avx2.c | 2 +- av1/common/x86/jnt_convolve_sse2.c | 2 +- av1/common/x86/jnt_convolve_ssse3.c | 2 +- av1/common/x86/reconinter_avx2.c | 2 +- av1/common/x86/reconinter_sse4.c | 2 +- av1/common/x86/reconinter_ssse3.c | 2 +- av1/common/x86/resize_avx2.c | 2 +- av1/common/x86/resize_sse2.c | 2 +- av1/common/x86/resize_ssse3.c | 2 +- av1/common/x86/selfguided_avx2.c | 2 +- av1/common/x86/selfguided_sse4.c | 2 +- av1/common/x86/warp_plane_avx2.c | 2 +- av1/common/x86/warp_plane_sse4.c | 2 +- av1/common/x86/wiener_convolve_avx2.c | 2 +- 
av1/common/x86/wiener_convolve_sse2.c | 2 +- av1/decoder/accounting.c | 2 +- av1/decoder/accounting.h | 2 +- av1/decoder/decodeframe.c | 2 +- av1/decoder/decodeframe.h | 2 +- av1/decoder/decodemv.c | 2 +- av1/decoder/decodemv.h | 2 +- av1/decoder/decoder.c | 2 +- av1/decoder/decoder.h | 2 +- av1/decoder/decodetxb.c | 2 +- av1/decoder/decodetxb.h | 2 +- av1/decoder/detokenize.c | 2 +- av1/decoder/detokenize.h | 2 +- av1/decoder/dthread.h | 2 +- av1/decoder/grain_synthesis.c | 2 +- av1/decoder/grain_synthesis.h | 2 +- av1/decoder/inspection.c | 2 +- av1/decoder/inspection.h | 2 +- av1/decoder/obu.c | 2 +- av1/decoder/obu.h | 2 +- av1/encoder/allintra_vis.c | 2 +- av1/encoder/allintra_vis.h | 2 +- av1/encoder/aq_complexity.c | 2 +- av1/encoder/aq_complexity.h | 2 +- av1/encoder/aq_cyclicrefresh.c | 2 +- av1/encoder/aq_cyclicrefresh.h | 2 +- av1/encoder/aq_variance.c | 2 +- av1/encoder/aq_variance.h | 2 +- av1/encoder/arm/av1_error_neon.c | 2 +- av1/encoder/arm/av1_error_sve.c | 2 +- av1/encoder/arm/av1_fwd_txfm2d_neon.c | 2 +- av1/encoder/arm/av1_highbd_quantize_neon.c | 2 +- av1/encoder/arm/av1_k_means_neon.c | 13 +++++++------ av1/encoder/arm/av1_temporal_denoiser_neon.c | 2 +- av1/encoder/arm/cnn_neon.c | 2 +- av1/encoder/arm/encodetxb_neon.c | 2 +- av1/encoder/arm/hash_arm_crc32.c | 2 +- av1/encoder/arm/highbd_fwd_txfm_neon.c | 2 +- av1/encoder/arm/highbd_pickrst_neon.c | 2 +- av1/encoder/arm/highbd_pickrst_sve.c | 2 +- av1/encoder/arm/highbd_rdopt_neon.c | 2 +- av1/encoder/arm/highbd_temporal_filter_neon.c | 2 +- av1/encoder/arm/hybrid_fwd_txfm_neon.c | 2 +- av1/encoder/arm/ml_neon.c | 2 +- av1/encoder/arm/pickrst_neon.c | 2 +- av1/encoder/arm/pickrst_neon.h | 2 +- av1/encoder/arm/pickrst_sve.c | 2 +- av1/encoder/arm/pickrst_sve.h | 2 +- av1/encoder/arm/quantize_neon.c | 2 +- av1/encoder/arm/rdopt_neon.c | 2 +- av1/encoder/arm/reconinter_enc_neon.c | 2 +- av1/encoder/arm/shift_neon.h | 2 +- av1/encoder/arm/temporal_filter_neon.c | 2 +- av1/encoder/arm/temporal_filter_neon_dotprod.c | 2 +- av1/encoder/arm/txfm_neon.h | 2 +- av1/encoder/arm/wedge_utils_neon.c | 2 +- av1/encoder/arm/wedge_utils_sve.c | 2 +- av1/encoder/av1_fwd_txfm1d.c | 2 +- av1/encoder/av1_fwd_txfm1d.h | 2 +- av1/encoder/av1_fwd_txfm1d_cfg.h | 2 +- av1/encoder/av1_fwd_txfm2d.c | 2 +- av1/encoder/av1_ml_partition_models.h | 2 +- av1/encoder/av1_noise_estimate.c | 2 +- av1/encoder/av1_noise_estimate.h | 2 +- av1/encoder/av1_quantize.c | 2 +- av1/encoder/av1_quantize.h | 2 +- av1/encoder/av1_temporal_denoiser.c | 2 +- av1/encoder/av1_temporal_denoiser.h | 2 +- av1/encoder/bitstream.c | 2 +- av1/encoder/bitstream.h | 2 +- av1/encoder/block.h | 2 +- av1/encoder/blockiness.c | 2 +- av1/encoder/cnn.c | 2 +- av1/encoder/cnn.h | 2 +- av1/encoder/compound_type.c | 2 +- av1/encoder/compound_type.h | 2 +- av1/encoder/context_tree.c | 2 +- av1/encoder/context_tree.h | 2 +- av1/encoder/cost.c | 2 +- av1/encoder/cost.h | 2 +- av1/encoder/dwt.c | 2 +- av1/encoder/dwt.h | 2 +- av1/encoder/enc_enums.h | 2 +- av1/encoder/encode_strategy.c | 2 +- av1/encoder/encode_strategy.h | 2 +- av1/encoder/encodeframe.c | 2 +- av1/encoder/encodeframe.h | 2 +- av1/encoder/encodeframe_utils.c | 2 +- av1/encoder/encodeframe_utils.h | 2 +- av1/encoder/encodemb.c | 2 +- av1/encoder/encodemb.h | 2 +- av1/encoder/encodemv.c | 2 +- av1/encoder/encodemv.h | 2 +- av1/encoder/encoder.c | 2 +- av1/encoder/encoder.h | 2 +- av1/encoder/encoder_alloc.h | 2 +- av1/encoder/encoder_utils.c | 2 +- av1/encoder/encoder_utils.h | 2 +- av1/encoder/encodetxb.c | 2 +- 
av1/encoder/encodetxb.h | 2 +- av1/encoder/ethread.c | 2 +- av1/encoder/ethread.h | 2 +- av1/encoder/extend.c | 2 +- av1/encoder/extend.h | 2 +- av1/encoder/external_partition.c | 2 +- av1/encoder/external_partition.h | 2 +- av1/encoder/firstpass.c | 2 +- av1/encoder/firstpass.h | 2 +- av1/encoder/global_motion.c | 2 +- av1/encoder/global_motion.h | 2 +- av1/encoder/global_motion_facade.c | 2 +- av1/encoder/global_motion_facade.h | 2 +- av1/encoder/gop_structure.c | 2 +- av1/encoder/gop_structure.h | 2 +- av1/encoder/grain_test_vectors.h | 2 +- av1/encoder/hash.c | 2 +- av1/encoder/hash.h | 2 +- av1/encoder/hash_motion.c | 2 +- av1/encoder/hash_motion.h | 2 +- av1/encoder/hybrid_fwd_txfm.c | 2 +- av1/encoder/hybrid_fwd_txfm.h | 2 +- av1/encoder/interp_search.c | 2 +- av1/encoder/interp_search.h | 2 +- av1/encoder/intra_mode_search.c | 2 +- av1/encoder/intra_mode_search.h | 2 +- av1/encoder/intra_mode_search_utils.h | 2 +- av1/encoder/k_means_template.h | 2 +- av1/encoder/level.c | 2 +- av1/encoder/level.h | 2 +- av1/encoder/lookahead.c | 2 +- av1/encoder/lookahead.h | 2 +- av1/encoder/mcomp.c | 2 +- av1/encoder/mcomp.h | 2 +- av1/encoder/mcomp_structs.h | 2 +- av1/encoder/misc_model_weights.h | 2 +- av1/encoder/ml.c | 2 +- av1/encoder/ml.h | 2 +- av1/encoder/mode_prune_model_weights.h | 2 +- av1/encoder/model_rd.h | 2 +- av1/encoder/motion_search_facade.c | 2 +- av1/encoder/motion_search_facade.h | 2 +- av1/encoder/mv_prec.c | 2 +- av1/encoder/mv_prec.h | 2 +- av1/encoder/nonrd_opt.c | 2 +- av1/encoder/nonrd_opt.h | 2 +- av1/encoder/nonrd_pickmode.c | 2 +- av1/encoder/optical_flow.c | 2 +- av1/encoder/optical_flow.h | 2 +- av1/encoder/palette.c | 2 +- av1/encoder/palette.h | 2 +- av1/encoder/partition_cnn_weights.h | 2 +- av1/encoder/partition_model_weights.h | 2 +- av1/encoder/partition_search.c | 2 +- av1/encoder/partition_search.h | 2 +- av1/encoder/partition_strategy.c | 2 +- av1/encoder/partition_strategy.h | 2 +- av1/encoder/pass2_strategy.c | 2 +- av1/encoder/pass2_strategy.h | 2 +- av1/encoder/pickcdef.c | 2 +- av1/encoder/pickcdef.h | 2 +- av1/encoder/picklpf.c | 2 +- av1/encoder/picklpf.h | 2 +- av1/encoder/pickrst.c | 2 +- av1/encoder/pickrst.h | 2 +- av1/encoder/pustats.h | 2 +- av1/encoder/random.h | 2 +- av1/encoder/ratectrl.c | 2 +- av1/encoder/ratectrl.h | 2 +- av1/encoder/rc_utils.h | 2 +- av1/encoder/rd.c | 2 +- av1/encoder/rd.h | 2 +- av1/encoder/rdopt.c | 2 +- av1/encoder/rdopt.h | 2 +- av1/encoder/rdopt_data_defs.h | 2 +- av1/encoder/rdopt_utils.h | 2 +- av1/encoder/reconinter_enc.c | 2 +- av1/encoder/reconinter_enc.h | 2 +- av1/encoder/saliency_map.c | 2 +- av1/encoder/saliency_map.h | 2 +- av1/encoder/segmentation.c | 2 +- av1/encoder/segmentation.h | 2 +- av1/encoder/sorting_network.h | 2 +- av1/encoder/sparse_linear_solver.c | 2 +- av1/encoder/sparse_linear_solver.h | 2 +- av1/encoder/speed_features.c | 2 +- av1/encoder/speed_features.h | 2 +- av1/encoder/superres_scale.c | 2 +- av1/encoder/superres_scale.h | 2 +- av1/encoder/svc_layercontext.c | 13 +++++++------ av1/encoder/svc_layercontext.h | 15 ++++++++------- av1/encoder/temporal_filter.c | 2 +- av1/encoder/temporal_filter.h | 2 +- av1/encoder/thirdpass.c | 2 +- av1/encoder/thirdpass.h | 2 +- av1/encoder/tokenize.c | 2 +- av1/encoder/tokenize.h | 2 +- av1/encoder/tpl_model.c | 2 +- av1/encoder/tpl_model.h | 2 +- av1/encoder/tune_butteraugli.c | 2 +- av1/encoder/tune_butteraugli.h | 2 +- av1/encoder/tune_vmaf.c | 2 +- av1/encoder/tune_vmaf.h | 2 +- av1/encoder/tx_prune_model_weights.h | 2 +- 
av1/encoder/tx_search.c | 2 +- av1/encoder/tx_search.h | 2 +- av1/encoder/txb_rdopt.c | 2 +- av1/encoder/txb_rdopt.h | 2 +- av1/encoder/txb_rdopt_utils.h | 2 +- av1/encoder/var_based_part.c | 2 +- av1/encoder/var_based_part.h | 2 +- av1/encoder/wedge_utils.c | 2 +- av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 2 +- av1/encoder/x86/av1_fwd_txfm2d_avx2.c | 2 +- av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 2 +- av1/encoder/x86/av1_fwd_txfm_avx2.h | 2 +- av1/encoder/x86/av1_fwd_txfm_sse2.c | 2 +- av1/encoder/x86/av1_fwd_txfm_sse2.h | 2 +- av1/encoder/x86/av1_highbd_quantize_avx2.c | 2 +- av1/encoder/x86/av1_highbd_quantize_sse4.c | 2 +- av1/encoder/x86/av1_k_means_avx2.c | 2 +- av1/encoder/x86/av1_k_means_sse2.c | 2 +- av1/encoder/x86/av1_quantize_avx2.c | 2 +- av1/encoder/x86/av1_quantize_sse2.c | 2 +- av1/encoder/x86/av1_temporal_denoiser_sse2.c | 2 +- av1/encoder/x86/av1_txfm1d_sse4.h | 2 +- av1/encoder/x86/cnn_avx2.c | 2 +- av1/encoder/x86/encodetxb_avx2.c | 2 +- av1/encoder/x86/encodetxb_sse2.c | 2 +- av1/encoder/x86/encodetxb_sse4.c | 2 +- av1/encoder/x86/error_intrin_avx2.c | 2 +- av1/encoder/x86/error_intrin_sse2.c | 2 +- av1/encoder/x86/hash_sse42.c | 2 +- av1/encoder/x86/highbd_block_error_intrin_avx2.c | 2 +- av1/encoder/x86/highbd_block_error_intrin_sse2.c | 2 +- av1/encoder/x86/highbd_fwd_txfm_avx2.c | 2 +- av1/encoder/x86/highbd_fwd_txfm_sse4.c | 2 +- av1/encoder/x86/highbd_temporal_filter_avx2.c | 2 +- av1/encoder/x86/highbd_temporal_filter_sse2.c | 2 +- av1/encoder/x86/ml_avx2.c | 2 +- av1/encoder/x86/ml_sse3.c | 2 +- av1/encoder/x86/ml_sse3.h | 2 +- av1/encoder/x86/pickrst_avx2.c | 2 +- av1/encoder/x86/pickrst_sse4.c | 2 +- av1/encoder/x86/rdopt_avx2.c | 2 +- av1/encoder/x86/rdopt_sse4.c | 2 +- av1/encoder/x86/reconinter_enc_sse2.c | 2 +- av1/encoder/x86/reconinter_enc_ssse3.c | 2 +- av1/encoder/x86/temporal_filter_avx2.c | 2 +- av1/encoder/x86/temporal_filter_sse2.c | 2 +- av1/encoder/x86/wedge_utils_avx2.c | 2 +- av1/encoder/x86/wedge_utils_sse2.c | 2 +- av1/ratectrl_rtc.cc | 2 +- av1/ratectrl_rtc.h | 2 +- common/args.c | 2 +- common/args.h | 2 +- common/args_helper.c | 2 +- common/args_helper.h | 2 +- common/av1_config.c | 2 +- common/av1_config.h | 2 +- common/ivfdec.c | 2 +- common/ivfdec.h | 2 +- common/ivfenc.c | 2 +- common/ivfenc.h | 2 +- common/obudec.c | 2 +- common/obudec.h | 2 +- common/rawenc.c | 2 +- common/rawenc.h | 2 +- common/tools_common.c | 2 +- common/tools_common.h | 2 +- common/video_common.h | 2 +- common/video_reader.c | 2 +- common/video_reader.h | 2 +- common/video_writer.c | 2 +- common/video_writer.h | 2 +- common/warnings.c | 2 +- common/warnings.h | 2 +- common/webmdec.cc | 2 +- common/webmdec.h | 2 +- common/webmenc.cc | 2 +- common/webmenc.h | 2 +- common/y4menc.c | 2 +- common/y4menc.h | 2 +- common/y4minput.c | 2 +- common/y4minput.h | 2 +- examples/analyzer.cc | 2 +- examples/aom_cx_set_ref.c | 2 +- examples/av1_dec_fuzzer.cc | 2 +- examples/decode_to_md5.c | 2 +- examples/decode_with_drops.c | 2 +- examples/encoder_util.c | 2 +- examples/encoder_util.h | 2 +- examples/inspect.c | 2 +- examples/lightfield_bitstream_parsing.c | 2 +- examples/lightfield_decoder.c | 2 +- examples/lightfield_encoder.c | 2 +- examples/lightfield_tile_list_decoder.c | 2 +- examples/lossless_encoder.c | 2 +- examples/noise_model.c | 2 +- examples/photon_noise_table.c | 2 +- examples/scalable_decoder.c | 2 +- examples/scalable_encoder.c | 2 +- examples/set_maps.c | 2 +- examples/simple_decoder.c | 2 +- examples/simple_encoder.c | 2 +- examples/svc_encoder_rtc.cc | 13 
+++++++------ examples/twopass_encoder.c | 2 +- stats/aomstats.c | 2 +- stats/aomstats.h | 2 +- stats/rate_hist.c | 2 +- stats/rate_hist.h | 2 +- test/accounting_test.cc | 2 +- test/acm_random.h | 2 +- test/active_map_test.cc | 2 +- test/allintra_end_to_end_test.cc | 2 +- test/altref_test.cc | 2 +- test/aom_image_test.cc | 2 +- test/aom_integer_test.cc | 2 +- test/aom_mem_test.cc | 2 +- test/aq_segment_test.cc | 2 +- test/arf_freq_test.cc | 2 +- test/av1_common_int_test.cc | 2 +- test/av1_config_test.cc | 2 +- test/av1_convolve_scale_test.cc | 2 +- test/av1_convolve_test.cc | 2 +- test/av1_encoder_parms_get_to_decoder.cc | 2 +- test/av1_ext_tile_test.cc | 2 +- test/av1_external_partition_test.cc | 2 +- test/av1_fwd_txfm1d_test.cc | 2 +- test/av1_fwd_txfm2d_test.cc | 2 +- test/av1_highbd_iht_test.cc | 2 +- test/av1_horz_only_frame_superres_test.cc | 2 +- test/av1_inv_txfm1d_test.cc | 2 +- test/av1_inv_txfm2d_test.cc | 2 +- test/av1_k_means_test.cc | 2 +- test/av1_key_value_api_test.cc | 2 +- test/av1_nn_predict_test.cc | 2 +- test/av1_quantize_test.cc | 2 +- test/av1_round_shift_array_test.cc | 2 +- test/av1_softmax_test.cc | 2 +- test/av1_temporal_denoiser_test.cc | 2 +- test/av1_txfm_test.cc | 2 +- test/av1_txfm_test.h | 2 +- test/av1_wedge_utils_test.cc | 2 +- test/avg_test.cc | 13 +++++++------ test/avif_progressive_test.cc | 2 +- test/binary_codes_test.cc | 2 +- test/blend_a64_mask_1d_test.cc | 2 +- test/blend_a64_mask_test.cc | 2 +- test/block_test.cc | 2 +- test/boolcoder_test.cc | 2 +- test/borders_test.cc | 2 +- test/cdef_test.cc | 2 +- test/cfl_test.cc | 2 +- test/cnn_test.cc | 2 +- test/codec_factory.h | 2 +- test/coding_path_sync.cc | 2 +- test/comp_avg_pred_test.cc | 2 +- test/comp_avg_pred_test.h | 2 +- test/comp_mask_pred_test.cc | 2 +- test/convolve_test.cc | 2 +- test/corner_match_test.cc | 2 +- test/cpu_speed_test.cc | 2 +- test/cpu_used_firstpass_test.cc | 2 +- test/datarate_test.cc | 2 +- test/datarate_test.h | 2 +- test/decode_api_test.cc | 2 +- test/decode_multithreaded_test.cc | 2 +- test/decode_perf_test.cc | 2 +- test/decode_scalability_test.cc | 2 +- test/decode_test_driver.cc | 2 +- test/decode_test_driver.h | 2 +- test/deltaq_mode_test.cc | 2 +- test/disflow_test.cc | 2 +- test/divu_small_test.cc | 2 +- test/dr_prediction_test.cc | 2 +- test/dropframe_encode_test.cc | 2 +- test/ec_test.cc | 2 +- test/encode_api_test.cc | 2 +- test/encode_perf_test.cc | 2 +- test/encode_small_width_height_test.cc | 2 +- test/encode_test_driver.cc | 2 +- test/encode_test_driver.h | 2 +- test/encodemb_test.cc | 2 +- test/encodetxb_test.cc | 2 +- test/end_to_end_psnr_test.cc | 2 +- test/end_to_end_qmpsnr_test.cc | 2 +- test/end_to_end_ssim_test.cc | 2 +- test/error_block_test.cc | 2 +- test/error_resilience_test.cc | 2 +- test/ethread_test.cc | 2 +- test/external_frame_buffer_test.cc | 2 +- test/fdct4x4_test.cc | 2 +- test/fft_test.cc | 2 +- test/film_grain_table_test.cc | 2 +- test/filterintra_test.cc | 2 +- test/firstpass_test.cc | 2 +- test/force_key_frame_test.cc | 2 +- test/forced_max_frame_width_height_test.cc | 2 +- test/frame_parallel_enc_test.cc | 2 +- test/frame_resize_test.cc | 2 +- test/frame_size_tests.cc | 2 +- test/function_equivalence_test.h | 2 +- test/fwht4x4_test.cc | 2 +- test/gf_pyr_height_test.cc | 2 +- test/hadamard_test.cc | 13 +++++++------ test/hash_test.cc | 2 +- test/hbd_metrics_test.cc | 2 +- test/hiprec_convolve_test.cc | 2 +- test/hiprec_convolve_test_util.cc | 2 +- test/hiprec_convolve_test_util.h | 2 +- test/horver_correlation_test.cc | 2 +- 
test/horz_superres_test.cc | 2 +- test/i420_video_source.h | 2 +- test/intra_edge_test.cc | 2 +- test/intrabc_test.cc | 2 +- test/intrapred_test.cc | 2 +- test/invalid_file_test.cc | 2 +- test/ivf_video_source.h | 2 +- test/kf_test.cc | 2 +- test/level_test.cc | 2 +- test/log2_test.cc | 2 +- test/loopfilter_control_test.cc | 2 +- test/lossless_test.cc | 2 +- test/lpf_test.cc | 2 +- test/masked_sad_test.cc | 2 +- test/masked_variance_test.cc | 2 +- test/md5_helper.h | 2 +- test/metadata_test.cc | 2 +- test/minmax_test.cc | 4 ++-- test/monochrome_test.cc | 2 +- test/motion_vector_test.cc | 2 +- test/mv_cost_test.cc | 2 +- test/noise_model_test.cc | 2 +- test/obmc_sad_test.cc | 2 +- test/obmc_variance_test.cc | 2 +- test/pickrst_test.cc | 2 +- test/postproc_filters_test.cc | 2 +- test/quant_test.cc | 2 +- test/quantize_func_test.cc | 2 +- test/ratectrl_rtc_test.cc | 2 +- test/ratectrl_test.cc | 2 +- test/rd_test.cc | 2 +- test/reconinter_test.cc | 2 +- test/register_state_check.h | 2 +- test/resize_test.cc | 2 +- test/rt_end_to_end_test.cc | 2 +- test/sad_test.cc | 2 +- test/sb_multipass_test.cc | 2 +- test/sb_qp_sweep_test.cc | 2 +- test/scalability_test.cc | 2 +- test/scan_test.cc | 2 +- test/screen_content_test.cc | 2 +- test/segment_binarization_sync.cc | 2 +- test/selfguided_filter_test.cc | 2 +- test/sharpness_test.cc | 2 +- test/simd_avx2_test.cc | 2 +- test/simd_cmp_avx2.cc | 2 +- test/simd_cmp_impl.h | 2 +- test/simd_cmp_sse2.cc | 2 +- test/simd_cmp_sse4.cc | 2 +- test/simd_cmp_ssse3.cc | 2 +- test/simd_impl.h | 2 +- test/simd_sse2_test.cc | 2 +- test/simd_sse4_test.cc | 2 +- test/simd_ssse3_test.cc | 2 +- test/sse_sum_test.cc | 2 +- test/still_picture_test.cc | 2 +- test/subtract_test.cc | 2 +- test/sum_squares_test.cc | 2 +- test/svc_datarate_test.cc | 2 +- test/temporal_filter_test.cc | 2 +- test/test_aom_rc.cc | 2 +- test/test_intra_pred_speed.cc | 2 +- test/test_libaom.cc | 2 +- test/test_vector_test.cc | 2 +- test/test_vectors.cc | 2 +- test/test_vectors.h | 2 +- test/tile_config_test.cc | 2 +- test/tile_independence_test.cc | 2 +- test/time_stamp_test.cc | 2 +- test/tpl_model_test.cc | 2 +- test/transform_test_base.h | 2 +- test/util.h | 2 +- test/variance_test.cc | 2 +- test/video_source.h | 2 +- test/warp_filter_test.cc | 2 +- test/warp_filter_test_util.cc | 2 +- test/warp_filter_test_util.h | 2 +- test/webm_video_source.h | 2 +- test/webmenc_test.cc | 2 +- test/wiener_test.cc | 2 +- test/y4m_test.cc | 2 +- test/y4m_video_source.h | 2 +- test/yuv_video_source.h | 2 +- tools/aom_entropy_optimizer.c | 2 +- tools/auto_refactor/c_files/decl_status_code.c | 2 +- tools/auto_refactor/c_files/func_in_out.c | 2 +- tools/auto_refactor/c_files/global_variable.c | 2 +- tools/auto_refactor/c_files/parse_lvalue.c | 2 +- tools/auto_refactor/c_files/simple_code.c | 2 +- tools/auto_refactor/c_files/struct_code.c | 2 +- tools/dump_obu.cc | 2 +- tools/obu_parser.cc | 2 +- tools/obu_parser.h | 2 +- tools/txfm_analyzer/txfm_gen_code.cc | 2 +- tools/txfm_analyzer/txfm_graph.cc | 2 +- tools/txfm_analyzer/txfm_graph.h | 2 +- 990 files changed, 1166 insertions(+), 1139 deletions(-) diff --git a/aom/aom.h b/aom/aom.h index 0650a11f6b..0a7ae28187 100644 --- a/aom/aom.h +++ b/aom/aom.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/aom/aom_codec.h b/aom/aom_codec.h index d5b8790a98..de22d7fb03 100644 --- a/aom/aom_codec.h +++ b/aom/aom_codec.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/aom_decoder.h b/aom/aom_decoder.h index 229cf7358f..29c757b7fc 100644 --- a/aom/aom_decoder.h +++ b/aom/aom_decoder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h index 15cf21b040..bdedf48581 100644 --- a/aom/aom_encoder.h +++ b/aom/aom_encoder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/aom_external_partition.h b/aom/aom_external_partition.h index c381f6e5e9..8f748714ea 100644 --- a/aom/aom_external_partition.h +++ b/aom/aom_external_partition.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/aom_frame_buffer.h b/aom/aom_frame_buffer.h index 0e80373ddd..45ceb33414 100644 --- a/aom/aom_frame_buffer.h +++ b/aom/aom_frame_buffer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/aom_image.h b/aom/aom_image.h index 68fb312222..1b790d50ce 100644 --- a/aom/aom_image.h +++ b/aom/aom_image.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/aom_integer.h b/aom/aom_integer.h index ce65e98452..d0ef9280bb 100644 --- a/aom/aom_integer.h +++ b/aom/aom_integer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/aomcx.h b/aom/aomcx.h index edd8cd5e7c..002b5d37d0 100644 --- a/aom/aomcx.h +++ b/aom/aomcx.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/aomdx.h b/aom/aomdx.h index 2dd7bb3375..f18cbbfab9 100644 --- a/aom/aomdx.h +++ b/aom/aomdx.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/internal/aom_codec_internal.h b/aom/internal/aom_codec_internal.h index b854a889e0..d2af212ee8 100644 --- a/aom/internal/aom_codec_internal.h +++ b/aom/internal/aom_codec_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/internal/aom_image_internal.h b/aom/internal/aom_image_internal.h index 1b04c9ec3f..ef0f166847 100644 --- a/aom/internal/aom_image_internal.h +++ b/aom/internal/aom_image_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/src/aom_codec.c b/aom/src/aom_codec.c index 316cc6fd23..ac43842a5c 100644 --- a/aom/src/aom_codec.c +++ b/aom/src/aom_codec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/src/aom_decoder.c b/aom/src/aom_decoder.c index 49fff26352..164212abee 100644 --- a/aom/src/aom_decoder.c +++ b/aom/src/aom_decoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/src/aom_encoder.c b/aom/src/aom_encoder.c index f188567b94..3b028ff49a 100644 --- a/aom/src/aom_encoder.c +++ b/aom/src/aom_encoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c index c29095cbc5..039a012ee3 100644 --- a/aom/src/aom_image.c +++ b/aom/src/aom_image.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/aom/src/aom_integer.c b/aom/src/aom_integer.c index 7edfd0de87..347222fd67 100644 --- a/aom/src/aom_integer.c +++ b/aom/src/aom_integer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c index 254f6401c7..2c8a632f74 100644 --- a/aom_dsp/aom_convolve.c +++ b/aom_dsp/aom_convolve.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h index 85dc0052e2..ed82e56129 100644 --- a/aom_dsp/aom_dsp_common.h +++ b/aom_dsp/aom_dsp_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/aom_dsp_rtcd.c b/aom_dsp/aom_dsp_rtcd.c index 0265dd1ee5..686f16c7c1 100644 --- a/aom_dsp/aom_dsp_rtcd.c +++ b/aom_dsp/aom_dsp_rtcd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/aom_filter.h b/aom_dsp/aom_filter.h index 00686ac388..8f1af2ca7b 100644 --- a/aom_dsp/aom_filter.h +++ b/aom_dsp/aom_filter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/aom_simd.h b/aom_dsp/aom_simd.h index 69da8f21b4..3496d93a25 100644 --- a/aom_dsp/aom_simd.h +++ b/aom_dsp/aom_simd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/aom_simd_inline.h b/aom_dsp/aom_simd_inline.h index b4b1b35637..41c29f6b00 100644 --- a/aom_dsp/aom_simd_inline.h +++ b/aom_dsp/aom_simd_inline.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index 193844d06c..229d58c483 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2014 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/aom_convolve8_neon.h b/aom_dsp/arm/aom_convolve8_neon.h
index b523c41bc3..0b6e5245a4 100644
--- a/aom_dsp/arm/aom_convolve8_neon.h
+++ b/aom_dsp/arm/aom_convolve8_neon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
index 7219570860..04b3832b63 100644
--- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c
+++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2014 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
index 34bfe01663..4c8a6cdeee 100644
--- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2014 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/aom_convolve_copy_neon.c b/aom_dsp/arm/aom_convolve_copy_neon.c
index 325d6f29ff..b90b1bd0e1 100644
--- a/aom_dsp/arm/aom_convolve_copy_neon.c
+++ b/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <arm_neon.h>
diff --git a/aom_dsp/arm/aom_filter.h b/aom_dsp/arm/aom_filter.h
index 9972d064fc..2573dd803d 100644
--- a/aom_dsp/arm/aom_filter.h
+++ b/aom_dsp/arm/aom_filter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/aom_neon_sve2_bridge.h b/aom_dsp/arm/aom_neon_sve2_bridge.h
index 6e7d2d6365..5631fcfd11 100644
--- a/aom_dsp/arm/aom_neon_sve2_bridge.h
+++ b/aom_dsp/arm/aom_neon_sve2_bridge.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_
diff --git a/aom_dsp/arm/aom_neon_sve_bridge.h b/aom_dsp/arm/aom_neon_sve_bridge.h
index 3da80e22ba..57650acd51 100644
--- a/aom_dsp/arm/aom_neon_sve_bridge.h
+++ b/aom_dsp/arm/aom_neon_sve_bridge.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 2e79b2ef69..7d6054cb0a 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <arm_neon.h>
diff --git a/aom_dsp/arm/avg_pred_neon.c b/aom_dsp/arm/avg_pred_neon.c
index b17f7fca7f..4731d3edf4 100644
--- a/aom_dsp/arm/avg_pred_neon.c
+++ b/aom_dsp/arm/avg_pred_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/avg_sve.c b/aom_dsp/arm/avg_sve.c
index 57a546501a..a9944dfd56 100644
--- a/aom_dsp/arm/avg_sve.c
+++ b/aom_dsp/arm/avg_sve.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <arm_neon.h>
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index 48ff683e96..95f85fcaaa 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/blend_neon.h b/aom_dsp/arm/blend_neon.h
index c8a03224e4..285dad7e39 100644
--- a/aom_dsp/arm/blend_neon.h
+++ b/aom_dsp/arm/blend_neon.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/blk_sse_sum_neon.c b/aom_dsp/arm/blk_sse_sum_neon.c
index f2ada93e95..3275406a83 100644
--- a/aom_dsp/arm/blk_sse_sum_neon.c
+++ b/aom_dsp/arm/blk_sse_sum_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/blk_sse_sum_sve.c b/aom_dsp/arm/blk_sse_sum_sve.c
index f538346d8b..399b4415e4 100644
--- a/aom_dsp/arm/blk_sse_sum_sve.c
+++ b/aom_dsp/arm/blk_sse_sum_sve.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/dist_wtd_avg_neon.h b/aom_dsp/arm/dist_wtd_avg_neon.h
index 19c9b04c57..28fe81ce71 100644
--- a/aom_dsp/arm/dist_wtd_avg_neon.h
+++ b/aom_dsp/arm/dist_wtd_avg_neon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c
index c87acfb86f..96c7915740 100644
--- a/aom_dsp/arm/fwd_txfm_neon.c
+++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index d0f59227db..ef2cf4fdc7 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <arm_neon.h>
diff --git a/aom_dsp/arm/highbd_avg_neon.c b/aom_dsp/arm/highbd_avg_neon.c
index 47d5dae012..1f5ffe1dc1 100644
--- a/aom_dsp/arm/highbd_avg_neon.c
+++ b/aom_dsp/arm/highbd_avg_neon.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_avg_pred_neon.c b/aom_dsp/arm/highbd_avg_pred_neon.c
index 531309b025..3ef1561ac0 100644
--- a/aom_dsp/arm/highbd_avg_pred_neon.c
+++ b/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
index 8b03e91ac3..1cecd5512c 100644
--- a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
+++ b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
index 90b44fcc5e..656cbe4ccf 100644
--- a/aom_dsp/arm/highbd_blend_a64_mask_neon.c
+++ b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
index 1292e20342..a69d1e1bf2 100644
--- a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
+++ b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c
index 99ad0ba601..d1413b6402 100644
--- a/aom_dsp/arm/highbd_convolve8_neon.c
+++ b/aom_dsp/arm/highbd_convolve8_neon.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2014 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_convolve8_neon.h b/aom_dsp/arm/highbd_convolve8_neon.h
index b87b4bad84..9c18f135f8 100644
--- a/aom_dsp/arm/highbd_convolve8_neon.h
+++ b/aom_dsp/arm/highbd_convolve8_neon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c
index f519395e81..a7d5ad8b73 100644
--- a/aom_dsp/arm/highbd_convolve8_sve.c
+++ b/aom_dsp/arm/highbd_convolve8_sve.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_hadamard_neon.c b/aom_dsp/arm/highbd_hadamard_neon.c
index d28617c67e..9c576f2ded 100644
--- a/aom_dsp/arm/highbd_hadamard_neon.c
+++ b/aom_dsp/arm/highbd_hadamard_neon.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index eff773b85a..3eda2ca462 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c
index 77727b7665..9f38bccee8 100644
--- a/aom_dsp/arm/highbd_loopfilter_neon.c
+++ b/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_masked_sad_neon.c b/aom_dsp/arm/highbd_masked_sad_neon.c
index 9262d818e9..89dda4c100 100644
--- a/aom_dsp/arm/highbd_masked_sad_neon.c
+++ b/aom_dsp/arm/highbd_masked_sad_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_obmc_sad_neon.c b/aom_dsp/arm/highbd_obmc_sad_neon.c
index 28699e6f41..2adf1dedca 100644
--- a/aom_dsp/arm/highbd_obmc_sad_neon.c
+++ b/aom_dsp/arm/highbd_obmc_sad_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_obmc_variance_neon.c b/aom_dsp/arm/highbd_obmc_variance_neon.c
index d59224619b..9088cf5a68 100644
--- a/aom_dsp/arm/highbd_obmc_variance_neon.c
+++ b/aom_dsp/arm/highbd_obmc_variance_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_quantize_neon.c b/aom_dsp/arm/highbd_quantize_neon.c
index b3514296af..f4e8e6c524 100644
--- a/aom_dsp/arm/highbd_quantize_neon.c
+++ b/aom_dsp/arm/highbd_quantize_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_sad_neon.c b/aom_dsp/arm/highbd_sad_neon.c
index d51f639de6..7de38e544f 100644
--- a/aom_dsp/arm/highbd_sad_neon.c
+++ b/aom_dsp/arm/highbd_sad_neon.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_sadxd_neon.c b/aom_dsp/arm/highbd_sadxd_neon.c
index 85ca6732a8..f4f2b77b49 100644
--- a/aom_dsp/arm/highbd_sadxd_neon.c
+++ b/aom_dsp/arm/highbd_sadxd_neon.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_sse_neon.c b/aom_dsp/arm/highbd_sse_neon.c
index 184e9f9bef..3d9b07a9f5 100644
--- a/aom_dsp/arm/highbd_sse_neon.c
+++ b/aom_dsp/arm/highbd_sse_neon.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <arm_neon.h>
diff --git a/aom_dsp/arm/highbd_sse_sve.c b/aom_dsp/arm/highbd_sse_sve.c
index 9ea13ab67a..c2ad589beb 100644
--- a/aom_dsp/arm/highbd_sse_sve.c
+++ b/aom_dsp/arm/highbd_sse_sve.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <arm_neon.h>
diff --git a/aom_dsp/arm/highbd_subpel_variance_neon.c b/aom_dsp/arm/highbd_subpel_variance_neon.c
index 686fa5f226..75be2bb2d5 100644
--- a/aom_dsp/arm/highbd_subpel_variance_neon.c
+++ b/aom_dsp/arm/highbd_subpel_variance_neon.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_variance_neon.c b/aom_dsp/arm/highbd_variance_neon.c
index 18b8efff4c..984780d25c 100644
--- a/aom_dsp/arm/highbd_variance_neon.c
+++ b/aom_dsp/arm/highbd_variance_neon.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023 The WebM project authors. All rights reserved.
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_variance_neon_dotprod.c b/aom_dsp/arm/highbd_variance_neon_dotprod.c
index d56ae97571..737f8e5b11 100644
--- a/aom_dsp/arm/highbd_variance_neon_dotprod.c
+++ b/aom_dsp/arm/highbd_variance_neon_dotprod.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/highbd_variance_sve.c b/aom_dsp/arm/highbd_variance_sve.c
index ad1f55e367..2403832d28 100644
--- a/aom_dsp/arm/highbd_variance_sve.c
+++ b/aom_dsp/arm/highbd_variance_sve.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index f024c4fe53..3c12ca3c19 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index 7c64be1253..c54ae64a1e 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/masked_sad4d_neon.c b/aom_dsp/arm/masked_sad4d_neon.c
index 8f65b805ec..c3afc55bfe 100644
--- a/aom_dsp/arm/masked_sad4d_neon.c
+++ b/aom_dsp/arm/masked_sad4d_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/masked_sad_neon.c b/aom_dsp/arm/masked_sad_neon.c
index 9d263105e3..7f84acf041 100644
--- a/aom_dsp/arm/masked_sad_neon.c
+++ b/aom_dsp/arm/masked_sad_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index fa571a68c3..41efd03632 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/aom_dsp/arm/obmc_sad_neon.c b/aom_dsp/arm/obmc_sad_neon.c
index a692cbb388..61b01f2aaa 100644
--- a/aom_dsp/arm/obmc_sad_neon.c
+++ b/aom_dsp/arm/obmc_sad_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/obmc_variance_neon.c b/aom_dsp/arm/obmc_variance_neon.c
index 50cd5f3b6a..95b364cfc3 100644
--- a/aom_dsp/arm/obmc_variance_neon.c
+++ b/aom_dsp/arm/obmc_variance_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/reinterpret_neon.h b/aom_dsp/arm/reinterpret_neon.h
index f9702513ad..c3951183f4 100644
--- a/aom_dsp/arm/reinterpret_neon.h
+++ b/aom_dsp/arm/reinterpret_neon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 46a1666331..ef19908518 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/sad_neon_dotprod.c b/aom_dsp/arm/sad_neon_dotprod.c
index 5504c6838e..d2bc0cc872 100644
--- a/aom_dsp/arm/sad_neon_dotprod.c
+++ b/aom_dsp/arm/sad_neon_dotprod.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/sadxd_neon.c b/aom_dsp/arm/sadxd_neon.c
index e89e1c5a73..69f408456c 100644
--- a/aom_dsp/arm/sadxd_neon.c
+++ b/aom_dsp/arm/sadxd_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/sadxd_neon_dotprod.c b/aom_dsp/arm/sadxd_neon_dotprod.c
index 3d11d1cb96..4f9d408847 100644
--- a/aom_dsp/arm/sadxd_neon_dotprod.c
+++ b/aom_dsp/arm/sadxd_neon_dotprod.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index ec8f0ee183..b0ed8330f7 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <arm_neon.h>
diff --git a/aom_dsp/arm/sse_neon_dotprod.c b/aom_dsp/arm/sse_neon_dotprod.c
index 979049780b..f9562fc930 100644
--- a/aom_dsp/arm/sse_neon_dotprod.c
+++ b/aom_dsp/arm/sse_neon_dotprod.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <arm_neon.h>
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 2e6e738853..d365cef830 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/subtract_neon.c b/aom_dsp/arm/subtract_neon.c
index 01ae835be0..b093e32739 100644
--- a/aom_dsp/arm/subtract_neon.c
+++ b/aom_dsp/arm/subtract_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index 30a108e70a..a497979574 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_
diff --git a/aom_dsp/arm/sum_squares_neon.c b/aom_dsp/arm/sum_squares_neon.c
index 424b2b4445..2f5e4b9cad 100644
--- a/aom_dsp/arm/sum_squares_neon.c
+++ b/aom_dsp/arm/sum_squares_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/sum_squares_neon_dotprod.c b/aom_dsp/arm/sum_squares_neon_dotprod.c
index 44462a693c..2f6d1a566a 100644
--- a/aom_dsp/arm/sum_squares_neon_dotprod.c
+++ b/aom_dsp/arm/sum_squares_neon_dotprod.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/sum_squares_sve.c b/aom_dsp/arm/sum_squares_sve.c
index c7e6dfcb02..0d132dbd0d 100644
--- a/aom_dsp/arm/sum_squares_sve.c
+++ b/aom_dsp/arm/sum_squares_sve.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h
index 9fc4fb075a..5ac287fce7 100644
--- a/aom_dsp/arm/transpose_neon.h
+++ b/aom_dsp/arm/transpose_neon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index 9e4e8c0cf0..ae1ad423ce 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/arm/variance_neon_dotprod.c b/aom_dsp/arm/variance_neon_dotprod.c
index 9fb52e1df7..fcb80ad6de 100644
--- a/aom_dsp/arm/variance_neon_dotprod.c
+++ b/aom_dsp/arm/variance_neon_dotprod.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index 893f9c2f65..5cba8a8601 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/binary_codes_reader.c b/aom_dsp/binary_codes_reader.c
index ee0ce62278..3af1c18642 100644
--- a/aom_dsp/binary_codes_reader.c
+++ b/aom_dsp/binary_codes_reader.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/binary_codes_reader.h b/aom_dsp/binary_codes_reader.h
index d218f0619f..8ef16eb11c 100644
--- a/aom_dsp/binary_codes_reader.h
+++ b/aom_dsp/binary_codes_reader.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/binary_codes_writer.c b/aom_dsp/binary_codes_writer.c
index 55ce8429d7..c7722283b1 100644
--- a/aom_dsp/binary_codes_writer.c
+++ b/aom_dsp/binary_codes_writer.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/binary_codes_writer.h b/aom_dsp/binary_codes_writer.h
index 5ec8662139..99524d2df2 100644
--- a/aom_dsp/binary_codes_writer.h
+++ b/aom_dsp/binary_codes_writer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/bitreader.c b/aom_dsp/bitreader.c
index 4c70a91712..2c63c99af0 100644
--- a/aom_dsp/bitreader.c
+++ b/aom_dsp/bitreader.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index 29321f916e..b27bc5913f 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/bitreader_buffer.c b/aom_dsp/bitreader_buffer.c
index d79feea6a3..1d8a7d31cd 100644
--- a/aom_dsp/bitreader_buffer.c
+++ b/aom_dsp/bitreader_buffer.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/bitreader_buffer.h b/aom_dsp/bitreader_buffer.h
index 359fbe5194..f9f42b295a 100644
--- a/aom_dsp/bitreader_buffer.h
+++ b/aom_dsp/bitreader_buffer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/bitwriter.c b/aom_dsp/bitwriter.c
index 4c27bb1fc3..36f43da3b6 100644
--- a/aom_dsp/bitwriter.c
+++ b/aom_dsp/bitwriter.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index 6aedd8ceb9..3c566889a4 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/bitwriter_buffer.c b/aom_dsp/bitwriter_buffer.c
index 7d0ab9486a..e6e03ccfee 100644
--- a/aom_dsp/bitwriter_buffer.c
+++ b/aom_dsp/bitwriter_buffer.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/bitwriter_buffer.h b/aom_dsp/bitwriter_buffer.h
index fd10e01bb7..1d61e86110 100644
--- a/aom_dsp/bitwriter_buffer.h
+++ b/aom_dsp/bitwriter_buffer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/blend.h b/aom_dsp/blend.h
index fd87dc1810..6585865400 100644
--- a/aom_dsp/blend.h
+++ b/aom_dsp/blend.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/blend_a64_hmask.c b/aom_dsp/blend_a64_hmask.c
index e9e38ef969..596d9f7b2d 100644
--- a/aom_dsp/blend_a64_hmask.c
+++ b/aom_dsp/blend_a64_hmask.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/blend_a64_mask.c b/aom_dsp/blend_a64_mask.c
index 35017fd737..2cab9e0df0 100644
--- a/aom_dsp/blend_a64_mask.c
+++ b/aom_dsp/blend_a64_mask.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/blend_a64_vmask.c b/aom_dsp/blend_a64_vmask.c
index c938bb33af..e2545bc166 100644
--- a/aom_dsp/blend_a64_vmask.c
+++ b/aom_dsp/blend_a64_vmask.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/blk_sse_sum.c b/aom_dsp/blk_sse_sum.c
index d76c3f87b9..5bc2c4bddc 100644
--- a/aom_dsp/blk_sse_sum.c
+++ b/aom_dsp/blk_sse_sum.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/butteraugli.c b/aom_dsp/butteraugli.c
index 8d2a29f7a3..b373e12f95 100644
--- a/aom_dsp/butteraugli.c
+++ b/aom_dsp/butteraugli.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/butteraugli.h b/aom_dsp/butteraugli.h
index 5304092ccb..0ce9def692 100644
--- a/aom_dsp/butteraugli.h
+++ b/aom_dsp/butteraugli.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/entcode.c b/aom_dsp/entcode.c
index aad96c6fc6..53a7503cdc 100644
--- a/aom_dsp/entcode.c
+++ b/aom_dsp/entcode.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/entcode.h b/aom_dsp/entcode.h
index 526ca598d3..9f9127bb25 100644
--- a/aom_dsp/entcode.h
+++ b/aom_dsp/entcode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/entdec.c b/aom_dsp/entdec.c
index 5bbcddae08..5a29cfac6f 100644
--- a/aom_dsp/entdec.c
+++ b/aom_dsp/entdec.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/entdec.h b/aom_dsp/entdec.h
index c746167775..7ef4e0dfcd 100644
--- a/aom_dsp/entdec.h
+++ b/aom_dsp/entdec.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/entenc.c b/aom_dsp/entenc.c
index 591e0ad214..23d9ad0ca6 100644
--- a/aom_dsp/entenc.c
+++ b/aom_dsp/entenc.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/entenc.h b/aom_dsp/entenc.h
index 1a38affb4f..c52088b843 100644
--- a/aom_dsp/entenc.h
+++ b/aom_dsp/entenc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/fastssim.c b/aom_dsp/fastssim.c
index 0ef0590e89..263ff41ef1 100644
--- a/aom_dsp/fastssim.c
+++ b/aom_dsp/fastssim.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/fft.c b/aom_dsp/fft.c
index a44dbf77b1..cd3d2f77eb 100644
--- a/aom_dsp/fft.c
+++ b/aom_dsp/fft.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/fft_common.h b/aom_dsp/fft_common.h
index 3de1a045ee..3f7e03cf8e 100644
--- a/aom_dsp/fft_common.h
+++ b/aom_dsp/fft_common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.c b/aom_dsp/flow_estimation/arm/disflow_neon.c
index 5758d2887f..e539e76322 100644
--- a/aom_dsp/flow_estimation/arm/disflow_neon.c
+++ b/aom_dsp/flow_estimation/arm/disflow_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.h b/aom_dsp/flow_estimation/arm/disflow_neon.h index d991a13460..80827da66d 100644 --- a/aom_dsp/flow_estimation/arm/disflow_neon.h +++ b/aom_dsp/flow_estimation/arm/disflow_neon.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/arm/disflow_sve.c b/aom_dsp/flow_estimation/arm/disflow_sve.c index 7b01e90d12..f399843b16 100644 --- a/aom_dsp/flow_estimation/arm/disflow_sve.c +++ b/aom_dsp/flow_estimation/arm/disflow_sve.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/corner_detect.c b/aom_dsp/flow_estimation/corner_detect.c index 44d423dcdf..b8b7142972 100644 --- a/aom_dsp/flow_estimation/corner_detect.c +++ b/aom_dsp/flow_estimation/corner_detect.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/corner_detect.h b/aom_dsp/flow_estimation/corner_detect.h index 54d94309ed..7d7f09a5d6 100644 --- a/aom_dsp/flow_estimation/corner_detect.h +++ b/aom_dsp/flow_estimation/corner_detect.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/corner_match.c b/aom_dsp/flow_estimation/corner_match.c index c78edb8910..07ea924e8e 100644 --- a/aom_dsp/flow_estimation/corner_match.c +++ b/aom_dsp/flow_estimation/corner_match.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/corner_match.h b/aom_dsp/flow_estimation/corner_match.h index 77ebee2ea3..36af17b81e 100644 --- a/aom_dsp/flow_estimation/corner_match.h +++ b/aom_dsp/flow_estimation/corner_match.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/disflow.c b/aom_dsp/flow_estimation/disflow.c index f511a6eb49..1e5a675654 100644 --- a/aom_dsp/flow_estimation/disflow.c +++ b/aom_dsp/flow_estimation/disflow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/disflow.h b/aom_dsp/flow_estimation/disflow.h index ac3680004d..f65821cf07 100644 --- a/aom_dsp/flow_estimation/disflow.h +++ b/aom_dsp/flow_estimation/disflow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/flow_estimation.c b/aom_dsp/flow_estimation/flow_estimation.c index 96624eb863..7d32d03434 100644 --- a/aom_dsp/flow_estimation/flow_estimation.c +++ b/aom_dsp/flow_estimation/flow_estimation.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/flow_estimation.h b/aom_dsp/flow_estimation/flow_estimation.h index a38b03fc4e..a31d0b8cb3 100644 --- a/aom_dsp/flow_estimation/flow_estimation.h +++ b/aom_dsp/flow_estimation/flow_estimation.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/ransac.c b/aom_dsp/flow_estimation/ransac.c index 7c7bebdda4..a14d225637 100644 --- a/aom_dsp/flow_estimation/ransac.c +++ b/aom_dsp/flow_estimation/ransac.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/ransac.h b/aom_dsp/flow_estimation/ransac.h index 0529b6e13c..c6eb5c71c6 100644 --- a/aom_dsp/flow_estimation/ransac.h +++ b/aom_dsp/flow_estimation/ransac.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/x86/corner_match_avx2.c b/aom_dsp/flow_estimation/x86/corner_match_avx2.c index ff69ae75f5..213dcfe102 100644 --- a/aom_dsp/flow_estimation/x86/corner_match_avx2.c +++ b/aom_dsp/flow_estimation/x86/corner_match_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/x86/corner_match_sse4.c b/aom_dsp/flow_estimation/x86/corner_match_sse4.c index bff7db6d2f..40a42aaca9 100644 --- a/aom_dsp/flow_estimation/x86/corner_match_sse4.c +++ b/aom_dsp/flow_estimation/x86/corner_match_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/x86/disflow_avx2.c b/aom_dsp/flow_estimation/x86/disflow_avx2.c index ad5a1bd7c6..7806100ebd 100644 --- a/aom_dsp/flow_estimation/x86/disflow_avx2.c +++ b/aom_dsp/flow_estimation/x86/disflow_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/flow_estimation/x86/disflow_sse4.c b/aom_dsp/flow_estimation/x86/disflow_sse4.c index e0a4bd040c..3743b88dfc 100644 --- a/aom_dsp/flow_estimation/x86/disflow_sse4.c +++ b/aom_dsp/flow_estimation/x86/disflow_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/fwd_txfm.c b/aom_dsp/fwd_txfm.c index 5503501d62..0cfa6d440b 100644 --- a/aom_dsp/fwd_txfm.c +++ b/aom_dsp/fwd_txfm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/grain_params.h b/aom_dsp/grain_params.h index 5a28afc2a1..2a043f7b84 100644 --- a/aom_dsp/grain_params.h +++ b/aom_dsp/grain_params.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/grain_table.c b/aom_dsp/grain_table.c index 3505f9f2c8..3ca875658b 100644 --- a/aom_dsp/grain_table.c +++ b/aom_dsp/grain_table.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/grain_table.h b/aom_dsp/grain_table.h index 49e84980ee..7d58f8447d 100644 --- a/aom_dsp/grain_table.h +++ b/aom_dsp/grain_table.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c index 6ec091f5f3..8a9dbae083 100644 --- a/aom_dsp/intrapred.c +++ b/aom_dsp/intrapred.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/intrapred_common.h b/aom_dsp/intrapred_common.h index 6172224be1..7c74269334 100644 --- a/aom_dsp/intrapred_common.h +++ b/aom_dsp/intrapred_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c index 075f13689c..3b8352aff0 100644 --- a/aom_dsp/loopfilter.c +++ b/aom_dsp/loopfilter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/mathutils.h b/aom_dsp/mathutils.h index 26635fc4d1..45ea6d9b14 100644 --- a/aom_dsp/mathutils.h +++ b/aom_dsp/mathutils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c index 947dfd3c7a..b01861d765 100644 --- a/aom_dsp/noise_model.c +++ b/aom_dsp/noise_model.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/noise_model.h b/aom_dsp/noise_model.h index 5b2d7efe29..b332183457 100644 --- a/aom_dsp/noise_model.h +++ b/aom_dsp/noise_model.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/noise_util.c b/aom_dsp/noise_util.c index 3ded8cb099..99e222ad4e 100644 --- a/aom_dsp/noise_util.c +++ b/aom_dsp/noise_util.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/aom_dsp/noise_util.h b/aom_dsp/noise_util.h index 2284a171a4..a10dbc2d59 100644 --- a/aom_dsp/noise_util.h +++ b/aom_dsp/noise_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/odintrin.c b/aom_dsp/odintrin.c index eb6d8d8771..61e6e56124 100644 --- a/aom_dsp/odintrin.c +++ b/aom_dsp/odintrin.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/odintrin.h b/aom_dsp/odintrin.h index 9e4ba5029a..bb0f237684 100644 --- a/aom_dsp/odintrin.h +++ b/aom_dsp/odintrin.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h index 5711a40a40..37d8042b75 100644 --- a/aom_dsp/prob.h +++ b/aom_dsp/prob.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/psnr.c b/aom_dsp/psnr.c index cf0de29945..b174c1e92f 100644 --- a/aom_dsp/psnr.c +++ b/aom_dsp/psnr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/psnr.h b/aom_dsp/psnr.h index afe6e08856..f4fd1d6418 100644 --- a/aom_dsp/psnr.h +++ b/aom_dsp/psnr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/psnrhvs.c b/aom_dsp/psnrhvs.c index 966ba007ed..a854e2c273 100644 --- a/aom_dsp/psnrhvs.c +++ b/aom_dsp/psnrhvs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/pyramid.c b/aom_dsp/pyramid.c index 05ddbb2f5f..e056166d87 100644 --- a/aom_dsp/pyramid.c +++ b/aom_dsp/pyramid.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/pyramid.h b/aom_dsp/pyramid.h index 745bb7e525..d73a37f4c5 100644 --- a/aom_dsp/pyramid.h +++ b/aom_dsp/pyramid.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/quantize.c b/aom_dsp/quantize.c index e5c960b826..2d547e33d9 100644 --- a/aom_dsp/quantize.c +++ b/aom_dsp/quantize.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/quantize.h b/aom_dsp/quantize.h index efe253ddb9..98c656974f 100644 --- a/aom_dsp/quantize.h +++ b/aom_dsp/quantize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/recenter.h b/aom_dsp/recenter.h index b3fd412907..f36c9cba87 100644 --- a/aom_dsp/recenter.h +++ b/aom_dsp/recenter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c index 8d69e3bf1c..0e9d900e5e 100644 --- a/aom_dsp/sad.c +++ b/aom_dsp/sad.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/sad_av1.c b/aom_dsp/sad_av1.c index f3d5847bd5..43035d79bc 100644 --- a/aom_dsp/sad_av1.c +++ b/aom_dsp/sad_av1.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/simd/v128_intrinsics.h b/aom_dsp/simd/v128_intrinsics.h index 218a7a6186..76c3f9b89a 100644 --- a/aom_dsp/simd/v128_intrinsics.h +++ b/aom_dsp/simd/v128_intrinsics.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/aom_dsp/simd/v128_intrinsics_c.h b/aom_dsp/simd/v128_intrinsics_c.h index f5ca817fb6..e28dbfa8d9 100644 --- a/aom_dsp/simd/v128_intrinsics_c.h +++ b/aom_dsp/simd/v128_intrinsics_c.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h index d20f979dd9..6e64910423 100644 --- a/aom_dsp/simd/v128_intrinsics_x86.h +++ b/aom_dsp/simd/v128_intrinsics_x86.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/simd/v256_intrinsics.h b/aom_dsp/simd/v256_intrinsics.h index 17e36eed61..c07ec491e5 100644 --- a/aom_dsp/simd/v256_intrinsics.h +++ b/aom_dsp/simd/v256_intrinsics.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h index 60d0d53f6f..20d2709aba 100644 --- a/aom_dsp/simd/v256_intrinsics_c.h +++ b/aom_dsp/simd/v256_intrinsics_c.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h index 493130df83..226c3f48bf 100644 --- a/aom_dsp/simd/v256_intrinsics_v128.h +++ b/aom_dsp/simd/v256_intrinsics_v128.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h index 894ddee167..5d0749455b 100644 --- a/aom_dsp/simd/v256_intrinsics_x86.h +++ b/aom_dsp/simd/v256_intrinsics_x86.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/simd/v64_intrinsics.h b/aom_dsp/simd/v64_intrinsics.h index 7079949cd8..6b36ad4d3c 100644 --- a/aom_dsp/simd/v64_intrinsics.h +++ b/aom_dsp/simd/v64_intrinsics.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/simd/v64_intrinsics_c.h b/aom_dsp/simd/v64_intrinsics_c.h index bfd6fe0710..adb0411283 100644 --- a/aom_dsp/simd/v64_intrinsics_c.h +++ b/aom_dsp/simd/v64_intrinsics_c.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h index ec27a6bf42..f052c9fb7e 100644 --- a/aom_dsp/simd/v64_intrinsics_x86.h +++ b/aom_dsp/simd/v64_intrinsics_x86.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/sse.c b/aom_dsp/sse.c index bfe76edc39..03d4da4aab 100644 --- a/aom_dsp/sse.c +++ b/aom_dsp/sse.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/ssim.c b/aom_dsp/ssim.c index 35d493b038..c6edbcb9c9 100644 --- a/aom_dsp/ssim.c +++ b/aom_dsp/ssim.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/ssim.h b/aom_dsp/ssim.h index fb92556a8c..f72be1a9f4 100644 --- a/aom_dsp/ssim.h +++ b/aom_dsp/ssim.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/subtract.c b/aom_dsp/subtract.c index 4f47e553d4..7e8250cc4d 100644 --- a/aom_dsp/subtract.c +++ b/aom_dsp/subtract.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/sum_squares.c b/aom_dsp/sum_squares.c index f58defaa11..f0afccc91d 100644 --- a/aom_dsp/sum_squares.c +++ b/aom_dsp/sum_squares.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/aom_dsp/txfm_common.h b/aom_dsp/txfm_common.h index 67d9e90ca9..7152732bef 100644 --- a/aom_dsp/txfm_common.h +++ b/aom_dsp/txfm_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c index 6cdd58492a..27587cd1fb 100644 --- a/aom_dsp/variance.c +++ b/aom_dsp/variance.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h index 6603d312b8..ceaf16ed8e 100644 --- a/aom_dsp/variance.h +++ b/aom_dsp/variance.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/vmaf.c b/aom_dsp/vmaf.c index a40e00cb23..239b62b36c 100644 --- a/aom_dsp/vmaf.c +++ b/aom_dsp/vmaf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/vmaf.h b/aom_dsp/vmaf.h index b539cf8b76..f2fb3b042f 100644 --- a/aom_dsp/vmaf.h +++ b/aom_dsp/vmaf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/adaptive_quantize_avx2.c b/aom_dsp/x86/adaptive_quantize_avx2.c index b3dede75d5..b93e12c184 100644 --- a/aom_dsp/x86/adaptive_quantize_avx2.c +++ b/aom_dsp/x86/adaptive_quantize_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/adaptive_quantize_sse2.c b/aom_dsp/x86/adaptive_quantize_sse2.c index 503b9b4682..85af26ecf5 100644 --- a/aom_dsp/x86/adaptive_quantize_sse2.c +++ b/aom_dsp/x86/adaptive_quantize_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/aom_dsp/x86/aom_convolve_copy_avx2.c b/aom_dsp/x86/aom_convolve_copy_avx2.c index a1043828fe..bdbb4c16e9 100644 --- a/aom_dsp/x86/aom_convolve_copy_avx2.c +++ b/aom_dsp/x86/aom_convolve_copy_avx2.c @@ -1,11 +1,12 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved. + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <immintrin.h> diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.c b/aom_dsp/x86/aom_convolve_copy_sse2.c index e78845e97c..887adde962 100644 --- a/aom_dsp/x86/aom_convolve_copy_sse2.c +++ b/aom_dsp/x86/aom_convolve_copy_sse2.c @@ -1,11 +1,12 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved. + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <immintrin.h> diff --git a/aom_dsp/x86/aom_quantize_avx.c b/aom_dsp/x86/aom_quantize_avx.c index b2d6d4b76d..1b6ea48c8f 100644 --- a/aom_dsp/x86/aom_quantize_avx.c +++ b/aom_dsp/x86/aom_quantize_avx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index 22f2e696d3..1f382d110b 100644 --- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index 245fda1e94..7bc88ebf5f 100644 --- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c index 49fcd72098..6e943b84b3 100644 --- a/aom_dsp/x86/avg_intrin_avx2.c +++ b/aom_dsp/x86/avg_intrin_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c index 7ff2801026..f7b133c0c2 100644 --- a/aom_dsp/x86/avg_intrin_sse2.c +++ b/aom_dsp/x86/avg_intrin_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/avg_intrin_sse4.c b/aom_dsp/x86/avg_intrin_sse4.c index b83b43122a..8b2558c1b7 100644 --- a/aom_dsp/x86/avg_intrin_sse4.c +++ b/aom_dsp/x86/avg_intrin_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/bitdepth_conversion_avx2.h b/aom_dsp/x86/bitdepth_conversion_avx2.h index 85896e2768..9b2b2b01ee 100644 --- a/aom_dsp/x86/bitdepth_conversion_avx2.h +++ b/aom_dsp/x86/bitdepth_conversion_avx2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/bitdepth_conversion_sse2.h b/aom_dsp/x86/bitdepth_conversion_sse2.h index ff77760b6f..7b634b2839 100644 --- a/aom_dsp/x86/bitdepth_conversion_sse2.h +++ b/aom_dsp/x86/bitdepth_conversion_sse2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/blend_a64_hmask_sse4.c b/aom_dsp/x86/blend_a64_hmask_sse4.c index e0289abe12..a7d212e340 100644 --- a/aom_dsp/x86/blend_a64_hmask_sse4.c +++ b/aom_dsp/x86/blend_a64_hmask_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c index dfbab324d0..638c378b42 100644 --- a/aom_dsp/x86/blend_a64_mask_avx2.c +++ b/aom_dsp/x86/blend_a64_mask_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c index 9a10e86ae5..df0fada68b 100644 --- a/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/aom_dsp/x86/blend_a64_mask_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/blend_a64_vmask_sse4.c b/aom_dsp/x86/blend_a64_vmask_sse4.c index 75fb1c5a94..484d3d08e4 100644 --- a/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/blend_mask_sse4.h b/aom_dsp/x86/blend_mask_sse4.h index c071fdcfc4..e7b160e41a 100644 --- a/aom_dsp/x86/blend_mask_sse4.h +++ b/aom_dsp/x86/blend_mask_sse4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/blend_sse4.h b/aom_dsp/x86/blend_sse4.h index 8d9b325101..28e531103e 100644 --- a/aom_dsp/x86/blend_sse4.h +++ b/aom_dsp/x86/blend_sse4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/blk_sse_sum_avx2.c b/aom_dsp/x86/blk_sse_sum_avx2.c index fdf7de3f4c..7169607c09 100644 --- a/aom_dsp/x86/blk_sse_sum_avx2.c +++ b/aom_dsp/x86/blk_sse_sum_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/blk_sse_sum_sse2.c b/aom_dsp/x86/blk_sse_sum_sse2.c index bf89427872..8b816d6818 100644 --- a/aom_dsp/x86/blk_sse_sum_sse2.c +++ b/aom_dsp/x86/blk_sse_sum_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/common_avx2.h b/aom_dsp/x86/common_avx2.h index 96fe4ebb67..2f40dbbee9 100644 --- a/aom_dsp/x86/common_avx2.h +++ b/aom_dsp/x86/common_avx2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h index 4ca214f469..591817b931 100644 --- a/aom_dsp/x86/convolve.h +++ b/aom_dsp/x86/convolve.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h index f5a382ce4e..6658b2243d 100644 --- a/aom_dsp/x86/convolve_avx2.h +++ b/aom_dsp/x86/convolve_avx2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/convolve_common_intrin.h b/aom_dsp/x86/convolve_common_intrin.h index 9e8662af46..094229d484 100644 --- a/aom_dsp/x86/convolve_common_intrin.h +++ b/aom_dsp/x86/convolve_common_intrin.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/convolve_sse2.h b/aom_dsp/x86/convolve_sse2.h index 36b7d62b98..7c25a00011 100644 --- a/aom_dsp/x86/convolve_sse2.h +++ b/aom_dsp/x86/convolve_sse2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/convolve_sse4_1.h b/aom_dsp/x86/convolve_sse4_1.h index b1a3bb4664..33b1b83af8 100644 --- a/aom_dsp/x86/convolve_sse4_1.h +++ b/aom_dsp/x86/convolve_sse4_1.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/convolve_ssse3.h b/aom_dsp/x86/convolve_ssse3.h index b1abead146..288468b1f0 100644 --- a/aom_dsp/x86/convolve_ssse3.h +++ b/aom_dsp/x86/convolve_ssse3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/fft_avx2.c b/aom_dsp/x86/fft_avx2.c index 3f5a9bbeff..5b3eab1d48 100644 --- a/aom_dsp/x86/fft_avx2.c +++ b/aom_dsp/x86/fft_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/fft_sse2.c b/aom_dsp/x86/fft_sse2.c index bdd235bcd3..f73897acec 100644 --- a/aom_dsp/x86/fft_sse2.c +++ b/aom_dsp/x86/fft_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/fwd_txfm_impl_sse2.h b/aom_dsp/x86/fwd_txfm_impl_sse2.h index e1db3b950c..2239a968ac 100644 --- a/aom_dsp/x86/fwd_txfm_impl_sse2.h +++ b/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/fwd_txfm_sse2.c b/aom_dsp/x86/fwd_txfm_sse2.c index 0e4fb80468..64e654b064 100644 --- a/aom_dsp/x86/fwd_txfm_sse2.c +++ b/aom_dsp/x86/fwd_txfm_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h index 78ea98522e..3e418581c6 100644 --- a/aom_dsp/x86/fwd_txfm_sse2.h +++ b/aom_dsp/x86/fwd_txfm_sse2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/aom_dsp/x86/highbd_adaptive_quantize_avx2.c index 05c87bcff9..a07585f2a5 100644 --- a/aom_dsp/x86/highbd_adaptive_quantize_avx2.c +++ b/aom_dsp/x86/highbd_adaptive_quantize_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/aom_dsp/x86/highbd_adaptive_quantize_sse2.c index ae31116e9d..333e9f6995 100644 --- a/aom_dsp/x86/highbd_adaptive_quantize_sse2.c +++ b/aom_dsp/x86/highbd_adaptive_quantize_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_convolve_avx2.c b/aom_dsp/x86/highbd_convolve_avx2.c index 11e45778c0..8a234b8a29 100644 --- a/aom_dsp/x86/highbd_convolve_avx2.c +++ b/aom_dsp/x86/highbd_convolve_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_convolve_sse2.c b/aom_dsp/x86/highbd_convolve_sse2.c index 40201aa193..2f2b413d0a 100644 --- a/aom_dsp/x86/highbd_convolve_sse2.c +++ b/aom_dsp/x86/highbd_convolve_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_convolve_ssse3.c b/aom_dsp/x86/highbd_convolve_ssse3.c index 31c3c31b3c..e6cc02c7ca 100644 --- a/aom_dsp/x86/highbd_convolve_ssse3.c +++ b/aom_dsp/x86/highbd_convolve_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_intrapred_sse2.c b/aom_dsp/x86/highbd_intrapred_sse2.c index 6a2e915ed7..df8f6725de 100644 --- a/aom_dsp/x86/highbd_intrapred_sse2.c +++ b/aom_dsp/x86/highbd_intrapred_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_loopfilter_avx2.c b/aom_dsp/x86/highbd_loopfilter_avx2.c index c954da94e5..cd9f414ad2 100644 --- a/aom_dsp/x86/highbd_loopfilter_avx2.c +++ b/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c index ea7dc6a9e5..cc0bcd991d 100644 --- a/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
diff --git a/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/aom_dsp/x86/highbd_quantize_intrin_avx2.c
index 950465cf46..5ae3f90153 100644
--- a/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 3b0c42c4f5..d0b5bd1226 100644
--- a/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c
index 8b3045a610..8fb08b30ed 100644
--- a/aom_dsp/x86/highbd_sad_avx2.c
+++ b/aom_dsp/x86/highbd_sad_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/highbd_subtract_sse2.c b/aom_dsp/x86/highbd_subtract_sse2.c
index 3c3253bdf9..078737a47f 100644
--- a/aom_dsp/x86/highbd_subtract_sse2.c
+++ b/aom_dsp/x86/highbd_subtract_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c
index 21e9e8b282..e8e538b754 100644
--- a/aom_dsp/x86/highbd_variance_avx2.c
+++ b/aom_dsp/x86/highbd_variance_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index 2fc2e1c0dd..676208bfcf 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/highbd_variance_sse4.c b/aom_dsp/x86/highbd_variance_sse4.c
index df5449a9df..24bf1daebc 100644
--- a/aom_dsp/x86/highbd_variance_sse4.c
+++ b/aom_dsp/x86/highbd_variance_sse4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 242a548df9..4a7b862f32 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 61e29731c4..98d9b88d0c 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/intrapred_sse4.c b/aom_dsp/x86/intrapred_sse4.c
index 9de8bf3c0f..ebe772447e 100644
--- a/aom_dsp/x86/intrapred_sse4.c
+++ b/aom_dsp/x86/intrapred_sse4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 869f880bda..320e6b893d 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/intrapred_utils.h b/aom_dsp/x86/intrapred_utils.h
index 502574673e..1cc38f7175 100644
--- a/aom_dsp/x86/intrapred_utils.h
+++ b/aom_dsp/x86/intrapred_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/intrapred_x86.h b/aom_dsp/x86/intrapred_x86.h
index b13f575a76..f0b3ec6614 100644
--- a/aom_dsp/x86/intrapred_x86.h
+++ b/aom_dsp/x86/intrapred_x86.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/jnt_sad_sse2.c b/aom_dsp/x86/jnt_sad_sse2.c
index 16d2f4be7f..8559329ed4 100644
--- a/aom_dsp/x86/jnt_sad_sse2.c
+++ b/aom_dsp/x86/jnt_sad_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index ed5b580b73..5ca896be93 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/loopfilter_avx2.c b/aom_dsp/x86/loopfilter_avx2.c
index 6e77742e3c..bfcde46419 100644
--- a/aom_dsp/x86/loopfilter_avx2.c
+++ b/aom_dsp/x86/loopfilter_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index cdf24c332a..3b3f56c61a 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/lpf_common_sse2.h b/aom_dsp/x86/lpf_common_sse2.h
index 45464e80b1..6be06d5227 100644
--- a/aom_dsp/x86/lpf_common_sse2.h
+++ b/aom_dsp/x86/lpf_common_sse2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/masked_sad4d_ssse3.c b/aom_dsp/x86/masked_sad4d_ssse3.c
index d96a9dd23d..d2181a5a97 100644
--- a/aom_dsp/x86/masked_sad4d_ssse3.c
+++ b/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/masked_sad_intrin_avx2.c b/aom_dsp/x86/masked_sad_intrin_avx2.c
index f3751c7cb0..8800af7a46 100644
--- a/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ b/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c
index df3a8764e3..0c75a8be92 100644
--- a/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.h b/aom_dsp/x86/masked_sad_intrin_ssse3.h
index cffbd9672c..fa25910c41 100644
--- a/aom_dsp/x86/masked_sad_intrin_ssse3.h
+++ b/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
index 0bf383fffd..e23faef7ad 100644
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.h b/aom_dsp/x86/masked_variance_intrin_ssse3.h
index 4faa098ace..c25e5b8523 100644
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.h
+++ b/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/mem_sse2.h b/aom_dsp/x86/mem_sse2.h
index 085a572cb1..343f12f555 100644
--- a/aom_dsp/x86/mem_sse2.h
+++ b/aom_dsp/x86/mem_sse2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/obmc_intrinsic_sse4.h b/aom_dsp/x86/obmc_intrinsic_sse4.h
index fbed23596c..0962e75e04 100644
--- a/aom_dsp/x86/obmc_intrinsic_sse4.h
+++ b/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/obmc_intrinsic_ssse3.h b/aom_dsp/x86/obmc_intrinsic_ssse3.h
index 27398ffd62..8a4af4bc54 100644
--- a/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ b/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/obmc_sad_avx2.c b/aom_dsp/x86/obmc_sad_avx2.c
index 9d1b7d4968..471afd28f4 100644
--- a/aom_dsp/x86/obmc_sad_avx2.c
+++ b/aom_dsp/x86/obmc_sad_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/obmc_sad_sse4.c b/aom_dsp/x86/obmc_sad_sse4.c
index 542572c761..ba0f1a0b73 100644
--- a/aom_dsp/x86/obmc_sad_sse4.c
+++ b/aom_dsp/x86/obmc_sad_sse4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/obmc_variance_avx2.c b/aom_dsp/x86/obmc_variance_avx2.c
index c23d8c4eb0..e33238556a 100644
--- a/aom_dsp/x86/obmc_variance_avx2.c
+++ b/aom_dsp/x86/obmc_variance_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/obmc_variance_sse4.c b/aom_dsp/x86/obmc_variance_sse4.c
index 164d0c28c9..8e55ed0695 100644
--- a/aom_dsp/x86/obmc_variance_sse4.c
+++ b/aom_dsp/x86/obmc_variance_sse4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/quantize_avx2.c b/aom_dsp/x86/quantize_avx2.c
index b808d46778..ef9a0fdb62 100644
--- a/aom_dsp/x86/quantize_avx2.c
+++ b/aom_dsp/x86/quantize_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/quantize_sse2.c b/aom_dsp/x86/quantize_sse2.c
index ebef1fbac2..ca869e2803 100644
--- a/aom_dsp/x86/quantize_sse2.c
+++ b/aom_dsp/x86/quantize_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/quantize_ssse3.c b/aom_dsp/x86/quantize_ssse3.c
index 25980a055a..ce7adca17b 100644
--- a/aom_dsp/x86/quantize_ssse3.c
+++ b/aom_dsp/x86/quantize_ssse3.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/quantize_x86.h b/aom_dsp/x86/quantize_x86.h
index 5b040a278a..a795e281cc 100644
--- a/aom_dsp/x86/quantize_x86.h
+++ b/aom_dsp/x86/quantize_x86.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c
index 0fea6ddfd3..46fc1747b4 100644
--- a/aom_dsp/x86/sad4d_avx2.c
+++ b/aom_dsp/x86/sad4d_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/sad_avx2.c b/aom_dsp/x86/sad_avx2.c
index 24cea76b37..9a6c24a6e3 100644
--- a/aom_dsp/x86/sad_avx2.c
+++ b/aom_dsp/x86/sad_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/sad_impl_avx2.c b/aom_dsp/x86/sad_impl_avx2.c
index c5da6e9ab3..0d1b5ab876 100644
--- a/aom_dsp/x86/sad_impl_avx2.c
+++ b/aom_dsp/x86/sad_impl_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/sse_avx2.c b/aom_dsp/x86/sse_avx2.c
index c5a5f5c234..6bcca06990 100644
--- a/aom_dsp/x86/sse_avx2.c
+++ b/aom_dsp/x86/sse_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/sse_sse4.c b/aom_dsp/x86/sse_sse4.c
index 7e74554d75..e875d56f8f 100644
--- a/aom_dsp/x86/sse_sse4.c
+++ b/aom_dsp/x86/sse_sse4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/subtract_avx2.c b/aom_dsp/x86/subtract_avx2.c
index b4c5cc7c7b..4684206cd4 100644
--- a/aom_dsp/x86/subtract_avx2.c
+++ b/aom_dsp/x86/subtract_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/sum_squares_avx2.c b/aom_dsp/x86/sum_squares_avx2.c
index c748a7dcce..7ae58eef59 100644
--- a/aom_dsp/x86/sum_squares_avx2.c
+++ b/aom_dsp/x86/sum_squares_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c
index 6c34c44317..fe3a435cf4 100644
--- a/aom_dsp/x86/sum_squares_sse2.c
+++ b/aom_dsp/x86/sum_squares_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/sum_squares_sse2.h b/aom_dsp/x86/sum_squares_sse2.h
index 5ed3f2c7bf..38974f1462 100644
--- a/aom_dsp/x86/sum_squares_sse2.h
+++ b/aom_dsp/x86/sum_squares_sse2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
index ae889ad169..ddaa4fea5f 100644
--- a/aom_dsp/x86/synonyms.h
+++ b/aom_dsp/x86/synonyms.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h
index d78f4e6f98..2a130ef7f6 100644
--- a/aom_dsp/x86/synonyms_avx2.h
+++ b/aom_dsp/x86/synonyms_avx2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/transpose_sse2.h b/aom_dsp/x86/transpose_sse2.h
index 9dab750f44..dbf476f4cc 100644
--- a/aom_dsp/x86/transpose_sse2.h
+++ b/aom_dsp/x86/transpose_sse2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
index 4105250bc0..15403b9612 100644
--- a/aom_dsp/x86/txfm_common_avx2.h
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/txfm_common_sse2.h b/aom_dsp/x86/txfm_common_sse2.h
index 9c99eb93bd..75f55aa1fb 100644
--- a/aom_dsp/x86/txfm_common_sse2.h
+++ b/aom_dsp/x86/txfm_common_sse2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index 0f872fc392..06f11f3c9b 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c
index 57a1cee781..39e3fcf14f 100644
--- a/aom_dsp/x86/variance_impl_avx2.c
+++ b/aom_dsp/x86/variance_impl_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/variance_impl_ssse3.c b/aom_dsp/x86/variance_impl_ssse3.c
index 952cca1aab..feb41a8c06 100644
--- a/aom_dsp/x86/variance_impl_ssse3.c
+++ b/aom_dsp/x86/variance_impl_ssse3.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/variance_impl_ssse3.h b/aom_dsp/x86/variance_impl_ssse3.h
index 725b551c5c..3f38479c43 100644
--- a/aom_dsp/x86/variance_impl_ssse3.h
+++ b/aom_dsp/x86/variance_impl_ssse3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 610695af97..5f3f899383 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_dsp/x86/variance_ssse3.c b/aom_dsp/x86/variance_ssse3.c
index d616f43fdf..c95c7d8fec 100644
--- a/aom_dsp/x86/variance_ssse3.c
+++ b/aom_dsp/x86/variance_ssse3.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_mem/aom_mem.c b/aom_mem/aom_mem.c
index 807ddcf05e..f4126e34bd 100644
--- a/aom_mem/aom_mem.c
+++ b/aom_mem/aom_mem.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_mem/aom_mem.h b/aom_mem/aom_mem.h
index ca4af7fc61..cab6afe894 100644
--- a/aom_mem/aom_mem.h
+++ b/aom_mem/aom_mem.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_mem/include/aom_mem_intrnl.h b/aom_mem/include/aom_mem_intrnl.h
index 2c9819de92..ed47a0b60b 100644
--- a/aom_mem/include/aom_mem_intrnl.h
+++ b/aom_mem/include/aom_mem_intrnl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/aarch32_cpudetect.c b/aom_ports/aarch32_cpudetect.c
index 809bae5920..af6774c5d2 100644
--- a/aom_ports/aarch32_cpudetect.c
+++ b/aom_ports/aarch32_cpudetect.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/aarch64_cpudetect.c b/aom_ports/aarch64_cpudetect.c
index e356763901..47b4135fb3 100644
--- a/aom_ports/aarch64_cpudetect.c
+++ b/aom_ports/aarch64_cpudetect.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/aom_once.h b/aom_ports/aom_once.h
index 680120feea..44081a1a2f 100644
--- a/aom_ports/aom_once.h
+++ b/aom_ports/aom_once.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/aom_timer.h b/aom_ports/aom_timer.h
index 642c5a08ba..a521af2038 100644
--- a/aom_ports/aom_timer.h
+++ b/aom_ports/aom_timer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/arm.h b/aom_ports/arm.h
index a57510895b..d1d47692d5 100644
--- a/aom_ports/arm.h
+++ b/aom_ports/arm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/arm_cpudetect.h b/aom_ports/arm_cpudetect.h
index 2b63942424..b8fcb9b7fb 100644
--- a/aom_ports/arm_cpudetect.h
+++ b/aom_ports/arm_cpudetect.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/bitops.h b/aom_ports/bitops.h
index 7db4cde90b..9a5d6684f6 100644
--- a/aom_ports/bitops.h
+++ b/aom_ports/bitops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/emmintrin_compat.h b/aom_ports/emmintrin_compat.h
index 85d218a3d2..9fb55d7fac 100644
--- a/aom_ports/emmintrin_compat.h
+++ b/aom_ports/emmintrin_compat.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/mem.h b/aom_ports/mem.h
index 77180068ae..fd33290330 100644
--- a/aom_ports/mem.h
+++ b/aom_ports/mem.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/mem_ops.h b/aom_ports/mem_ops.h
index 2b5bc0f0fb..4e32fd51a5 100644
--- a/aom_ports/mem_ops.h
+++ b/aom_ports/mem_ops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/mem_ops_aligned.h b/aom_ports/mem_ops_aligned.h
index 37c3675318..411133d4ef 100644
--- a/aom_ports/mem_ops_aligned.h
+++ b/aom_ports/mem_ops_aligned.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/ppc.h b/aom_ports/ppc.h
index 3159bda682..d6f50f6fd5 100644
--- a/aom_ports/ppc.h
+++ b/aom_ports/ppc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/ppc_cpudetect.c b/aom_ports/ppc_cpudetect.c
index ce4d5ae231..f1c3cd7e82 100644
--- a/aom_ports/ppc_cpudetect.c
+++ b/aom_ports/ppc_cpudetect.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/sanitizer.h b/aom_ports/sanitizer.h
index 1dd8eb4cf4..f9819f6540 100644
--- a/aom_ports/sanitizer.h
+++ b/aom_ports/sanitizer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_ports/x86.h b/aom_ports/x86.h
index c089984085..7a45b4e2c2 100644
--- a/aom_ports/x86.h
+++ b/aom_ports/x86.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_scale/aom_scale.h b/aom_scale/aom_scale.h
index 11812a1453..4411397bb2 100644
--- a/aom_scale/aom_scale.h
+++ b/aom_scale/aom_scale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_scale/aom_scale_rtcd.c b/aom_scale/aom_scale_rtcd.c
index 93def357d8..9cec877c85 100644
--- a/aom_scale/aom_scale_rtcd.c
+++ b/aom_scale/aom_scale_rtcd.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_scale/generic/aom_scale.c b/aom_scale/generic/aom_scale.c
index 206c42c9f5..85ae1e2c43 100644
--- a/aom_scale/generic/aom_scale.c
+++ b/aom_scale/generic/aom_scale.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_scale/generic/gen_scalers.c b/aom_scale/generic/gen_scalers.c
index 549e2aa690..6c8df70d96 100644
--- a/aom_scale/generic/gen_scalers.c
+++ b/aom_scale/generic/gen_scalers.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_scale/generic/yv12config.c b/aom_scale/generic/yv12config.c
index ed35bb1acb..77b016ed0e 100644
--- a/aom_scale/generic/yv12config.c
+++ b/aom_scale/generic/yv12config.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_scale/generic/yv12extend.c b/aom_scale/generic/yv12extend.c
index 384b72c21e..e40bba320e 100644
--- a/aom_scale/generic/yv12extend.c
+++ b/aom_scale/generic/yv12extend.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h
index bc05de2102..78fe1512e5 100644
--- a/aom_scale/yv12config.h
+++ b/aom_scale/yv12config.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_util/aom_pthread.h b/aom_util/aom_pthread.h
index 425a6b00f1..2021f13f5c 100644
--- a/aom_util/aom_pthread.h
+++ b/aom_util/aom_pthread.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_util/aom_thread.c b/aom_util/aom_thread.c
index 783ffac32f..eabb48ca2a 100644
--- a/aom_util/aom_thread.c
+++ b/aom_util/aom_thread.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_util/aom_thread.h b/aom_util/aom_thread.h
index 80ed314752..8d161a641d 100644
--- a/aom_util/aom_thread.h
+++ b/aom_util/aom_thread.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_util/debug_util.c b/aom_util/debug_util.c
index d0792e34a4..86c4629915 100644
--- a/aom_util/debug_util.c
+++ b/aom_util/debug_util.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_util/debug_util.h b/aom_util/debug_util.h
index 23cad2a5b9..6a2d318a7d 100644
--- a/aom_util/debug_util.h
+++ b/aom_util/debug_util.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/aom_util/endian_inl.h b/aom_util/endian_inl.h
index b69102a7f5..17090cab01 100644
--- a/aom_util/endian_inl.h
+++ b/aom_util/endian_inl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/apps/aomdec.c b/apps/aomdec.c
index 15734cb6a9..144cacac7e 100644
--- a/apps/aomdec.c
+++ b/apps/aomdec.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/apps/aomenc.c b/apps/aomenc.c
index 799fb3a4f8..555c6d8839 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/apps/aomenc.h b/apps/aomenc.h
index 935d5fcd16..ebbd79972e 100644
--- a/apps/aomenc.h
+++ b/apps/aomenc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/arg_defs.c b/av1/arg_defs.c
index 057565411a..d0f6814e5e 100644
--- a/av1/arg_defs.c
+++ b/av1/arg_defs.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/arg_defs.h b/av1/arg_defs.h
index 73c78caec8..b7e8440f15 100644
--- a/av1/arg_defs.h
+++ b/av1/arg_defs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 690d95927a..6f9125e6a5 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/av1_cx_iface.h b/av1/av1_cx_iface.h
index b2a7005ea5..92baf20d2c 100644
--- a/av1/av1_cx_iface.h
+++ b/av1/av1_cx_iface.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index 18dc980f6f..77ec4c5da6 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/av1_iface_common.h b/av1/av1_iface_common.h
index b923c3dcff..c1a2a5b252 100644
--- a/av1/av1_iface_common.h
+++ b/av1/av1_iface_common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index e9a38c4a60..e8a565d535 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/alloccommon.h b/av1/common/alloccommon.h
index d31b4c56b6..ccbf757893 100644
--- a/av1/common/alloccommon.h
+++ b/av1/common/alloccommon.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c
index 114232d50a..2ba7e86874 100644
--- a/av1/common/arm/av1_convolve_scale_neon.c
+++ b/av1/common/arm/av1_convolve_scale_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/av1_convolve_scale_neon_dotprod.c b/av1/common/arm/av1_convolve_scale_neon_dotprod.c
index 70ae88cf1f..a66512af96 100644
--- a/av1/common/arm/av1_convolve_scale_neon_dotprod.c
+++ b/av1/common/arm/av1_convolve_scale_neon_dotprod.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/av1_convolve_scale_neon_i8mm.c b/av1/common/arm/av1_convolve_scale_neon_i8mm.c
index fe94c84f3e..7970b3645a 100644
--- a/av1/common/arm/av1_convolve_scale_neon_i8mm.c
+++ b/av1/common/arm/av1_convolve_scale_neon_i8mm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c
index 06168cd3f1..8188a06e17 100644
--- a/av1/common/arm/av1_inv_txfm_neon.c
+++ b/av1/common/arm/av1_inv_txfm_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/av1_inv_txfm_neon.h b/av1/common/arm/av1_inv_txfm_neon.h
index 97099c2042..1fb9066876 100644
--- a/av1/common/arm/av1_inv_txfm_neon.h
+++ b/av1/common/arm/av1_inv_txfm_neon.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/av1_txfm_neon.c b/av1/common/arm/av1_txfm_neon.c
index f955a379f7..91367a133c 100644
--- a/av1/common/arm/av1_txfm_neon.c
+++ b/av1/common/arm/av1_txfm_neon.c
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index 7afb1a909d..2904bbce44 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index 9aea29992a..1ef9e9e530 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 53d3a9f1e0..8320c71523 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index 0871b4fe06..e872038d85 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/compound_convolve_neon.c b/av1/common/arm/compound_convolve_neon.c
index 6a596234dc..cae9b4bc8f 100644
--- a/av1/common/arm/compound_convolve_neon.c
+++ b/av1/common/arm/compound_convolve_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/compound_convolve_neon.h b/av1/common/arm/compound_convolve_neon.h
index d719680a32..c72e1680b2 100644
--- a/av1/common/arm/compound_convolve_neon.h
+++ b/av1/common/arm/compound_convolve_neon.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/compound_convolve_neon_dotprod.c b/av1/common/arm/compound_convolve_neon_dotprod.c
index 40befdf44e..f7261d783f 100644
--- a/av1/common/arm/compound_convolve_neon_dotprod.c
+++ b/av1/common/arm/compound_convolve_neon_dotprod.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/compound_convolve_neon_i8mm.c b/av1/common/arm/compound_convolve_neon_i8mm.c
index a72af9e36a..9801ad8ce9 100644
--- a/av1/common/arm/compound_convolve_neon_i8mm.c
+++ b/av1/common/arm/compound_convolve_neon_i8mm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 70cf23be06..35aa8122f4 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 5a9f8b6d39..b86d5739bf 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
index 32b056dc29..6e2a703065 100644
--- a/av1/common/arm/convolve_neon_dotprod.c
+++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c
index cd989cb1da..8d7eb51772 100644
--- a/av1/common/arm/convolve_neon_i8mm.c
+++ b/av1/common/arm/convolve_neon_i8mm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/convolve_neon_i8mm.h b/av1/common/arm/convolve_neon_i8mm.h
index 15a8a4e98c..ddd8364ea5 100644
--- a/av1/common/arm/convolve_neon_i8mm.h
+++ b/av1/common/arm/convolve_neon_i8mm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/convolve_scale_neon.h b/av1/common/arm/convolve_scale_neon.h
index 2253b54037..c164ff3854 100644
--- a/av1/common/arm/convolve_scale_neon.h
+++ b/av1/common/arm/convolve_scale_neon.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 
 #ifndef AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_
diff --git a/av1/common/arm/convolve_sve2.c b/av1/common/arm/convolve_sve2.c
index a274730548..832a3b4e5e 100644
--- a/av1/common/arm/convolve_sve2.c
+++ b/av1/common/arm/convolve_sve2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c
index 9247ded6bf..c6fc7642ce 100644
--- a/av1/common/arm/highbd_compound_convolve_neon.c
+++ b/av1/common/arm/highbd_compound_convolve_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_compound_convolve_neon.h b/av1/common/arm/highbd_compound_convolve_neon.h
index c9344f3adf..ae5a43be01 100644
--- a/av1/common/arm/highbd_compound_convolve_neon.h
+++ b/av1/common/arm/highbd_compound_convolve_neon.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c
index 1d6c9b4faf..e5909219c3 100644
--- a/av1/common/arm/highbd_compound_convolve_sve2.c
+++ b/av1/common/arm/highbd_compound_convolve_sve2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_convolve_horiz_rs_neon.c b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
index 4f1c25d122..d6a994b1c8 100644
--- a/av1/common/arm/highbd_convolve_horiz_rs_neon.c
+++ b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_convolve_neon.c b/av1/common/arm/highbd_convolve_neon.c
index 3a3e33fcba..2c392e381c 100644
--- a/av1/common/arm/highbd_convolve_neon.c
+++ b/av1/common/arm/highbd_convolve_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_convolve_neon.h b/av1/common/arm/highbd_convolve_neon.h
index 08b2bda4e5..a32d63e022 100644
--- a/av1/common/arm/highbd_convolve_neon.h
+++ b/av1/common/arm/highbd_convolve_neon.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_convolve_scale_neon.c b/av1/common/arm/highbd_convolve_scale_neon.c
index 702c651536..a51848118a 100644
--- a/av1/common/arm/highbd_convolve_scale_neon.c
+++ b/av1/common/arm/highbd_convolve_scale_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_convolve_sve2.c b/av1/common/arm/highbd_convolve_sve2.c
index 6ce9f36d9a..5de87d8291 100644
--- a/av1/common/arm/highbd_convolve_sve2.c
+++ b/av1/common/arm/highbd_convolve_sve2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_convolve_sve2.h b/av1/common/arm/highbd_convolve_sve2.h
index 05e23deef4..380607716f 100644
--- a/av1/common/arm/highbd_convolve_sve2.h
+++ b/av1/common/arm/highbd_convolve_sve2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_inv_txfm_neon.c b/av1/common/arm/highbd_inv_txfm_neon.c
index 70c09e7440..cfaa3e5ca7 100644
--- a/av1/common/arm/highbd_inv_txfm_neon.c
+++ b/av1/common/arm/highbd_inv_txfm_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_reconinter_neon.c b/av1/common/arm/highbd_reconinter_neon.c
index da7f6c57d0..10f592f257 100644
--- a/av1/common/arm/highbd_reconinter_neon.c
+++ b/av1/common/arm/highbd_reconinter_neon.c
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_reconintra_neon.c b/av1/common/arm/highbd_reconintra_neon.c
index 8fd4a9941f..5bfe61e431 100644
--- a/av1/common/arm/highbd_reconintra_neon.c
+++ b/av1/common/arm/highbd_reconintra_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_warp_plane_neon.c b/av1/common/arm/highbd_warp_plane_neon.c
index 51bf142fed..5bd19b0f30 100644
--- a/av1/common/arm/highbd_warp_plane_neon.c
+++ b/av1/common/arm/highbd_warp_plane_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_warp_plane_neon.h b/av1/common/arm/highbd_warp_plane_neon.h
index 2ec45d1e0d..b90213d2b2 100644
--- a/av1/common/arm/highbd_warp_plane_neon.h
+++ b/av1/common/arm/highbd_warp_plane_neon.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_warp_plane_sve.c b/av1/common/arm/highbd_warp_plane_sve.c
index c2e1e995bd..9183fd027e 100644
--- a/av1/common/arm/highbd_warp_plane_sve.c
+++ b/av1/common/arm/highbd_warp_plane_sve.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c
index a6bd6d38e4..b5b85fe693 100644
--- a/av1/common/arm/highbd_wiener_convolve_neon.c
+++ b/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/reconinter_neon.c b/av1/common/arm/reconinter_neon.c
index 2b0274cc64..a7f368948b 100644
--- a/av1/common/arm/reconinter_neon.c
+++ b/av1/common/arm/reconinter_neon.c
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/reconintra_neon.c b/av1/common/arm/reconintra_neon.c
index d31c4a9443..a9b52d7788 100644
--- a/av1/common/arm/reconintra_neon.c
+++ b/av1/common/arm/reconintra_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c
index a6d4b62964..ae0da3d018 100644
--- a/av1/common/arm/resize_neon.c
+++ b/av1/common/arm/resize_neon.c
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c
index 08e298f7f3..e613ecb98e 100644
--- a/av1/common/arm/selfguided_neon.c
+++ b/av1/common/arm/selfguided_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c
index 546aa2965b..2604eaf27c 100644
--- a/av1/common/arm/warp_plane_neon.c
+++ b/av1/common/arm/warp_plane_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/warp_plane_neon.h b/av1/common/arm/warp_plane_neon.h
index eece007ef3..7fceb3aaf6 100644
--- a/av1/common/arm/warp_plane_neon.h
+++ b/av1/common/arm/warp_plane_neon.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/warp_plane_neon_i8mm.c b/av1/common/arm/warp_plane_neon_i8mm.c
index 22a1be17b5..9ccc863eb2 100644
--- a/av1/common/arm/warp_plane_neon_i8mm.c
+++ b/av1/common/arm/warp_plane_neon_i8mm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c
index c70b066174..9d5761b05a 100644
--- a/av1/common/arm/warp_plane_sve.c
+++ b/av1/common/arm/warp_plane_sve.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c
index 6440c16adb..0457f66c26 100644
--- a/av1/common/arm/wiener_convolve_neon.c
+++ b/av1/common/arm/wiener_convolve_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h
index 4e14c4a8be..857a2ea3db 100644
--- a/av1/common/av1_common_int.h
+++ b/av1/common/av1_common_int.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/av1_inv_txfm1d.c b/av1/common/av1_inv_txfm1d.c index 8d69efcd2d..37c2091490 100644 --- a/av1/common/av1_inv_txfm1d.c +++ b/av1/common/av1_inv_txfm1d.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/av1_inv_txfm1d.h b/av1/common/av1_inv_txfm1d.h index e1d5d98d10..e1044d31d1 100644 --- a/av1/common/av1_inv_txfm1d.h +++ b/av1/common/av1_inv_txfm1d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/av1_inv_txfm1d_cfg.h b/av1/common/av1_inv_txfm1d_cfg.h index b4f7801295..0f5b80aac9 100644 --- a/av1/common/av1_inv_txfm1d_cfg.h +++ b/av1/common/av1_inv_txfm1d_cfg.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/av1_inv_txfm2d.c b/av1/common/av1_inv_txfm2d.c index ee67dffe23..ef9521b2db 100644 --- a/av1/common/av1_inv_txfm2d.c +++ b/av1/common/av1_inv_txfm2d.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c index 5af025c654..a6e3bb22c2 100644 --- a/av1/common/av1_loopfilter.c +++ b/av1/common/av1_loopfilter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h index c9880cf5da..2ecc47c72b 100644 --- a/av1/common/av1_loopfilter.h +++ b/av1/common/av1_loopfilter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/av1_rtcd.c b/av1/common/av1_rtcd.c index 8a35dca369..21f1a81f40 100644 --- a/av1/common/av1_rtcd.c +++ b/av1/common/av1_rtcd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/common/av1_txfm.c b/av1/common/av1_txfm.c index 011403b1fa..751851c2f0 100644 --- a/av1/common/av1_txfm.c +++ b/av1/common/av1_txfm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h index 7ad70af86a..f406109afd 100644 --- a/av1/common/av1_txfm.h +++ b/av1/common/av1_txfm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/blockd.c b/av1/common/blockd.c index 1d597502ce..7e5f67dcda 100644 --- a/av1/common/blockd.c +++ b/av1/common/blockd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/blockd.h b/av1/common/blockd.h index 0cfd1f3954..3b7f21e44f 100644 --- a/av1/common/blockd.h +++ b/av1/common/blockd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/cdef.c b/av1/common/cdef.c index 5cec940a8e..1f50fc91e0 100644 --- a/av1/common/cdef.c +++ b/av1/common/cdef.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/cdef.h b/av1/common/cdef.h index a56cd9db4a..b84f861b9d 100644 --- a/av1/common/cdef.h +++ b/av1/common/cdef.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/cdef_block.c b/av1/common/cdef_block.c index ce7039f374..063d8d3941 100644 --- a/av1/common/cdef_block.c +++ b/av1/common/cdef_block.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/cdef_block.h b/av1/common/cdef_block.h index b5e4f124ae..0ecff38608 100644 --- a/av1/common/cdef_block.h +++ b/av1/common/cdef_block.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h index 5c62201f1e..58cfd3183e 100644 --- a/av1/common/cdef_block_simd.h +++ b/av1/common/cdef_block_simd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/cfl.c b/av1/common/cfl.c index bd11c4a6a0..652cb4d0f5 100644 --- a/av1/common/cfl.c +++ b/av1/common/cfl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/cfl.h b/av1/common/cfl.h index dbb94d665b..8f093b0d96 100644 --- a/av1/common/cfl.h +++ b/av1/common/cfl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/common.h b/av1/common/common.h index ccb45b68ce..770cd76920 100644 --- a/av1/common/common.h +++ b/av1/common/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/common_data.c b/av1/common/common_data.c index 482aecfcc0..a49ae8abf7 100644 --- a/av1/common/common_data.c +++ b/av1/common/common_data.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/common_data.h b/av1/common/common_data.h index dfe927c6ef..677c4a4ba0 100644 --- a/av1/common/common_data.h +++ b/av1/common/common_data.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/convolve.c b/av1/common/convolve.c index bb72e0cbd2..3203e31c8a 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/common/convolve.h b/av1/common/convolve.h index d6dd8763c3..3d679a718a 100644 --- a/av1/common/convolve.h +++ b/av1/common/convolve.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/debugmodes.c b/av1/common/debugmodes.c index e67cf04a3f..d9a96721af 100644 --- a/av1/common/debugmodes.c +++ b/av1/common/debugmodes.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/debugmodes.h b/av1/common/debugmodes.h index 8f3a91cf46..d352dcd592 100644 --- a/av1/common/debugmodes.h +++ b/av1/common/debugmodes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/entropy.c b/av1/common/entropy.c index 97d95ea394..8c5d675e4c 100644 --- a/av1/common/entropy.c +++ b/av1/common/entropy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/entropy.h b/av1/common/entropy.h index 53ef3b1c89..f8332d4ce4 100644 --- a/av1/common/entropy.h +++ b/av1/common/entropy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c index 8381c1fdd0..e66cd2fc43 100644 --- a/av1/common/entropymode.c +++ b/av1/common/entropymode.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h index 028bd21ae3..c688b21746 100644 --- a/av1/common/entropymode.h +++ b/av1/common/entropymode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/entropymv.c b/av1/common/entropymv.c index e1e42f2f18..f2f920fe5b 100644 --- a/av1/common/entropymv.c +++ b/av1/common/entropymv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/entropymv.h b/av1/common/entropymv.h index cddc80768c..bd4b7e9ece 100644 --- a/av1/common/entropymv.h +++ b/av1/common/entropymv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/enums.h b/av1/common/enums.h index b99a138675..4abdd6f3ea 100644 --- a/av1/common/enums.h +++ b/av1/common/enums.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/filter.h b/av1/common/filter.h index 4344aea916..9814828f4d 100644 --- a/av1/common/filter.h +++ b/av1/common/filter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/frame_buffers.c b/av1/common/frame_buffers.c index f10ccd5942..9db93fb5c9 100644 --- a/av1/common/frame_buffers.c +++ b/av1/common/frame_buffers.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/frame_buffers.h b/av1/common/frame_buffers.h index 16188e51c7..0c067d9377 100644 --- a/av1/common/frame_buffers.h +++ b/av1/common/frame_buffers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/idct.c b/av1/common/idct.c index d4b1c98831..bfd7d5c1c7 100644 --- a/av1/common/idct.c +++ b/av1/common/idct.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/idct.h b/av1/common/idct.h index 004d25d49a..799e38a383 100644 --- a/av1/common/idct.h +++ b/av1/common/idct.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/common/mv.h b/av1/common/mv.h index 6828834e05..d83d4fa60e 100644 --- a/av1/common/mv.h +++ b/av1/common/mv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c index d8889f3eb3..d78b60124a 100644 --- a/av1/common/mvref_common.c +++ b/av1/common/mvref_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/mvref_common.h b/av1/common/mvref_common.h index 3ab784c1ed..beaf55c8fd 100644 --- a/av1/common/mvref_common.h +++ b/av1/common/mvref_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/obmc.h b/av1/common/obmc.h index b84034541e..6a4595c852 100644 --- a/av1/common/obmc.h +++ b/av1/common/obmc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/obu_util.c b/av1/common/obu_util.c index cfca03bb4d..b68b804036 100644 --- a/av1/common/obu_util.c +++ b/av1/common/obu_util.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/obu_util.h b/av1/common/obu_util.h index adf3568e15..759b53e28a 100644 --- a/av1/common/obu_util.h +++ b/av1/common/obu_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/ppc/cfl_ppc.c b/av1/common/ppc/cfl_ppc.c index 27a7f07a0d..675d7f3859 100644 --- a/av1/common/ppc/cfl_ppc.c +++ b/av1/common/ppc/cfl_ppc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/pred_common.c b/av1/common/pred_common.c index 5952441d1f..ce440abfeb 100644 --- a/av1/common/pred_common.c +++ b/av1/common/pred_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/pred_common.h b/av1/common/pred_common.h index 361a4078d4..7bc7cc0417 100644 --- a/av1/common/pred_common.h +++ b/av1/common/pred_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c index 58eb113370..dc033b139c 100644 --- a/av1/common/quant_common.c +++ b/av1/common/quant_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/quant_common.h b/av1/common/quant_common.h index 8f36eb105b..347b02bcf5 100644 --- a/av1/common/quant_common.h +++ b/av1/common/quant_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c index 602fab7237..d4d5f9848e 100644 --- a/av1/common/reconinter.c +++ b/av1/common/reconinter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h index c31f4531e2..d7a4d12c07 100644 --- a/av1/common/reconinter.h +++ b/av1/common/reconinter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c index 497863e117..3fd1860011 100644 --- a/av1/common/reconintra.c +++ b/av1/common/reconintra.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h index fa66ccd541..80c15d03e0 100644 --- a/av1/common/reconintra.h +++ b/av1/common/reconintra.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/common/resize.c b/av1/common/resize.c index 505fccd43b..d3e3850a19 100644 --- a/av1/common/resize.c +++ b/av1/common/resize.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/resize.h b/av1/common/resize.h index 6b233f8259..25551a5fc3 100644 --- a/av1/common/resize.h +++ b/av1/common/resize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/restoration.c b/av1/common/restoration.c index 335fdc8c2a..4e0dee54c2 100644 --- a/av1/common/restoration.c +++ b/av1/common/restoration.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/restoration.h b/av1/common/restoration.h index 644e06980f..1603984779 100644 --- a/av1/common/restoration.h +++ b/av1/common/restoration.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/scale.c b/av1/common/scale.c index d7c6a24378..773c4f05e3 100644 --- a/av1/common/scale.c +++ b/av1/common/scale.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/scale.h b/av1/common/scale.h index d8481bfc2c..bc8326c844 100644 --- a/av1/common/scale.h +++ b/av1/common/scale.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/scan.c b/av1/common/scan.c index 0943579db1..743cf08f62 100644 --- a/av1/common/scan.c +++ b/av1/common/scan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/scan.h b/av1/common/scan.h index 4f369786f2..ee6375a2d4 100644 --- a/av1/common/scan.h +++ b/av1/common/scan.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/seg_common.c b/av1/common/seg_common.c index 60b185161c..12f1598153 100644 --- a/av1/common/seg_common.c +++ b/av1/common/seg_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/seg_common.h b/av1/common/seg_common.h index 44b508b146..1ad1ae2213 100644 --- a/av1/common/seg_common.h +++ b/av1/common/seg_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c index 8a137cc9f7..6b12bb7708 100644 --- a/av1/common/thread_common.c +++ b/av1/common/thread_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h index 7e681f322b..6a7ff14aae 100644 --- a/av1/common/thread_common.h +++ b/av1/common/thread_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/tile_common.c b/av1/common/tile_common.c index 45a189d69a..a8f2f0231e 100644 --- a/av1/common/tile_common.c +++ b/av1/common/tile_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/tile_common.h b/av1/common/tile_common.h index 12228c9e94..89f290f2d9 100644 --- a/av1/common/tile_common.h +++ b/av1/common/tile_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/timing.c b/av1/common/timing.c index a959cdf768..a98af817ee 100644 --- a/av1/common/timing.c +++ b/av1/common/timing.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/common/timing.h b/av1/common/timing.h index 9192124f72..8a56fa2406 100644 --- a/av1/common/timing.h +++ b/av1/common/timing.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/token_cdfs.h b/av1/common/token_cdfs.h index f1edda58d7..7aa49819d8 100644 --- a/av1/common/token_cdfs.h +++ b/av1/common/token_cdfs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/txb_common.c b/av1/common/txb_common.c index bf2bc36b04..a6c2c33edb 100644 --- a/av1/common/txb_common.c +++ b/av1/common/txb_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/txb_common.h b/av1/common/txb_common.h index 9628090b63..e1f6104eb2 100644 --- a/av1/common/txb_common.h +++ b/av1/common/txb_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c index 4282b92bfa..4e41cc4448 100644 --- a/av1/common/warped_motion.c +++ b/av1/common/warped_motion.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h index d772df8873..00ede2afa7 100644 --- a/av1/common/warped_motion.h +++ b/av1/common/warped_motion.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/av1/common/x86/av1_convolve_horiz_rs_sse4.c index 8aa14696f6..4d98a4fb23 100644 --- a/av1/common/x86/av1_convolve_horiz_rs_sse4.c +++ b/av1/common/x86/av1_convolve_horiz_rs_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c index 8e293b5bb1..1f33ccaea8 100644 --- a/av1/common/x86/av1_convolve_scale_sse4.c +++ b/av1/common/x86/av1_convolve_scale_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/av1_inv_txfm_avx2.c b/av1/common/x86/av1_inv_txfm_avx2.c index 0afd42b170..0639fb481f 100644 --- a/av1/common/x86/av1_inv_txfm_avx2.c +++ b/av1/common/x86/av1_inv_txfm_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/av1_inv_txfm_avx2.h b/av1/common/x86/av1_inv_txfm_avx2.h index a09dea389f..6f02149a4f 100644 --- a/av1/common/x86/av1_inv_txfm_avx2.h +++ b/av1/common/x86/av1_inv_txfm_avx2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/av1_inv_txfm_ssse3.c b/av1/common/x86/av1_inv_txfm_ssse3.c index 79a6064c3e..ee40e3586c 100644 --- a/av1/common/x86/av1_inv_txfm_ssse3.c +++ b/av1/common/x86/av1_inv_txfm_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/av1_inv_txfm_ssse3.h b/av1/common/x86/av1_inv_txfm_ssse3.h index 1873d01bc0..b6ffc392d5 100644 --- a/av1/common/x86/av1_inv_txfm_ssse3.h +++ b/av1/common/x86/av1_inv_txfm_ssse3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h index 129721cf05..8e4cba502c 100644 --- a/av1/common/x86/av1_txfm_sse2.h +++ b/av1/common/x86/av1_txfm_sse2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/av1_txfm_sse4.c b/av1/common/x86/av1_txfm_sse4.c index 1894efdc10..0de445f93d 100644 --- a/av1/common/x86/av1_txfm_sse4.c +++ b/av1/common/x86/av1_txfm_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/av1_txfm_sse4.h b/av1/common/x86/av1_txfm_sse4.h index 387dfd6bb3..bee03b40b5 100644 --- a/av1/common/x86/av1_txfm_sse4.h +++ b/av1/common/x86/av1_txfm_sse4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/cdef_block_avx2.c b/av1/common/x86/cdef_block_avx2.c index 1ec4b6c332..7a2aa11e21 100644 --- a/av1/common/x86/cdef_block_avx2.c +++ b/av1/common/x86/cdef_block_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/cdef_block_sse4.c b/av1/common/x86/cdef_block_sse4.c index 344c1e47c9..10ec1a6408 100644 --- a/av1/common/x86/cdef_block_sse4.c +++ b/av1/common/x86/cdef_block_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/cdef_block_ssse3.c b/av1/common/x86/cdef_block_ssse3.c index 14eb6c9e31..7af8c45047 100644 --- a/av1/common/x86/cdef_block_ssse3.c +++ b/av1/common/x86/cdef_block_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/cfl_avx2.c b/av1/common/x86/cfl_avx2.c index e1e187c4a6..b2b6cae933 100644 --- a/av1/common/x86/cfl_avx2.c +++ b/av1/common/x86/cfl_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h index 03ae02a922..9f46ab5781 100644 --- a/av1/common/x86/cfl_simd.h +++ b/av1/common/x86/cfl_simd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/cfl_sse2.c b/av1/common/x86/cfl_sse2.c index 4783fe098c..d5e90aba01 100644 --- a/av1/common/x86/cfl_sse2.c +++ b/av1/common/x86/cfl_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c index 476b6609a9..1339c24498 100644 --- a/av1/common/x86/cfl_ssse3.c +++ b/av1/common/x86/cfl_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c index d4c1169cc3..d23645ce94 100644 --- a/av1/common/x86/convolve_2d_avx2.c +++ b/av1/common/x86/convolve_2d_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c index 68971eacc1..187bfb3418 100644 --- a/av1/common/x86/convolve_2d_sse2.c +++ b/av1/common/x86/convolve_2d_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c index 3862bbeac1..48bb8306cb 100644 --- a/av1/common/x86/convolve_avx2.c +++ b/av1/common/x86/convolve_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c index 9272e91b54..c64f7259ab 100644 --- a/av1/common/x86/convolve_sse2.c +++ b/av1/common/x86/convolve_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/filterintra_sse4.c b/av1/common/x86/filterintra_sse4.c index d05bb0e15f..bb247499d0 100644 --- a/av1/common/x86/filterintra_sse4.c +++ b/av1/common/x86/filterintra_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_convolve_2d_avx2.c b/av1/common/x86/highbd_convolve_2d_avx2.c index d65318ccfa..c15f13cdda 100644 --- a/av1/common/x86/highbd_convolve_2d_avx2.c +++ b/av1/common/x86/highbd_convolve_2d_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_convolve_2d_sse4.c b/av1/common/x86/highbd_convolve_2d_sse4.c index 89d7199f48..2e9dda9b45 100644 --- a/av1/common/x86/highbd_convolve_2d_sse4.c +++ b/av1/common/x86/highbd_convolve_2d_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_convolve_2d_ssse3.c b/av1/common/x86/highbd_convolve_2d_ssse3.c index 88974ba260..5a6f3da294 100644 --- a/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c index 73e6911d0b..61a0e01524 100644 --- a/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/av1/common/x86/highbd_inv_txfm_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c index c12022cff8..ac959420c0 100644 --- a/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/av1/common/x86/highbd_inv_txfm_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c index 6dcac10e45..80747ea45e 100644 --- a/av1/common/x86/highbd_jnt_convolve_avx2.c +++ b/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_jnt_convolve_sse4.c b/av1/common/x86/highbd_jnt_convolve_sse4.c index 5a7fc536a2..9d09783cad 100644 --- a/av1/common/x86/highbd_jnt_convolve_sse4.c +++ b/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_txfm_utility_sse4.h b/av1/common/x86/highbd_txfm_utility_sse4.h index 5734810f52..54a35a9f62 100644 --- a/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/av1/common/x86/highbd_txfm_utility_sse4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_warp_affine_avx2.c b/av1/common/x86/highbd_warp_affine_avx2.c index 75108b49da..3708ab827f 100644 --- a/av1/common/x86/highbd_warp_affine_avx2.c +++ b/av1/common/x86/highbd_warp_affine_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c index 96fb4cf632..5f7bf04675 100644 --- a/av1/common/x86/highbd_warp_plane_sse4.c +++ b/av1/common/x86/highbd_warp_plane_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_wiener_convolve_avx2.c b/av1/common/x86/highbd_wiener_convolve_avx2.c index 562c623fa9..f7b04a6878 100644 --- a/av1/common/x86/highbd_wiener_convolve_avx2.c +++ b/av1/common/x86/highbd_wiener_convolve_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/highbd_wiener_convolve_ssse3.c b/av1/common/x86/highbd_wiener_convolve_ssse3.c index cab37fa910..dea82dd2c3 100644 --- a/av1/common/x86/highbd_wiener_convolve_ssse3.c +++ b/av1/common/x86/highbd_wiener_convolve_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/intra_edge_sse4.c b/av1/common/x86/intra_edge_sse4.c index 3eee46faeb..071657fae8 100644 --- a/av1/common/x86/intra_edge_sse4.c +++ b/av1/common/x86/intra_edge_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c index 9f82ed2300..50df3371fc 100644 --- a/av1/common/x86/jnt_convolve_avx2.c +++ b/av1/common/x86/jnt_convolve_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c index 6b1227890a..9dfc8f4a97 100644 --- a/av1/common/x86/jnt_convolve_sse2.c +++ b/av1/common/x86/jnt_convolve_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/jnt_convolve_ssse3.c b/av1/common/x86/jnt_convolve_ssse3.c index f6bf67815d..bb6858ab25 100644 --- a/av1/common/x86/jnt_convolve_ssse3.c +++ b/av1/common/x86/jnt_convolve_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/reconinter_avx2.c b/av1/common/x86/reconinter_avx2.c index 4bc5aa41c3..9c2ee80bf2 100644 --- a/av1/common/x86/reconinter_avx2.c +++ b/av1/common/x86/reconinter_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/reconinter_sse4.c b/av1/common/x86/reconinter_sse4.c index eb4a4d1da3..f343064a7c 100644 --- a/av1/common/x86/reconinter_sse4.c +++ b/av1/common/x86/reconinter_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/reconinter_ssse3.c b/av1/common/x86/reconinter_ssse3.c index b177958b83..3a8f6c0ef4 100644 --- a/av1/common/x86/reconinter_ssse3.c +++ b/av1/common/x86/reconinter_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c index 9c8958ed42..c7d4feff01 100644 --- a/av1/common/x86/resize_avx2.c +++ b/av1/common/x86/resize_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c index e2d84daaf4..81fe0f6ab0 100644 --- a/av1/common/x86/resize_sse2.c +++ b/av1/common/x86/resize_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. 
All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c index a7fdb5a9a4..d23d3dc89d 100644 --- a/av1/common/x86/resize_ssse3.c +++ b/av1/common/x86/resize_ssse3.c @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c index 5ab6c46f8a..4d910c7022 100644 --- a/av1/common/x86/selfguided_avx2.c +++ b/av1/common/x86/selfguided_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c index ac850f5691..2f88ca56a9 100644 --- a/av1/common/x86/selfguided_sse4.c +++ b/av1/common/x86/selfguided_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c index 663b8cde93..d14e175968 100644 --- a/av1/common/x86/warp_plane_avx2.c +++ b/av1/common/x86/warp_plane_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c index 4c05555ff7..cc7cdc416c 100644 --- a/av1/common/x86/warp_plane_sse4.c +++ b/av1/common/x86/warp_plane_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/x86/wiener_convolve_avx2.c b/av1/common/x86/wiener_convolve_avx2.c index 3de630f203..8f23693e22 100644 --- a/av1/common/x86/wiener_convolve_avx2.c +++ b/av1/common/x86/wiener_convolve_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/common/x86/wiener_convolve_sse2.c b/av1/common/x86/wiener_convolve_sse2.c index 1c039e80c6..e5cc9e038c 100644 --- a/av1/common/x86/wiener_convolve_sse2.c +++ b/av1/common/x86/wiener_convolve_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/accounting.c b/av1/decoder/accounting.c index 1ded380ec3..6caa459e38 100644 --- a/av1/decoder/accounting.c +++ b/av1/decoder/accounting.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/accounting.h b/av1/decoder/accounting.h index ad2e8b6cfe..ede0dcf1eb 100644 --- a/av1/decoder/accounting.h +++ b/av1/decoder/accounting.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index d25651229f..759a95c21d 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/decodeframe.h b/av1/decoder/decodeframe.h index 46ae475ff5..bb5c031b6e 100644 --- a/av1/decoder/decodeframe.h +++ b/av1/decoder/decodeframe.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c index bb0ccf5fd8..41f07566a5 100644 --- a/av1/decoder/decodemv.c +++ b/av1/decoder/decodemv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/decodemv.h b/av1/decoder/decodemv.h index 7e77c030f8..2b94f5f0e1 100644 --- a/av1/decoder/decodemv.h +++ b/av1/decoder/decodemv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c index a886ed469c..9edd34abd6 100644 --- a/av1/decoder/decoder.c +++ b/av1/decoder/decoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h index 560b1d9f24..a584753223 100644 --- a/av1/decoder/decoder.h +++ b/av1/decoder/decoder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/decodetxb.c b/av1/decoder/decodetxb.c index dd5aa62001..fbaeb8cc6e 100644 --- a/av1/decoder/decodetxb.c +++ b/av1/decoder/decodetxb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/decodetxb.h b/av1/decoder/decodetxb.h index fd34d40341..5c70b21a9a 100644 --- a/av1/decoder/decodetxb.h +++ b/av1/decoder/decodetxb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c index 3c6a006eaf..df67864baa 100644 --- a/av1/decoder/detokenize.c +++ b/av1/decoder/detokenize.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/detokenize.h b/av1/decoder/detokenize.h index 173b437a94..5134626464 100644 --- a/av1/decoder/detokenize.h +++ b/av1/decoder/detokenize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/dthread.h b/av1/decoder/dthread.h index b0f6fda829..d09f714267 100644 --- a/av1/decoder/dthread.h +++ b/av1/decoder/dthread.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/decoder/grain_synthesis.c b/av1/decoder/grain_synthesis.c index d276f6f90e..e18bdfb82f 100644 --- a/av1/decoder/grain_synthesis.c +++ b/av1/decoder/grain_synthesis.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/grain_synthesis.h b/av1/decoder/grain_synthesis.h index 9858ce0013..c419fe0c0e 100644 --- a/av1/decoder/grain_synthesis.h +++ b/av1/decoder/grain_synthesis.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/inspection.c b/av1/decoder/inspection.c index 288d69a224..f84ed17d20 100644 --- a/av1/decoder/inspection.c +++ b/av1/decoder/inspection.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/inspection.h b/av1/decoder/inspection.h index 70b1c80fab..7d1c14db2a 100644 --- a/av1/decoder/inspection.h +++ b/av1/decoder/inspection.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/obu.c b/av1/decoder/obu.c index e0b2d87c32..fb1b0e8156 100644 --- a/av1/decoder/obu.c +++ b/av1/decoder/obu.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/decoder/obu.h b/av1/decoder/obu.h index d8ebe368e6..eb80e3a2df 100644 --- a/av1/decoder/obu.h +++ b/av1/decoder/obu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c index 87becb80ef..d33bec2d31 100644 --- a/av1/encoder/allintra_vis.c +++ b/av1/encoder/allintra_vis.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/encoder/allintra_vis.h b/av1/encoder/allintra_vis.h index 0d34ce0841..eff91e9dd3 100644 --- a/av1/encoder/allintra_vis.h +++ b/av1/encoder/allintra_vis.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/aq_complexity.c b/av1/encoder/aq_complexity.c index 4cf6bd572d..090463bf81 100644 --- a/av1/encoder/aq_complexity.c +++ b/av1/encoder/aq_complexity.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/aq_complexity.h b/av1/encoder/aq_complexity.h index 3421d74c93..6a72ef7083 100644 --- a/av1/encoder/aq_complexity.h +++ b/av1/encoder/aq_complexity.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c index 2ef6cba698..e9fd771071 100644 --- a/av1/encoder/aq_cyclicrefresh.c +++ b/av1/encoder/aq_cyclicrefresh.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/aq_cyclicrefresh.h b/av1/encoder/aq_cyclicrefresh.h index 10974f018b..e2e24fab30 100644 --- a/av1/encoder/aq_cyclicrefresh.h +++ b/av1/encoder/aq_cyclicrefresh.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/aq_variance.c b/av1/encoder/aq_variance.c index 086928a118..66e02c00ee 100644 --- a/av1/encoder/aq_variance.c +++ b/av1/encoder/aq_variance.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/aq_variance.h b/av1/encoder/aq_variance.h index aa0535ad72..8d07cd95ba 100644 --- a/av1/encoder/aq_variance.h +++ b/av1/encoder/aq_variance.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/encoder/arm/av1_error_neon.c b/av1/encoder/arm/av1_error_neon.c index 8311546a89..fe44f20f57 100644 --- a/av1/encoder/arm/av1_error_neon.c +++ b/av1/encoder/arm/av1_error_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/av1_error_sve.c b/av1/encoder/arm/av1_error_sve.c index 60d368c1b5..dee27e0da7 100644 --- a/av1/encoder/arm/av1_error_sve.c +++ b/av1/encoder/arm/av1_error_sve.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/av1_fwd_txfm2d_neon.c index 5148ee74a9..ded7387601 100644 --- a/av1/encoder/arm/av1_fwd_txfm2d_neon.c +++ b/av1/encoder/arm/av1_fwd_txfm2d_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/av1_highbd_quantize_neon.c b/av1/encoder/arm/av1_highbd_quantize_neon.c index 11d3def16b..6710f84023 100644 --- a/av1/encoder/arm/av1_highbd_quantize_neon.c +++ b/av1/encoder/arm/av1_highbd_quantize_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/av1_k_means_neon.c b/av1/encoder/arm/av1_k_means_neon.c index 586376970f..eab9587b4d 100644 --- a/av1/encoder/arm/av1_k_means_neon.c +++ b/av1/encoder/arm/av1_k_means_neon.c @@ -1,11 +1,12 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <arm_neon.h> diff --git a/av1/encoder/arm/av1_temporal_denoiser_neon.c b/av1/encoder/arm/av1_temporal_denoiser_neon.c index 18cd0ce4c0..a3f60a4442 100644 --- a/av1/encoder/arm/av1_temporal_denoiser_neon.c +++ b/av1/encoder/arm/av1_temporal_denoiser_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/cnn_neon.c b/av1/encoder/arm/cnn_neon.c index 8e686260d0..041d86525b 100644 --- a/av1/encoder/arm/cnn_neon.c +++ b/av1/encoder/arm/cnn_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/encodetxb_neon.c b/av1/encoder/arm/encodetxb_neon.c index 582863a27c..29ca087ede 100644 --- a/av1/encoder/arm/encodetxb_neon.c +++ b/av1/encoder/arm/encodetxb_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/hash_arm_crc32.c b/av1/encoder/arm/hash_arm_crc32.c index 6417839ede..9f8d268020 100644 --- a/av1/encoder/arm/hash_arm_crc32.c +++ b/av1/encoder/arm/hash_arm_crc32.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/highbd_fwd_txfm_neon.c b/av1/encoder/arm/highbd_fwd_txfm_neon.c index aa64a38902..f1b060510d 100644 --- a/av1/encoder/arm/highbd_fwd_txfm_neon.c +++ b/av1/encoder/arm/highbd_fwd_txfm_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/highbd_pickrst_neon.c b/av1/encoder/arm/highbd_pickrst_neon.c index d067a7616a..60beca2dc0 100644 --- a/av1/encoder/arm/highbd_pickrst_neon.c +++ b/av1/encoder/arm/highbd_pickrst_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/highbd_pickrst_sve.c b/av1/encoder/arm/highbd_pickrst_sve.c index 4f804c9052..fc2c24d917 100644 --- a/av1/encoder/arm/highbd_pickrst_sve.c +++ b/av1/encoder/arm/highbd_pickrst_sve.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/highbd_rdopt_neon.c b/av1/encoder/arm/highbd_rdopt_neon.c index 4bf7ae6ce4..983a81b12e 100644 --- a/av1/encoder/arm/highbd_rdopt_neon.c +++ b/av1/encoder/arm/highbd_rdopt_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/highbd_temporal_filter_neon.c b/av1/encoder/arm/highbd_temporal_filter_neon.c index 88e176f56c..2b2b189a48 100644 --- a/av1/encoder/arm/highbd_temporal_filter_neon.c +++ b/av1/encoder/arm/highbd_temporal_filter_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/hybrid_fwd_txfm_neon.c b/av1/encoder/arm/hybrid_fwd_txfm_neon.c index 1d83bec168..65fb2deb3d 100644 --- a/av1/encoder/arm/hybrid_fwd_txfm_neon.c +++ b/av1/encoder/arm/hybrid_fwd_txfm_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/ml_neon.c b/av1/encoder/arm/ml_neon.c index be6ddfd763..142e465e28 100644 --- a/av1/encoder/arm/ml_neon.c +++ b/av1/encoder/arm/ml_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/pickrst_neon.c b/av1/encoder/arm/pickrst_neon.c index 85b980c2f0..015378ac98 100644 --- a/av1/encoder/arm/pickrst_neon.c +++ b/av1/encoder/arm/pickrst_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/pickrst_neon.h b/av1/encoder/arm/pickrst_neon.h index f9683840e1..fd6fedb4e9 100644 --- a/av1/encoder/arm/pickrst_neon.h +++ b/av1/encoder/arm/pickrst_neon.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/pickrst_sve.c b/av1/encoder/arm/pickrst_sve.c index 5d7370b5da..ed3cb5223d 100644 --- a/av1/encoder/arm/pickrst_sve.c +++ b/av1/encoder/arm/pickrst_sve.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. 
All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/pickrst_sve.h b/av1/encoder/arm/pickrst_sve.h index 97f08fc61e..5d629ee369 100644 --- a/av1/encoder/arm/pickrst_sve.h +++ b/av1/encoder/arm/pickrst_sve.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/quantize_neon.c b/av1/encoder/arm/quantize_neon.c index c3b57ce206..a9194a9b59 100644 --- a/av1/encoder/arm/quantize_neon.c +++ b/av1/encoder/arm/quantize_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/rdopt_neon.c b/av1/encoder/arm/rdopt_neon.c index 7d3bd4c606..e96c7ba23c 100644 --- a/av1/encoder/arm/rdopt_neon.c +++ b/av1/encoder/arm/rdopt_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/reconinter_enc_neon.c b/av1/encoder/arm/reconinter_enc_neon.c index 3d17723224..4ebb34cb08 100644 --- a/av1/encoder/arm/reconinter_enc_neon.c +++ b/av1/encoder/arm/reconinter_enc_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/shift_neon.h b/av1/encoder/arm/shift_neon.h index ad9fd9c671..a6fdbebc17 100644 --- a/av1/encoder/arm/shift_neon.h +++ b/av1/encoder/arm/shift_neon.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c index 986f143864..103324fbe5 100644 --- a/av1/encoder/arm/temporal_filter_neon.c +++ b/av1/encoder/arm/temporal_filter_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/encoder/arm/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c index 919521fec7..5fadeb9dc8 100644 --- a/av1/encoder/arm/temporal_filter_neon_dotprod.c +++ b/av1/encoder/arm/temporal_filter_neon_dotprod.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/txfm_neon.h b/av1/encoder/arm/txfm_neon.h index 8b07dfb613..49ff87dc24 100644 --- a/av1/encoder/arm/txfm_neon.h +++ b/av1/encoder/arm/txfm_neon.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/wedge_utils_neon.c b/av1/encoder/arm/wedge_utils_neon.c index 1b35269b33..4d2d12f7ff 100644 --- a/av1/encoder/arm/wedge_utils_neon.c +++ b/av1/encoder/arm/wedge_utils_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/arm/wedge_utils_sve.c b/av1/encoder/arm/wedge_utils_sve.c index 521601a3f3..8d44b24314 100644 --- a/av1/encoder/arm/wedge_utils_sve.c +++ b/av1/encoder/arm/wedge_utils_sve.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_fwd_txfm1d.c b/av1/encoder/av1_fwd_txfm1d.c index 6601c19ab3..c33e9f7e71 100644 --- a/av1/encoder/av1_fwd_txfm1d.c +++ b/av1/encoder/av1_fwd_txfm1d.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_fwd_txfm1d.h b/av1/encoder/av1_fwd_txfm1d.h index 9ef54fe4de..4d4aa32169 100644 --- a/av1/encoder/av1_fwd_txfm1d.h +++ b/av1/encoder/av1_fwd_txfm1d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_fwd_txfm1d_cfg.h b/av1/encoder/av1_fwd_txfm1d_cfg.h index 2777cc25bc..99a3cdacfa 100644 --- a/av1/encoder/av1_fwd_txfm1d_cfg.h +++ b/av1/encoder/av1_fwd_txfm1d_cfg.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_fwd_txfm2d.c b/av1/encoder/av1_fwd_txfm2d.c index 12a9535a7c..8012d771ca 100644 --- a/av1/encoder/av1_fwd_txfm2d.c +++ b/av1/encoder/av1_fwd_txfm2d.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_ml_partition_models.h b/av1/encoder/av1_ml_partition_models.h index 2572b138d5..a469e8f532 100644 --- a/av1/encoder/av1_ml_partition_models.h +++ b/av1/encoder/av1_ml_partition_models.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_noise_estimate.c b/av1/encoder/av1_noise_estimate.c index 25007bb6d4..ca4c768d4e 100644 --- a/av1/encoder/av1_noise_estimate.c +++ b/av1/encoder/av1_noise_estimate.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_noise_estimate.h b/av1/encoder/av1_noise_estimate.h index 85530666f6..1dc49e9e06 100644 --- a/av1/encoder/av1_noise_estimate.h +++ b/av1/encoder/av1_noise_estimate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c index 110d17f434..382d07c5b0 100644 --- a/av1/encoder/av1_quantize.c +++ b/av1/encoder/av1_quantize.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_quantize.h b/av1/encoder/av1_quantize.h index 040973376d..3dea441a30 100644 --- a/av1/encoder/av1_quantize.h +++ b/av1/encoder/av1_quantize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_temporal_denoiser.c b/av1/encoder/av1_temporal_denoiser.c index d4a1625612..5604586b21 100644 --- a/av1/encoder/av1_temporal_denoiser.c +++ b/av1/encoder/av1_temporal_denoiser.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/av1_temporal_denoiser.h b/av1/encoder/av1_temporal_denoiser.h index 14dcccce69..b3d2e4f31e 100644 --- a/av1/encoder/av1_temporal_denoiser.h +++ b/av1/encoder/av1_temporal_denoiser.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index 163b62c77c..e485faa73e 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h index d037039593..f0b0fd0acb 100644 --- a/av1/encoder/bitstream.h +++ b/av1/encoder/bitstream.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/block.h b/av1/encoder/block.h index 9bee0b8d02..01e012a0f2 100644 --- a/av1/encoder/block.h +++ b/av1/encoder/block.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/blockiness.c b/av1/encoder/blockiness.c index 6ad2ddaf25..8c93df38d1 100644 --- a/av1/encoder/blockiness.c +++ b/av1/encoder/blockiness.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/cnn.c b/av1/encoder/cnn.c index b019ace685..6593597470 100644 --- a/av1/encoder/cnn.c +++ b/av1/encoder/cnn.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/cnn.h b/av1/encoder/cnn.h index df6401f73f..2c61907c2c 100644 --- a/av1/encoder/cnn.h +++ b/av1/encoder/cnn.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c index 3b0ee88241..99c3c6513d 100644 --- a/av1/encoder/compound_type.c +++ b/av1/encoder/compound_type.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/compound_type.h b/av1/encoder/compound_type.h index a028a35093..8afc33454a 100644 --- a/av1/encoder/compound_type.h +++ b/av1/encoder/compound_type.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c index aafe55d2d0..4273fae59a 100644 --- a/av1/encoder/context_tree.c +++ b/av1/encoder/context_tree.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h index 0be7ccbb54..83e5b59fad 100644 --- a/av1/encoder/context_tree.h +++ b/av1/encoder/context_tree.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/cost.c b/av1/encoder/cost.c index 323e2aed58..e4d15e250b 100644 --- a/av1/encoder/cost.c +++ b/av1/encoder/cost.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/cost.h b/av1/encoder/cost.h index be0241a820..0333fd4630 100644 --- a/av1/encoder/cost.h +++ b/av1/encoder/cost.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/dwt.c b/av1/encoder/dwt.c index 2fab99dd8b..84b3b7515a 100644 --- a/av1/encoder/dwt.c +++ b/av1/encoder/dwt.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/dwt.h b/av1/encoder/dwt.h index 443b6bc12c..8ba6c02889 100644 --- a/av1/encoder/dwt.h +++ b/av1/encoder/dwt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/enc_enums.h b/av1/encoder/enc_enums.h index 0a8b0f258a..98072b3179 100644 --- a/av1/encoder/enc_enums.h +++ b/av1/encoder/enc_enums.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c index db77dc0e3c..f9dd15ddab 100644 --- a/av1/encoder/encode_strategy.c +++ b/av1/encoder/encode_strategy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encode_strategy.h b/av1/encoder/encode_strategy.h index c1d14d134c..2b72edcefa 100644 --- a/av1/encoder/encode_strategy.h +++ b/av1/encoder/encode_strategy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index 10fc611148..788ac80826 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodeframe.h b/av1/encoder/encodeframe.h index ce32fb47e6..077265d8cd 100644 --- a/av1/encoder/encodeframe.h +++ b/av1/encoder/encodeframe.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c index a8e4a88396..6e268e6b63 100644 --- a/av1/encoder/encodeframe_utils.c +++ b/av1/encoder/encodeframe_utils.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h index 14c71b8802..6f084eb938 100644 --- a/av1/encoder/encodeframe_utils.h +++ b/av1/encoder/encodeframe_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c index c78761dd98..c53e8b94d7 100644 --- a/av1/encoder/encodemb.c +++ b/av1/encoder/encodemb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h index f97bf8f517..721210a206 100644 --- a/av1/encoder/encodemb.h +++ b/av1/encoder/encodemb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c index 7cae72c159..7af4097cc5 100644 --- a/av1/encoder/encodemv.c +++ b/av1/encoder/encodemv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodemv.h b/av1/encoder/encodemv.h index c39001a5a2..f37cd5d13f 100644 --- a/av1/encoder/encodemv.h +++ b/av1/encoder/encodemv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 093eabc075..c4c333f624 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h index b0fc5cd78a..5966da7381 100644 --- a/av1/encoder/encoder.h +++ b/av1/encoder/encoder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h index f24d4b0a10..23e07db755 100644 --- a/av1/encoder/encoder_alloc.h +++ b/av1/encoder/encoder_alloc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c index 1c04df7e0c..2a549f8944 100644 --- a/av1/encoder/encoder_utils.c +++ b/av1/encoder/encoder_utils.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h index 113f62aa59..61c4b0a122 100644 --- a/av1/encoder/encoder_utils.h +++ b/av1/encoder/encoder_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c index 701c5489fe..a6452a9bec 100644 --- a/av1/encoder/encodetxb.c +++ b/av1/encoder/encodetxb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/encodetxb.h b/av1/encoder/encodetxb.h index 67b94046b4..b9c2031027 100644 --- a/av1/encoder/encodetxb.h +++ b/av1/encoder/encodetxb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c index 356aa03275..9b4b178c90 100644 --- a/av1/encoder/ethread.c +++ b/av1/encoder/ethread.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h index 138811c8a3..8b89ebc11a 100644 --- a/av1/encoder/ethread.h +++ b/av1/encoder/ethread.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/extend.c b/av1/encoder/extend.c index e1b1e69ca7..c837d2f199 100644 --- a/av1/encoder/extend.c +++ b/av1/encoder/extend.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/extend.h b/av1/encoder/extend.h index b8cc5b9d28..a9111981c2 100644 --- a/av1/encoder/extend.h +++ b/av1/encoder/extend.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. 
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/external_partition.c b/av1/encoder/external_partition.c
index 79f8b4c8a4..d72eab0883 100644
--- a/av1/encoder/external_partition.c
+++ b/av1/encoder/external_partition.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/external_partition.h b/av1/encoder/external_partition.h
index f74973e9eb..bd299fc5f3 100644
--- a/av1/encoder/external_partition.h
+++ b/av1/encoder/external_partition.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index b94a50714a..9d151cb254 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h
index d01363a80e..75ea618d24 100644
--- a/av1/encoder/firstpass.h
+++ b/av1/encoder/firstpass.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index 0ae47809c6..7a98c8c183 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h
index 2645f93e3c..bb82d798d6 100644
--- a/av1/encoder/global_motion.h
+++ b/av1/encoder/global_motion.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/global_motion_facade.c b/av1/encoder/global_motion_facade.c
index 687eeee18a..d085a35a3b 100644
--- a/av1/encoder/global_motion_facade.c
+++ b/av1/encoder/global_motion_facade.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/global_motion_facade.h b/av1/encoder/global_motion_facade.h
index f13989aa25..02dd0f5250 100644
--- a/av1/encoder/global_motion_facade.h
+++ b/av1/encoder/global_motion_facade.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/gop_structure.c b/av1/encoder/gop_structure.c
index 5078098450..344c990005 100644
--- a/av1/encoder/gop_structure.c
+++ b/av1/encoder/gop_structure.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/gop_structure.h b/av1/encoder/gop_structure.h
index ff22f54136..4ef43a9d55 100644
--- a/av1/encoder/gop_structure.h
+++ b/av1/encoder/gop_structure.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/grain_test_vectors.h b/av1/encoder/grain_test_vectors.h
index 945dc37331..0060d41e6d 100644
--- a/av1/encoder/grain_test_vectors.h
+++ b/av1/encoder/grain_test_vectors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/hash.c b/av1/encoder/hash.c
index 8037b59bef..7e3384e317 100644
--- a/av1/encoder/hash.c
+++ b/av1/encoder/hash.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/hash.h b/av1/encoder/hash.h
index d8e8cc3a0b..4cc273a519 100644
--- a/av1/encoder/hash.h
+++ b/av1/encoder/hash.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/hash_motion.c b/av1/encoder/hash_motion.c
index 8b04e22d6c..78403a6e96 100644
--- a/av1/encoder/hash_motion.c
+++ b/av1/encoder/hash_motion.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/hash_motion.h b/av1/encoder/hash_motion.h
index 8974ba27cb..f78b0f679a 100644
--- a/av1/encoder/hash_motion.h
+++ b/av1/encoder/hash_motion.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index a108e8148c..54f2c7f97e 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/hybrid_fwd_txfm.h b/av1/encoder/hybrid_fwd_txfm.h
index 30f8a2258b..0a9081bc4c 100644
--- a/av1/encoder/hybrid_fwd_txfm.h
+++ b/av1/encoder/hybrid_fwd_txfm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/interp_search.c b/av1/encoder/interp_search.c
index 27235303c0..d64cae45dc 100644
--- a/av1/encoder/interp_search.c
+++ b/av1/encoder/interp_search.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/interp_search.h b/av1/encoder/interp_search.h
index 9815e0bcfb..d7df92f0d3 100644
--- a/av1/encoder/interp_search.h
+++ b/av1/encoder/interp_search.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index 99b0af2f8e..c4f914f4b6 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h
index 75289c4e3c..c040797940 100644
--- a/av1/encoder/intra_mode_search.h
+++ b/av1/encoder/intra_mode_search.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
index 107c2236f8..6dc9826e95 100644
--- a/av1/encoder/intra_mode_search_utils.h
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/k_means_template.h b/av1/encoder/k_means_template.h
index 239029345d..4199159c89 100644
--- a/av1/encoder/k_means_template.h
+++ b/av1/encoder/k_means_template.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/level.c b/av1/encoder/level.c
index 5d5fe9ce96..7d35fcc3bd 100644
--- a/av1/encoder/level.c
+++ b/av1/encoder/level.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/level.h b/av1/encoder/level.h
index ebf2a1c19d..9c1dadd211 100644
--- a/av1/encoder/level.h
+++ b/av1/encoder/level.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/lookahead.c b/av1/encoder/lookahead.c
index 476c91ab95..1011799e6c 100644
--- a/av1/encoder/lookahead.c
+++ b/av1/encoder/lookahead.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/lookahead.h b/av1/encoder/lookahead.h
index 41eca87fa3..0a33120c72 100644
--- a/av1/encoder/lookahead.h
+++ b/av1/encoder/lookahead.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index f3a9828cb3..cf44db760c 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 87b9309b61..d6dc8cba29 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/mcomp_structs.h b/av1/encoder/mcomp_structs.h
index 06660cf4a6..2fa28174a1 100644
--- a/av1/encoder/mcomp_structs.h
+++ b/av1/encoder/mcomp_structs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/misc_model_weights.h b/av1/encoder/misc_model_weights.h
index f00aeabcf6..97ed485bd6 100644
--- a/av1/encoder/misc_model_weights.h
+++ b/av1/encoder/misc_model_weights.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/ml.c b/av1/encoder/ml.c
index 94cd56c5d1..7953ac0476 100644
--- a/av1/encoder/ml.c
+++ b/av1/encoder/ml.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/ml.h b/av1/encoder/ml.h
index 566f9271dd..0e1b579d20 100644
--- a/av1/encoder/ml.h
+++ b/av1/encoder/ml.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/mode_prune_model_weights.h b/av1/encoder/mode_prune_model_weights.h
index 98ec36808a..f64c81b305 100644
--- a/av1/encoder/mode_prune_model_weights.h
+++ b/av1/encoder/mode_prune_model_weights.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h
index f7e8b96b5b..d96e5ec129 100644
--- a/av1/encoder/model_rd.h
+++ b/av1/encoder/model_rd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index e7eec29dc3..e36f7e8e9c 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h
index d1fa915bca..a4fe262474 100644
--- a/av1/encoder/motion_search_facade.h
+++ b/av1/encoder/motion_search_facade.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/mv_prec.c b/av1/encoder/mv_prec.c
index b64f4dcd0e..e9aeb07785 100644
--- a/av1/encoder/mv_prec.c
+++ b/av1/encoder/mv_prec.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/mv_prec.h b/av1/encoder/mv_prec.h
index 55108b6cdb..2d022d59f3 100644
--- a/av1/encoder/mv_prec.h
+++ b/av1/encoder/mv_prec.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/nonrd_opt.c b/av1/encoder/nonrd_opt.c
index e3589dad6b..bcda2f0799 100644
--- a/av1/encoder/nonrd_opt.c
+++ b/av1/encoder/nonrd_opt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/nonrd_opt.h b/av1/encoder/nonrd_opt.h
index a53578ebad..eae0be059b 100644
--- a/av1/encoder/nonrd_opt.h
+++ b/av1/encoder/nonrd_opt.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 317d5c7e66..7fdc546b2c 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/optical_flow.c b/av1/encoder/optical_flow.c
index dc168e7aee..015d07d614 100644
--- a/av1/encoder/optical_flow.c
+++ b/av1/encoder/optical_flow.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/optical_flow.h b/av1/encoder/optical_flow.h
index 2fbe474d77..71c8ac8786 100644
--- a/av1/encoder/optical_flow.h
+++ b/av1/encoder/optical_flow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index 6ae1c6cf63..1bc706717d 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h
index 30886d37ae..b4a59ff24f 100644
--- a/av1/encoder/palette.h
+++ b/av1/encoder/palette.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/partition_cnn_weights.h b/av1/encoder/partition_cnn_weights.h
index 504038c63a..3470bcd032 100644
--- a/av1/encoder/partition_cnn_weights.h
+++ b/av1/encoder/partition_cnn_weights.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/partition_model_weights.h b/av1/encoder/partition_model_weights.h
index 71c1ace782..96dc21693b 100644
--- a/av1/encoder/partition_model_weights.h
+++ b/av1/encoder/partition_model_weights.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 30ea7d9140..d31780fde2 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/partition_search.h b/av1/encoder/partition_search.h
index 1b5d71b7da..5d09fd41c3 100644
--- a/av1/encoder/partition_search.h
+++ b/av1/encoder/partition_search.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index 1d62f128c7..c6ae4fa473 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/partition_strategy.h b/av1/encoder/partition_strategy.h
index 84683f5fd4..a7b1465b4e 100644
--- a/av1/encoder/partition_strategy.h
+++ b/av1/encoder/partition_strategy.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index eca49c0621..619b2de81a 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/pass2_strategy.h b/av1/encoder/pass2_strategy.h
index 5987a78a23..f923d930ea 100644
--- a/av1/encoder/pass2_strategy.h
+++ b/av1/encoder/pass2_strategy.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index ed5fa55f17..5355ee8661 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h
index 192e734fb0..5a1ec2157a 100644
--- a/av1/encoder/pickcdef.h
+++ b/av1/encoder/pickcdef.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index ce0357163d..abcbb7d481 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/picklpf.h b/av1/encoder/picklpf.h
index f567937c32..36f35bdc3b 100644
--- a/av1/encoder/picklpf.h
+++ b/av1/encoder/picklpf.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index f60499418b..d35848fa59 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/pickrst.h b/av1/encoder/pickrst.h
index d1d0b0cec6..c4cad30882 100644
--- a/av1/encoder/pickrst.h
+++ b/av1/encoder/pickrst.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/pustats.h b/av1/encoder/pustats.h
index 2e8710108b..54fed015e2 100644
--- a/av1/encoder/pustats.h
+++ b/av1/encoder/pustats.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/random.h b/av1/encoder/random.h
index efe909b6db..9a34f1f7a7 100644
--- a/av1/encoder/random.h
+++ b/av1/encoder/random.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index 8060a8ba2f..ec9ae10569 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index 5121a909f4..5fcb65e071 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/rc_utils.h b/av1/encoder/rc_utils.h
index fe22ee5afb..35c98f4006 100644
--- a/av1/encoder/rc_utils.h
+++ b/av1/encoder/rc_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index c2d76e7a9a..8a719160fb 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index b38d9ca542..004f65353d 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index c17fbccf8c..5954bdb8e3 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index efb797e5b5..a1fa7075ef 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/rdopt_data_defs.h b/av1/encoder/rdopt_data_defs.h
index ca7ef810f3..7b295db815 100644
--- a/av1/encoder/rdopt_data_defs.h
+++ b/av1/encoder/rdopt_data_defs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index b6bc4927e3..349a41af6c 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c
index 9b964113a5..1cbe0f9c11 100644
--- a/av1/encoder/reconinter_enc.c
+++ b/av1/encoder/reconinter_enc.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/reconinter_enc.h b/av1/encoder/reconinter_enc.h
index 16932f37a0..af325dc3dd 100644
--- a/av1/encoder/reconinter_enc.h
+++ b/av1/encoder/reconinter_enc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/saliency_map.c b/av1/encoder/saliency_map.c
index 30019bbec0..90672ba035 100644
--- a/av1/encoder/saliency_map.c
+++ b/av1/encoder/saliency_map.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/saliency_map.h b/av1/encoder/saliency_map.h
index 0d27f83633..07cb92991e 100644
--- a/av1/encoder/saliency_map.h
+++ b/av1/encoder/saliency_map.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/segmentation.c b/av1/encoder/segmentation.c
index 4b4e78779c..82d487784f 100644
--- a/av1/encoder/segmentation.c
+++ b/av1/encoder/segmentation.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/segmentation.h b/av1/encoder/segmentation.h
index 1ad13d66a9..0d7a115ff8 100644
--- a/av1/encoder/segmentation.h
+++ b/av1/encoder/segmentation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/sorting_network.h b/av1/encoder/sorting_network.h
index 54f4c19dcd..2705aab91e 100644
--- a/av1/encoder/sorting_network.h
+++ b/av1/encoder/sorting_network.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/sparse_linear_solver.c b/av1/encoder/sparse_linear_solver.c
index e47c78e148..90d8e08ecf 100644
--- a/av1/encoder/sparse_linear_solver.c
+++ b/av1/encoder/sparse_linear_solver.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/sparse_linear_solver.h b/av1/encoder/sparse_linear_solver.h
index f30fc0f5b1..86b7b3dbb2 100644
--- a/av1/encoder/sparse_linear_solver.h
+++ b/av1/encoder/sparse_linear_solver.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 893749c88b..31fe03aebe 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index c768ff3944..e81891447b 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/superres_scale.c b/av1/encoder/superres_scale.c
index 41225d55ae..8b96357ccc 100644
--- a/av1/encoder/superres_scale.c
+++ b/av1/encoder/superres_scale.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/superres_scale.h b/av1/encoder/superres_scale.h
index 450a4ed902..20f749a29a 100644
--- a/av1/encoder/superres_scale.h
+++ b/av1/encoder/superres_scale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index dbab1d54c9..b0df3f447e 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <assert.h>
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index d56ea77791..cbe4304a12 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index e8cc145030..2458f25843 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index a40fb039b9..2040045ded 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/thirdpass.c b/av1/encoder/thirdpass.c
index a25522fbc5..3ac063676d 100644
--- a/av1/encoder/thirdpass.c
+++ b/av1/encoder/thirdpass.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/thirdpass.h b/av1/encoder/thirdpass.h
index 8080c06cb6..14950f5d9b 100644
--- a/av1/encoder/thirdpass.h
+++ b/av1/encoder/thirdpass.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index ffac886e32..04094789a7 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index f675c489ae..d3795c63bc 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 86f5485a26..9faf2eaddf 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index 0150c702f9..8f08702eb2 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tune_butteraugli.c b/av1/encoder/tune_butteraugli.c
index 4381af6a8b..99fd464a91 100644
--- a/av1/encoder/tune_butteraugli.c
+++ b/av1/encoder/tune_butteraugli.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tune_butteraugli.h b/av1/encoder/tune_butteraugli.h
index bae5d2a882..e7c6d5cd41 100644
--- a/av1/encoder/tune_butteraugli.h
+++ b/av1/encoder/tune_butteraugli.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index fdb7c77ebc..9b03becc11 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tune_vmaf.h b/av1/encoder/tune_vmaf.h
index 404fd1029a..7e01435e4e 100644
--- a/av1/encoder/tune_vmaf.h
+++ b/av1/encoder/tune_vmaf.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tx_prune_model_weights.h b/av1/encoder/tx_prune_model_weights.h
index aab5e1398d..2fd937af13 100644
--- a/av1/encoder/tx_prune_model_weights.h
+++ b/av1/encoder/tx_prune_model_weights.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 5dcc08c0ff..f4aab493ea 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/tx_search.h b/av1/encoder/tx_search.h
index ed95c1cd98..6b826a5d70 100644
--- a/av1/encoder/tx_search.h
+++ b/av1/encoder/tx_search.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/txb_rdopt.c b/av1/encoder/txb_rdopt.c
index e551e8aa12..801da94ca7 100644
--- a/av1/encoder/txb_rdopt.c
+++ b/av1/encoder/txb_rdopt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/txb_rdopt.h b/av1/encoder/txb_rdopt.h
index 70b322a2e1..3f14a6cd42 100644
--- a/av1/encoder/txb_rdopt.h
+++ b/av1/encoder/txb_rdopt.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/txb_rdopt_utils.h b/av1/encoder/txb_rdopt_utils.h
index b9f08aacf0..56245f503f 100644
--- a/av1/encoder/txb_rdopt_utils.h
+++ b/av1/encoder/txb_rdopt_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 2c9772dddb..e5908f41c2 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/var_based_part.h b/av1/encoder/var_based_part.h
index f912458307..56786872e2 100644
--- a/av1/encoder/var_based_part.h
+++ b/av1/encoder/var_based_part.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/wedge_utils.c b/av1/encoder/wedge_utils.c
index 40670178d7..edce2187a2 100644
--- a/av1/encoder/wedge_utils.c
+++ b/av1/encoder/wedge_utils.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index 494b0fdf15..3a5edf3ce7 100644
--- a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
index b143df3523..b217a85447 100644
--- a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
+++ b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
index 825da8d7b4..cae2fc7316 100644
--- a/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
+++ b/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_fwd_txfm_avx2.h b/av1/encoder/x86/av1_fwd_txfm_avx2.h
index aaad76e5ae..56647a090e 100644
--- a/av1/encoder/x86/av1_fwd_txfm_avx2.h
+++ b/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c
index 31cc37db7a..f6733a338d 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.c
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.h b/av1/encoder/x86/av1_fwd_txfm_sse2.h
index 3cb869a8fe..68a6a09029 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.h
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_highbd_quantize_avx2.c b/av1/encoder/x86/av1_highbd_quantize_avx2.c
index b58911fcb2..d43e4a7242 100644
--- a/av1/encoder/x86/av1_highbd_quantize_avx2.c
+++ b/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_highbd_quantize_sse4.c b/av1/encoder/x86/av1_highbd_quantize_sse4.c
index f3a0b15de5..7873a8f64a 100644
--- a/av1/encoder/x86/av1_highbd_quantize_sse4.c
+++ b/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_k_means_avx2.c b/av1/encoder/x86/av1_k_means_avx2.c
index 52ddc66437..98d8435478 100644
--- a/av1/encoder/x86/av1_k_means_avx2.c
+++ b/av1/encoder/x86/av1_k_means_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/av1/encoder/x86/av1_k_means_sse2.c b/av1/encoder/x86/av1_k_means_sse2.c
index 6c75822350..eb4c98e9f0 100644
--- a/av1/encoder/x86/av1_k_means_sse2.c
+++ b/av1/encoder/x86/av1_k_means_sse2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
If the BSD 2 Clause License diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c index 75c5172f85..19e8694ab7 100644 --- a/av1/encoder/x86/av1_quantize_avx2.c +++ b/av1/encoder/x86/av1_quantize_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c index b533894015..a933db9270 100644 --- a/av1/encoder/x86/av1_quantize_sse2.c +++ b/av1/encoder/x86/av1_quantize_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/av1_temporal_denoiser_sse2.c b/av1/encoder/x86/av1_temporal_denoiser_sse2.c index 830f40ecb0..daf63a126e 100644 --- a/av1/encoder/x86/av1_temporal_denoiser_sse2.c +++ b/av1/encoder/x86/av1_temporal_denoiser_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/av1_txfm1d_sse4.h b/av1/encoder/x86/av1_txfm1d_sse4.h index 7a0f32898b..22638cd067 100644 --- a/av1/encoder/x86/av1_txfm1d_sse4.h +++ b/av1/encoder/x86/av1_txfm1d_sse4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/cnn_avx2.c b/av1/encoder/x86/cnn_avx2.c index 9c26a56641..59f6116e0f 100644 --- a/av1/encoder/x86/cnn_avx2.c +++ b/av1/encoder/x86/cnn_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/encodetxb_avx2.c b/av1/encoder/x86/encodetxb_avx2.c index 9627f75930..cee9f121af 100644 --- a/av1/encoder/x86/encodetxb_avx2.c +++ b/av1/encoder/x86/encodetxb_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/encodetxb_sse2.c b/av1/encoder/x86/encodetxb_sse2.c index d23a688747..607aaa66d2 100644 --- a/av1/encoder/x86/encodetxb_sse2.c +++ b/av1/encoder/x86/encodetxb_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/encodetxb_sse4.c b/av1/encoder/x86/encodetxb_sse4.c index 72bd8e3411..081e8265b2 100644 --- a/av1/encoder/x86/encodetxb_sse4.c +++ b/av1/encoder/x86/encodetxb_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c index f180c94f4e..7bc4ee1bae 100644 --- a/av1/encoder/x86/error_intrin_avx2.c +++ b/av1/encoder/x86/error_intrin_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/error_intrin_sse2.c b/av1/encoder/x86/error_intrin_sse2.c index 61f65c623f..9aa58e9ab9 100644 --- a/av1/encoder/x86/error_intrin_sse2.c +++ b/av1/encoder/x86/error_intrin_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/hash_sse42.c b/av1/encoder/x86/hash_sse42.c index ebe75310e9..172cbdb6a7 100644 --- a/av1/encoder/x86/hash_sse42.c +++ b/av1/encoder/x86/hash_sse42.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/av1/encoder/x86/highbd_block_error_intrin_avx2.c index 340307cb3e..af4ca0882b 100644 --- a/av1/encoder/x86/highbd_block_error_intrin_avx2.c +++ b/av1/encoder/x86/highbd_block_error_intrin_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/av1/encoder/x86/highbd_block_error_intrin_sse2.c index b0b2757568..502557bf67 100644 --- a/av1/encoder/x86/highbd_block_error_intrin_sse2.c +++ b/av1/encoder/x86/highbd_block_error_intrin_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/av1/encoder/x86/highbd_fwd_txfm_avx2.c index 9cdf21fc7c..2fd6d1d289 100644 --- a/av1/encoder/x86/highbd_fwd_txfm_avx2.c +++ b/av1/encoder/x86/highbd_fwd_txfm_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/av1/encoder/x86/highbd_fwd_txfm_sse4.c index 158b4ae439..0e5d6923e2 100644 --- a/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/highbd_temporal_filter_avx2.c b/av1/encoder/x86/highbd_temporal_filter_avx2.c index ca448ca37b..6232e9e3c2 100644 --- a/av1/encoder/x86/highbd_temporal_filter_avx2.c +++ b/av1/encoder/x86/highbd_temporal_filter_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/highbd_temporal_filter_sse2.c b/av1/encoder/x86/highbd_temporal_filter_sse2.c index 2032847083..b328dcac40 100644 --- a/av1/encoder/x86/highbd_temporal_filter_sse2.c +++ b/av1/encoder/x86/highbd_temporal_filter_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/ml_avx2.c b/av1/encoder/x86/ml_avx2.c index 6432708416..bed8f6d1ad 100644 --- a/av1/encoder/x86/ml_avx2.c +++ b/av1/encoder/x86/ml_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/ml_sse3.c b/av1/encoder/x86/ml_sse3.c index 4748a68d38..e5d5dead86 100644 --- a/av1/encoder/x86/ml_sse3.c +++ b/av1/encoder/x86/ml_sse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/ml_sse3.h b/av1/encoder/x86/ml_sse3.h index f41a2474af..93ae38459a 100644 --- a/av1/encoder/x86/ml_sse3.h +++ b/av1/encoder/x86/ml_sse3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * Copyright (c) 2023, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c index 1f76576c9e..57f0463ba5 100644 --- a/av1/encoder/x86/pickrst_avx2.c +++ b/av1/encoder/x86/pickrst_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c index af6706228a..d354b3d1b5 100644 --- a/av1/encoder/x86/pickrst_sse4.c +++ b/av1/encoder/x86/pickrst_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c index a0ab3940c0..96798576ef 100644 --- a/av1/encoder/x86/rdopt_avx2.c +++ b/av1/encoder/x86/rdopt_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c index 76980d673a..6258fb0f7d 100644 --- a/av1/encoder/x86/rdopt_sse4.c +++ b/av1/encoder/x86/rdopt_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/reconinter_enc_sse2.c b/av1/encoder/x86/reconinter_enc_sse2.c index a492483721..b8cbe0ca08 100644 --- a/av1/encoder/x86/reconinter_enc_sse2.c +++ b/av1/encoder/x86/reconinter_enc_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/reconinter_enc_ssse3.c b/av1/encoder/x86/reconinter_enc_ssse3.c index df7aa95855..f34efb6db5 100644 --- a/av1/encoder/x86/reconinter_enc_ssse3.c +++ b/av1/encoder/x86/reconinter_enc_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c index 752d6f3f0b..d8868f5208 100644 --- a/av1/encoder/x86/temporal_filter_avx2.c +++ b/av1/encoder/x86/temporal_filter_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c index 842d3b13c8..8a8c94719e 100644 --- a/av1/encoder/x86/temporal_filter_sse2.c +++ b/av1/encoder/x86/temporal_filter_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/wedge_utils_avx2.c b/av1/encoder/x86/wedge_utils_avx2.c index 3f61c023c8..0563e19f42 100644 --- a/av1/encoder/x86/wedge_utils_avx2.c +++ b/av1/encoder/x86/wedge_utils_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/wedge_utils_sse2.c b/av1/encoder/x86/wedge_utils_sse2.c index c3005790f2..3e2e3835df 100644 --- a/av1/encoder/x86/wedge_utils_sse2.c +++ b/av1/encoder/x86/wedge_utils_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/ratectrl_rtc.cc b/av1/ratectrl_rtc.cc index 83e88ba480..f8c13c68c0 100644 --- a/av1/ratectrl_rtc.cc +++ b/av1/ratectrl_rtc.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/ratectrl_rtc.h b/av1/ratectrl_rtc.h index 1894469dd1..a4b33039cd 100644 --- a/av1/ratectrl_rtc.h +++ b/av1/ratectrl_rtc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/args.c b/common/args.c index c380dde8a0..74437e362d 100644 --- a/common/args.c +++ b/common/args.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/args.h b/common/args.h index 1c5c437632..74cf09ce69 100644 --- a/common/args.h +++ b/common/args.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/common/args_helper.c b/common/args_helper.c index 2201868335..e33f4c381b 100644 --- a/common/args_helper.c +++ b/common/args_helper.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/args_helper.h b/common/args_helper.h index c86a6128d3..f66a12c3c1 100644 --- a/common/args_helper.h +++ b/common/args_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/av1_config.c b/common/av1_config.c index 9f5b02015b..6c6c2f9411 100644 --- a/common/av1_config.c +++ b/common/av1_config.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/av1_config.h b/common/av1_config.h index a15bedb305..6b01a35690 100644 --- a/common/av1_config.h +++ b/common/av1_config.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/ivfdec.c b/common/ivfdec.c index 6e714d1cfe..8d616573c4 100644 --- a/common/ivfdec.c +++ b/common/ivfdec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/ivfdec.h b/common/ivfdec.h index e8fe8d0c53..842d1ce8a6 100644 --- a/common/ivfdec.h +++ b/common/ivfdec.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/ivfenc.c b/common/ivfenc.c index 64715f4d74..2b93cb613f 100644 --- a/common/ivfenc.c +++ b/common/ivfenc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/ivfenc.h b/common/ivfenc.h index 8f6d947d47..2b1b81c717 100644 --- a/common/ivfenc.h +++ b/common/ivfenc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/obudec.c b/common/obudec.c index 8b7bd39a60..c57468bcbf 100644 --- a/common/obudec.c +++ b/common/obudec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/obudec.h b/common/obudec.h index b2adb1e3d7..8519eeb597 100644 --- a/common/obudec.h +++ b/common/obudec.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/rawenc.c b/common/rawenc.c index aa80d2cae3..1c64f0f3bd 100644 --- a/common/rawenc.c +++ b/common/rawenc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/rawenc.h b/common/rawenc.h index cf5e00e6fd..3d54962bfb 100644 --- a/common/rawenc.h +++ b/common/rawenc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/tools_common.c b/common/tools_common.c index db02ca6299..b2f9b6cad9 100644 --- a/common/tools_common.c +++ b/common/tools_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/tools_common.h b/common/tools_common.h index cde21646ed..222460971c 100644 --- a/common/tools_common.h +++ b/common/tools_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/video_common.h b/common/video_common.h index bf95031be6..d3728bc496 100644 --- a/common/video_common.h +++ b/common/video_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/video_reader.c b/common/video_reader.c index 27f69a9672..12d39ab934 100644 --- a/common/video_reader.c +++ b/common/video_reader.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/video_reader.h b/common/video_reader.h index 9ab439e8af..15c4c50d9e 100644 --- a/common/video_reader.h +++ b/common/video_reader.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/video_writer.c b/common/video_writer.c index 1d4328ae1e..87a2ec0d28 100644 --- a/common/video_writer.c +++ b/common/video_writer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/video_writer.h b/common/video_writer.h index 8712d47a58..3e8bae1002 100644 --- a/common/video_writer.h +++ b/common/video_writer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/warnings.c b/common/warnings.c index a20531cb8b..0d741e1a80 100644 --- a/common/warnings.c +++ b/common/warnings.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/warnings.h b/common/warnings.h index 36f1fe0706..5aff6ade63 100644 --- a/common/warnings.h +++ b/common/warnings.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/webmdec.cc b/common/webmdec.cc index 33bda59021..2a953aeb7f 100644 --- a/common/webmdec.cc +++ b/common/webmdec.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/webmdec.h b/common/webmdec.h index fcbdeffe4d..76e2a48529 100644 --- a/common/webmdec.h +++ b/common/webmdec.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/common/webmenc.cc b/common/webmenc.cc index bb754e8119..0b0ff65af9 100644 --- a/common/webmenc.cc +++ b/common/webmenc.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/webmenc.h b/common/webmenc.h index c912208b45..d1819a34d4 100644 --- a/common/webmenc.h +++ b/common/webmenc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/y4menc.c b/common/y4menc.c index 25086a91d0..fb127aba61 100644 --- a/common/y4menc.c +++ b/common/y4menc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/y4menc.h b/common/y4menc.h index 6484efcc50..3333ac163d 100644 --- a/common/y4menc.h +++ b/common/y4menc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/y4minput.c b/common/y4minput.c index 6a8601edfb..038912eb10 100644 --- a/common/y4minput.c +++ b/common/y4minput.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/y4minput.h b/common/y4minput.h index 2472007b67..2e1c35e8cc 100644 --- a/common/y4minput.h +++ b/common/y4minput.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/analyzer.cc b/examples/analyzer.cc index 501f5024db..b43b1ea058 100644 --- a/examples/analyzer.cc +++ b/examples/analyzer.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c index b7fb7bce45..694593feff 100644 --- a/examples/aom_cx_set_ref.c +++ b/examples/aom_cx_set_ref.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/av1_dec_fuzzer.cc b/examples/av1_dec_fuzzer.cc index 4634ca628a..dc9694e871 100644 --- a/examples/av1_dec_fuzzer.cc +++ b/examples/av1_dec_fuzzer.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c index 07f788ff97..e398484be8 100644 --- a/examples/decode_to_md5.c +++ b/examples/decode_to_md5.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c index 9bec6ee2df..a5cd395e4f 100644 --- a/examples/decode_with_drops.c +++ b/examples/decode_with_drops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/encoder_util.c b/examples/encoder_util.c index e43b372506..ea2728d886 100644 --- a/examples/encoder_util.c +++ b/examples/encoder_util.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/encoder_util.h b/examples/encoder_util.h index fa0e7d1880..dfee802635 100644 --- a/examples/encoder_util.h +++ b/examples/encoder_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/inspect.c b/examples/inspect.c index e285be0209..dca274f1ea 100644 --- a/examples/inspect.c +++ b/examples/inspect.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/lightfield_bitstream_parsing.c b/examples/lightfield_bitstream_parsing.c index 05272bafa3..7f0b8d251a 100644 --- a/examples/lightfield_bitstream_parsing.c +++ b/examples/lightfield_bitstream_parsing.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/examples/lightfield_decoder.c b/examples/lightfield_decoder.c index 65b13efa1a..d77235fd1a 100644 --- a/examples/lightfield_decoder.c +++ b/examples/lightfield_decoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/lightfield_encoder.c b/examples/lightfield_encoder.c index 9aef836ac2..63d72d4a4a 100644 --- a/examples/lightfield_encoder.c +++ b/examples/lightfield_encoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/lightfield_tile_list_decoder.c b/examples/lightfield_tile_list_decoder.c index d71ff5b387..dd6391e7ba 100644 --- a/examples/lightfield_tile_list_decoder.c +++ b/examples/lightfield_tile_list_decoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c index 1971b9c9df..194c717472 100644 --- a/examples/lossless_encoder.c +++ b/examples/lossless_encoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/noise_model.c b/examples/noise_model.c index 1de13267fc..1816e3370d 100644 --- a/examples/noise_model.c +++ b/examples/noise_model.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/photon_noise_table.c b/examples/photon_noise_table.c index d3a21a48ee..e64b336c23 100644 --- a/examples/photon_noise_table.c +++ b/examples/photon_noise_table.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/scalable_decoder.c b/examples/scalable_decoder.c index 00fe820fd5..e77423e0a0 100644 --- a/examples/scalable_decoder.c +++ b/examples/scalable_decoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/examples/scalable_encoder.c b/examples/scalable_encoder.c index 5bfd1840b2..91d35a28bb 100644 --- a/examples/scalable_encoder.c +++ b/examples/scalable_encoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/set_maps.c b/examples/set_maps.c index 2593faba34..4f41b4700b 100644 --- a/examples/set_maps.c +++ b/examples/set_maps.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c index b6891dcbba..4f702f0132 100644 --- a/examples/simple_decoder.c +++ b/examples/simple_decoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c index c026706555..b08f97be16 100644 --- a/examples/simple_encoder.c +++ b/examples/simple_encoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/examples/svc_encoder_rtc.cc b/examples/svc_encoder_rtc.cc index c751e9868c..58f103f119 100644 --- a/examples/svc_encoder_rtc.cc +++ b/examples/svc_encoder_rtc.cc @@ -1,11 +1,12 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // This is an example demonstrating how to implement a multi-layer AOM diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c index 388f68bd4d..4d7c6537f1 100644 --- a/examples/twopass_encoder.c +++ b/examples/twopass_encoder.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/stats/aomstats.c b/stats/aomstats.c index a006ec030f..53ed4daac1 100644 --- a/stats/aomstats.c +++ b/stats/aomstats.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/stats/aomstats.h b/stats/aomstats.h index b9c71871a0..0347a5e31f 100644 --- a/stats/aomstats.h +++ b/stats/aomstats.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/stats/rate_hist.c b/stats/rate_hist.c index d79ebc5ad2..1b4f2ee001 100644 --- a/stats/rate_hist.c +++ b/stats/rate_hist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/stats/rate_hist.h b/stats/rate_hist.h index 55b8c5d439..3041924489 100644 --- a/stats/rate_hist.h +++ b/stats/rate_hist.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/accounting_test.cc b/test/accounting_test.cc index 033499d13b..a861c59026 100644 --- a/test/accounting_test.cc +++ b/test/accounting_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/acm_random.h b/test/acm_random.h index 15e8c9cc2e..56b229e0e6 100644 --- a/test/acm_random.h +++ b/test/acm_random.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/active_map_test.cc b/test/active_map_test.cc index de16541281..b9c44c13b6 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/test/allintra_end_to_end_test.cc b/test/allintra_end_to_end_test.cc index 8ec24aa686..c234cb54c3 100644 --- a/test/allintra_end_to_end_test.cc +++ b/test/allintra_end_to_end_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/altref_test.cc b/test/altref_test.cc index 081123cbe4..ea5f302a6b 100644 --- a/test/altref_test.cc +++ b/test/altref_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/aom_image_test.cc b/test/aom_image_test.cc index 0dfb912215..d9727911d6 100644 --- a/test/aom_image_test.cc +++ b/test/aom_image_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/aom_integer_test.cc b/test/aom_integer_test.cc index fcbbfb4d48..43dbbd03e7 100644 --- a/test/aom_integer_test.cc +++ b/test/aom_integer_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/aom_mem_test.cc b/test/aom_mem_test.cc index 849ba64435..be0be00e73 100644 --- a/test/aom_mem_test.cc +++ b/test/aom_mem_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc index 674a883ea2..f97bad274f 100644 --- a/test/aq_segment_test.cc +++ b/test/aq_segment_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/arf_freq_test.cc b/test/arf_freq_test.cc index f51444da4d..b38f038c7a 100644 --- a/test/arf_freq_test.cc +++ b/test/arf_freq_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/test/av1_common_int_test.cc b/test/av1_common_int_test.cc index dde2542e3d..fb8a6b0d23 100644 --- a/test/av1_common_int_test.cc +++ b/test/av1_common_int_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/av1_config_test.cc b/test/av1_config_test.cc index 3ff816c163..1e89f9d4d6 100644 --- a/test/av1_config_test.cc +++ b/test/av1_config_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc index b6458b0ef9..ac723017bb 100644 --- a/test/av1_convolve_scale_test.cc +++ b/test/av1_convolve_scale_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 2c630b7dbb..aac8006e50 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/av1_encoder_parms_get_to_decoder.cc b/test/av1_encoder_parms_get_to_decoder.cc index 402e70c34a..b690f3b648 100644 --- a/test/av1_encoder_parms_get_to_decoder.cc +++ b/test/av1_encoder_parms_get_to_decoder.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/av1_ext_tile_test.cc b/test/av1_ext_tile_test.cc index 59c44cad12..7048927cd9 100644 --- a/test/av1_ext_tile_test.cc +++ b/test/av1_ext_tile_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/av1_external_partition_test.cc b/test/av1_external_partition_test.cc index 88f6216fa5..10c130683e 100644 --- a/test/av1_external_partition_test.cc +++ b/test/av1_external_partition_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
diff --git a/test/av1_fwd_txfm1d_test.cc b/test/av1_fwd_txfm1d_test.cc
index 6bae9f8364..5a848cca32 100644
--- a/test/av1_fwd_txfm1d_test.cc
+++ b/test/av1_fwd_txfm1d_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc
index 4a5a634545..d93d59ad55 100644
--- a/test/av1_fwd_txfm2d_test.cc
+++ b/test/av1_fwd_txfm2d_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
index 2c57362a82..649e367751 100644
--- a/test/av1_highbd_iht_test.cc
+++ b/test/av1_highbd_iht_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_horz_only_frame_superres_test.cc b/test/av1_horz_only_frame_superres_test.cc
index e9cf02e202..a337211f64 100644
--- a/test/av1_horz_only_frame_superres_test.cc
+++ b/test/av1_horz_only_frame_superres_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_inv_txfm1d_test.cc b/test/av1_inv_txfm1d_test.cc
index e70b22a35a..13317f9ad8 100644
--- a/test/av1_inv_txfm1d_test.cc
+++ b/test/av1_inv_txfm1d_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
index 35a87a43b8..866c13e8b8 100644
--- a/test/av1_inv_txfm2d_test.cc
+++ b/test/av1_inv_txfm2d_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
index 7e66a8e01d..ce08986e5f 100644
--- a/test/av1_k_means_test.cc
+++ b/test/av1_k_means_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_key_value_api_test.cc b/test/av1_key_value_api_test.cc
index a5734f6beb..a25fbcf050 100644
--- a/test/av1_key_value_api_test.cc
+++ b/test/av1_key_value_api_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc
index 4201ea6ce6..dc714920b7 100644
--- a/test/av1_nn_predict_test.cc
+++ b/test/av1_nn_predict_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
index c8af14a356..3e6697186b 100644
--- a/test/av1_quantize_test.cc
+++ b/test/av1_quantize_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_round_shift_array_test.cc b/test/av1_round_shift_array_test.cc
index 937e8645a5..67cacaf67d 100644
--- a/test/av1_round_shift_array_test.cc
+++ b/test/av1_round_shift_array_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_softmax_test.cc b/test/av1_softmax_test.cc
index 2b04af1342..928b5857e5 100644
--- a/test/av1_softmax_test.cc
+++ b/test/av1_softmax_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_temporal_denoiser_test.cc b/test/av1_temporal_denoiser_test.cc
index 7aa8fb6a66..35a682aa10 100644
--- a/test/av1_temporal_denoiser_test.cc
+++ b/test/av1_temporal_denoiser_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_txfm_test.cc b/test/av1_txfm_test.cc
index 23e260b32d..5d633d32df 100644
--- a/test/av1_txfm_test.cc
+++ b/test/av1_txfm_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_txfm_test.h b/test/av1_txfm_test.h
index d285e3d637..337304d7ed 100644
--- a/test/av1_txfm_test.h
+++ b/test/av1_txfm_test.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc
index 2234561b7d..4967e9cb48 100644
--- a/test/av1_wedge_utils_test.cc
+++ b/test/av1_wedge_utils_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 6f4c2ff332..21c34ccc36 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <stdlib.h>
diff --git a/test/avif_progressive_test.cc b/test/avif_progressive_test.cc
index f3e2ef2af9..d75f7063d8 100644
--- a/test/avif_progressive_test.cc
+++ b/test/avif_progressive_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/binary_codes_test.cc b/test/binary_codes_test.cc
index 2c2dfb45a8..e9dc85e88f 100644
--- a/test/binary_codes_test.cc
+++ b/test/binary_codes_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
index f9549bccb2..e6e4de1ca2 100644
--- a/test/blend_a64_mask_1d_test.cc
+++ b/test/blend_a64_mask_1d_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index fafc7f0329..3538677b43 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/block_test.cc b/test/block_test.cc
index 686180cf87..6714393ec3 100644
--- a/test/block_test.cc
+++ b/test/block_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/boolcoder_test.cc b/test/boolcoder_test.cc
index 52c58e0b2e..c36980cc7b 100644
--- a/test/boolcoder_test.cc
+++ b/test/boolcoder_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/borders_test.cc b/test/borders_test.cc
index 594c3e8429..df793b6c95 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/cdef_test.cc b/test/cdef_test.cc
index ac0591f6a8..601b0384c6 100644
--- a/test/cdef_test.cc
+++ b/test/cdef_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 7fdea04c36..bde346a378 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/cnn_test.cc b/test/cnn_test.cc
index e5114b56ce..f57e26d0f8 100644
--- a/test/cnn_test.cc
+++ b/test/cnn_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 7ffc465a7b..cf3b38beba 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/coding_path_sync.cc b/test/coding_path_sync.cc
index f7b7eace90..79098376d4 100644
--- a/test/coding_path_sync.cc
+++ b/test/coding_path_sync.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index 2f81d7e9b7..94a888713a 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/comp_avg_pred_test.h b/test/comp_avg_pred_test.h
index 396df2e2dd..fd8a8046a5 100644
--- a/test/comp_avg_pred_test.h
+++ b/test/comp_avg_pred_test.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/comp_mask_pred_test.cc b/test/comp_mask_pred_test.cc
index b65730aa57..6b70ca23ff 100644
--- a/test/comp_mask_pred_test.cc
+++ b/test/comp_mask_pred_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 41e838ac6a..18cfc71170 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/corner_match_test.cc b/test/corner_match_test.cc
index 895c8ad7d3..76f5345950 100644
--- a/test/corner_match_test.cc
+++ b/test/corner_match_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 972d800270..a6ab219ae0 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/cpu_used_firstpass_test.cc b/test/cpu_used_firstpass_test.cc
index 53db8b0d13..46d97f444c 100644
--- a/test/cpu_used_firstpass_test.cc
+++ b/test/cpu_used_firstpass_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 9b73f79aed..facd86e36b 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/datarate_test.h b/test/datarate_test.h
index 869c22150a..24ce832a6b 100644
--- a/test/datarate_test.h
+++ b/test/datarate_test.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc
index 591a167e94..e07b3a30d7 100644
--- a/test/decode_api_test.cc
+++ b/test/decode_api_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/decode_multithreaded_test.cc b/test/decode_multithreaded_test.cc
index 4e06f1afac..18086a2d6a 100644
--- a/test/decode_multithreaded_test.cc
+++ b/test/decode_multithreaded_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 030035466c..ab80dbc264 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/decode_scalability_test.cc b/test/decode_scalability_test.cc
index d66c8ec719..2102ddb8c2 100644
--- a/test/decode_scalability_test.cc
+++ b/test/decode_scalability_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index f44d670556..acfc882ce7 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 311898ecf0..7812d708ec 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/deltaq_mode_test.cc b/test/deltaq_mode_test.cc
index 5960d276d1..424af81405 100644
--- a/test/deltaq_mode_test.cc
+++ b/test/deltaq_mode_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/disflow_test.cc b/test/disflow_test.cc
index bee9e1261c..5c07b0758e 100644
--- a/test/disflow_test.cc
+++ b/test/disflow_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/divu_small_test.cc b/test/divu_small_test.cc
index 496fbc1f8e..7ddb22d8fd 100644
--- a/test/divu_small_test.cc
+++ b/test/divu_small_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc
index 50d5320e8a..0938a3db11 100644
--- a/test/dr_prediction_test.cc
+++ b/test/dr_prediction_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/dropframe_encode_test.cc b/test/dropframe_encode_test.cc
index 4a54c0b95c..4e0feb330a 100644
--- a/test/dropframe_encode_test.cc
+++ b/test/dropframe_encode_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/ec_test.cc b/test/ec_test.cc
index a5284deac0..b60ccbf8b6 100644
--- a/test/ec_test.cc
+++ b/test/ec_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 379d8d6821..36e79145f6 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
index b52cf3392c..9ee7ab05d5 100644
--- a/test/encode_perf_test.cc
+++ b/test/encode_perf_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/encode_small_width_height_test.cc b/test/encode_small_width_height_test.cc
index 22f69396d9..893e5ede35 100644
--- a/test/encode_small_width_height_test.cc
+++ b/test/encode_small_width_height_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index b5c506c6d3..fb5ddcae7a 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index d1e6615cd7..9a66e4424f 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/encodemb_test.cc b/test/encodemb_test.cc
index 6165fc33f5..f84b23efb7 100644
--- a/test/encodemb_test.cc
+++ b/test/encodemb_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc
index 49b0fba94a..30cf053d5b 100644
--- a/test/encodetxb_test.cc
+++ b/test/encodetxb_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/end_to_end_psnr_test.cc b/test/end_to_end_psnr_test.cc
index 687308da8c..722ea54dd5 100644
--- a/test/end_to_end_psnr_test.cc
+++ b/test/end_to_end_psnr_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/end_to_end_qmpsnr_test.cc b/test/end_to_end_qmpsnr_test.cc
index 7a755a7a51..e5d8afd88b 100644
--- a/test/end_to_end_qmpsnr_test.cc
+++ b/test/end_to_end_qmpsnr_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/end_to_end_ssim_test.cc b/test/end_to_end_ssim_test.cc
index f1b0cae75f..ee13e0fc24 100644
--- a/test/end_to_end_ssim_test.cc
+++ b/test/end_to_end_ssim_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index e7cd870a98..45d363e16e 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index d41884df2b..ef10ee85fc 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/ethread_test.cc b/test/ethread_test.cc
index 415f5de269..fd5f3dd0ce 100644
--- a/test/ethread_test.cc
+++ b/test/ethread_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index 0bf0f6bdc6..5f4fb9f324 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 9cbf208adb..1fb3a9a214 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/fft_test.cc b/test/fft_test.cc
index 06a17a3f8f..b110601af7 100644
--- a/test/fft_test.cc
+++ b/test/fft_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/film_grain_table_test.cc b/test/film_grain_table_test.cc
index 2c6906f73b..ca6f1be9e0 100644
--- a/test/film_grain_table_test.cc
+++ b/test/film_grain_table_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/filterintra_test.cc b/test/filterintra_test.cc
index 0a0ab11dc3..5d9b839c1d 100644
--- a/test/filterintra_test.cc
+++ b/test/filterintra_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/firstpass_test.cc b/test/firstpass_test.cc
index 1f4f3b7853..020f6b3b03 100644
--- a/test/firstpass_test.cc
+++ b/test/firstpass_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/force_key_frame_test.cc b/test/force_key_frame_test.cc
index 2b85d26530..f00b80307d 100644
--- a/test/force_key_frame_test.cc
+++ b/test/force_key_frame_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/forced_max_frame_width_height_test.cc b/test/forced_max_frame_width_height_test.cc
index 3347713c5b..ccde456d1b 100644
--- a/test/forced_max_frame_width_height_test.cc
+++ b/test/forced_max_frame_width_height_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/frame_parallel_enc_test.cc b/test/frame_parallel_enc_test.cc
index 86d5ddb7d4..cb694296af 100644
--- a/test/frame_parallel_enc_test.cc
+++ b/test/frame_parallel_enc_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
index cff353a294..029a823c1f 100644
--- a/test/frame_resize_test.cc
+++ b/test/frame_resize_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index ea8cf47ab8..32d9a45514 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h
index 2268b9f2ad..e8f9835253 100644
--- a/test/function_equivalence_test.h
+++ b/test/function_equivalence_test.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/fwht4x4_test.cc b/test/fwht4x4_test.cc
index bb9e218f6f..630bd141c7 100644
--- a/test/fwht4x4_test.cc
+++ b/test/fwht4x4_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/gf_pyr_height_test.cc b/test/gf_pyr_height_test.cc
index 0996d80c25..a38330bbbf 100644
--- a/test/gf_pyr_height_test.cc
+++ b/test/gf_pyr_height_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index b01e78faaa..aa9ceb08ad 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <algorithm>
diff --git a/test/hash_test.cc b/test/hash_test.cc
index a1de9323db..5fd68a52ae 100644
--- a/test/hash_test.cc
+++ b/test/hash_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
index 59bca9bcec..7650f7f44d 100644
--- a/test/hbd_metrics_test.cc
+++ b/test/hbd_metrics_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/hiprec_convolve_test.cc b/test/hiprec_convolve_test.cc
index 78883ccddf..1eba7cd6ae 100644
--- a/test/hiprec_convolve_test.cc
+++ b/test/hiprec_convolve_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/hiprec_convolve_test_util.cc b/test/hiprec_convolve_test_util.cc
index 6d7902fd04..21ca10725f 100644
--- a/test/hiprec_convolve_test_util.cc
+++ b/test/hiprec_convolve_test_util.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/hiprec_convolve_test_util.h b/test/hiprec_convolve_test_util.h
index beae5c729b..d0ce96ec5d 100644
--- a/test/hiprec_convolve_test_util.h
+++ b/test/hiprec_convolve_test_util.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/horver_correlation_test.cc b/test/horver_correlation_test.cc
index 5e397ffdf7..760461e3f0 100644
--- a/test/horver_correlation_test.cc
+++ b/test/horver_correlation_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index 595ed548c7..963df87c7d 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/i420_video_source.h b/test/i420_video_source.h
index 233e7152b9..f5e85a07c9 100644
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/intra_edge_test.cc b/test/intra_edge_test.cc
index 96ee65466b..eb56b627d8 100644
--- a/test/intra_edge_test.cc
+++ b/test/intra_edge_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/intrabc_test.cc b/test/intrabc_test.cc
index 2c60596ab8..301cc6dd05 100644
--- a/test/intrabc_test.cc
+++ b/test/intrabc_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 8796e8ba69..157ad26e8a 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 791cdb8928..ca173959db 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h
index 85731f5566..ed92e73137 100644
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/kf_test.cc b/test/kf_test.cc
index 7d8cbfe8c6..14fc064efe 100644
--- a/test/kf_test.cc
+++ b/test/kf_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/level_test.cc b/test/level_test.cc
index 6d59f45272..637e3e0ef5 100644
--- a/test/level_test.cc
+++ b/test/level_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/log2_test.cc b/test/log2_test.cc
index 71cf8b25d9..3b1c979231 100644
--- a/test/log2_test.cc
+++ b/test/log2_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/loopfilter_control_test.cc b/test/loopfilter_control_test.cc
index 9c00235e19..d120aaa6fe 100644
--- a/test/loopfilter_control_test.cc
+++ b/test/loopfilter_control_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/lossless_test.cc b/test/lossless_test.cc
index 756ad05019..fff874ca86 100644
--- a/test/lossless_test.cc
+++ b/test/lossless_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 04b1c86d4d..998ebb8962 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index bb037460d1..0d5986cdb8 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index 8482a12f53..14d36d8b25 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/md5_helper.h b/test/md5_helper.h
index 69f1ae76b0..c3e98b8f09 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/metadata_test.cc b/test/metadata_test.cc
index 9467c29e86..e1538f6962 100644
--- a/test/metadata_test.cc
+++ b/test/metadata_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
index 33be4ff6dc..dd88541d0c 100644
--- a/test/minmax_test.cc
+++ b/test/minmax_test.cc
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ * Copyright (c) 2023 The WebM project authors. All rights reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/monochrome_test.cc b/test/monochrome_test.cc
index f22b5fe0f2..1989131095 100644
--- a/test/monochrome_test.cc
+++ b/test/monochrome_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/motion_vector_test.cc b/test/motion_vector_test.cc
index 4fc8d53d95..2d897caf7a 100644
--- a/test/motion_vector_test.cc
+++ b/test/motion_vector_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/mv_cost_test.cc b/test/mv_cost_test.cc
index 73d56665bf..a0aed58495 100644
--- a/test/mv_cost_test.cc
+++ b/test/mv_cost_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc
index 87f607c155..2ef5dc2897 100644
--- a/test/noise_model_test.cc
+++ b/test/noise_model_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc
index 967b677666..0f01e90702 100644
--- a/test/obmc_sad_test.cc
+++ b/test/obmc_sad_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/obmc_variance_test.cc b/test/obmc_variance_test.cc
index 5f21a8a6c1..a629d3c7dd 100644
--- a/test/obmc_variance_test.cc
+++ b/test/obmc_variance_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/pickrst_test.cc b/test/pickrst_test.cc
index 04b6f45652..3ab22e1706 100644
--- a/test/pickrst_test.cc
+++ b/test/pickrst_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/postproc_filters_test.cc b/test/postproc_filters_test.cc
index 9584dd8c35..f907aef278 100644
--- a/test/postproc_filters_test.cc
+++ b/test/postproc_filters_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/quant_test.cc b/test/quant_test.cc
index afbabb3147..40ca470e66 100644
--- a/test/quant_test.cc
+++ b/test/quant_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 61f26ea57f..bebf72fc32 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc
index cc054b6926..66911152e6 100644
--- a/test/ratectrl_rtc_test.cc
+++ b/test/ratectrl_rtc_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/ratectrl_test.cc b/test/ratectrl_test.cc
index d951b1197f..259f12300e 100644
--- a/test/ratectrl_test.cc
+++ b/test/ratectrl_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/rd_test.cc b/test/rd_test.cc
index 0c481fcbb6..8c9a54c476 100644
--- a/test/rd_test.cc
+++ b/test/rd_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/reconinter_test.cc b/test/reconinter_test.cc
index ee1a9893db..772cb9b123 100644
--- a/test/reconinter_test.cc
+++ b/test/reconinter_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/register_state_check.h b/test/register_state_check.h
index 4aad81469e..b70fb503d3 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/resize_test.cc b/test/resize_test.cc
index a84a4654a8..1682f9fd83 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc
index f1f9e019c2..2d07f0fd7b 100644
--- a/test/rt_end_to_end_test.cc
+++ b/test/rt_end_to_end_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 64cf8006be..cd29a6f1f0 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/sb_multipass_test.cc b/test/sb_multipass_test.cc
index e27a2c60ee..f70ebd2ec0 100644
--- a/test/sb_multipass_test.cc
+++ b/test/sb_multipass_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/sb_qp_sweep_test.cc b/test/sb_qp_sweep_test.cc
index 6c76a40b2a..5555d20e26 100644
--- a/test/sb_qp_sweep_test.cc
+++ b/test/sb_qp_sweep_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/scalability_test.cc b/test/scalability_test.cc
index 12cb03cac4..6196b3e177 100644
--- a/test/scalability_test.cc
+++ b/test/scalability_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/scan_test.cc b/test/scan_test.cc
index 3ba39de3d5..54a05ead53 100644
--- a/test/scan_test.cc
+++ b/test/scan_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/screen_content_test.cc b/test/screen_content_test.cc
index 974c50b3c6..681843f692 100644
--- a/test/screen_content_test.cc
+++ b/test/screen_content_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/segment_binarization_sync.cc b/test/segment_binarization_sync.cc
index 108e66a838..db866cfc65 100644
--- a/test/segment_binarization_sync.cc
+++ b/test/segment_binarization_sync.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index 3dd513b6e0..9f793af573 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/sharpness_test.cc b/test/sharpness_test.cc
index 054fbcc660..5002a4469d 100644
--- a/test/sharpness_test.cc
+++ b/test/sharpness_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_avx2_test.cc b/test/simd_avx2_test.cc
index 8a012bff88..378b9957f1 100644
--- a/test/simd_avx2_test.cc
+++ b/test/simd_avx2_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_cmp_avx2.cc b/test/simd_cmp_avx2.cc
index cda632bcdf..53aa48c15e 100644
--- a/test/simd_cmp_avx2.cc
+++ b/test/simd_cmp_avx2.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_cmp_impl.h b/test/simd_cmp_impl.h
index cf85a471cd..4bdf4975d3 100644
--- a/test/simd_cmp_impl.h
+++ b/test/simd_cmp_impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_cmp_sse2.cc b/test/simd_cmp_sse2.cc
index f7827a7fa1..a5676d5e93 100644
--- a/test/simd_cmp_sse2.cc
+++ b/test/simd_cmp_sse2.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_cmp_sse4.cc b/test/simd_cmp_sse4.cc
index 3566764b64..f66aa11646 100644
--- a/test/simd_cmp_sse4.cc
+++ b/test/simd_cmp_sse4.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_cmp_ssse3.cc b/test/simd_cmp_ssse3.cc
index 57bf135ddb..3775fdcc0b 100644
--- a/test/simd_cmp_ssse3.cc
+++ b/test/simd_cmp_ssse3.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_impl.h b/test/simd_impl.h
index b564a7f4b3..d055ebae02 100644
--- a/test/simd_impl.h
+++ b/test/simd_impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_sse2_test.cc b/test/simd_sse2_test.cc
index b37a931b38..98c9399a27 100644
--- a/test/simd_sse2_test.cc
+++ b/test/simd_sse2_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_sse4_test.cc b/test/simd_sse4_test.cc
index b1c9d5cd88..bb6954661b 100644
--- a/test/simd_sse4_test.cc
+++ b/test/simd_sse4_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/simd_ssse3_test.cc b/test/simd_ssse3_test.cc
index d95c26fb5e..13d0ca83c5 100644
--- a/test/simd_ssse3_test.cc
+++ b/test/simd_ssse3_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/sse_sum_test.cc b/test/sse_sum_test.cc
index fd6fb886d3..0dba47d452 100644
--- a/test/sse_sum_test.cc
+++ b/test/sse_sum_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/still_picture_test.cc b/test/still_picture_test.cc
index 3dfb1c8693..2908a5d45a 100644
--- a/test/still_picture_test.cc
+++ b/test/still_picture_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index e591e6543d..1f454b6bf7 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index 7b98ced523..e2c455653d 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 16fbb0bd3e..2284ce524c 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 85f68b817e..52e2366aa8 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/test_aom_rc.cc b/test/test_aom_rc.cc
index 0182b62ec8..2252443e1a 100644
--- a/test/test_aom_rc.cc
+++ b/test/test_aom_rc.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index d5c94be092..b5dbadd391 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/test_libaom.cc b/test/test_libaom.cc
index 26abbb0a06..2a29b45ca0 100644
--- a/test/test_libaom.cc
+++ b/test/test_libaom.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 39414e32e4..fd9300200f 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 09736d1ed8..bb8f0fbc34 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License diff --git a/test/test_vectors.h b/test/test_vectors.h index be37f6e377..34b12b21d1 100644 --- a/test/test_vectors.h +++ b/test/test_vectors.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/tile_config_test.cc b/test/tile_config_test.cc index e2ac59284b..4bd4a67b4f 100644 --- a/test/tile_config_test.cc +++ b/test/tile_config_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index 84406dd3fb..fcfcbce783 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/time_stamp_test.cc b/test/time_stamp_test.cc index 5de98b719e..17320415a2 100644 --- a/test/time_stamp_test.cc +++ b/test/time_stamp_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/tpl_model_test.cc b/test/tpl_model_test.cc index 91eb5e94d3..4f15fb571d 100644 --- a/test/tpl_model_test.cc +++ b/test/tpl_model_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/transform_test_base.h b/test/transform_test_base.h index 55e78fef48..92385c3e68 100644 --- a/test/transform_test_base.h +++ b/test/transform_test_base.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/util.h b/test/util.h index 29df709c4f..e043d87097 100644 --- a/test/util.h +++ b/test/util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/variance_test.cc b/test/variance_test.cc index 261c080028..283c174308 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/video_source.h b/test/video_source.h index 9d73d7b253..da51f6a2c4 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc index 8844ba77ca..bade6799b9 100644 --- a/test/warp_filter_test.cc +++ b/test/warp_filter_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc index b7c60c2fdb..9dadbaf22f 100644 --- a/test/warp_filter_test_util.cc +++ b/test/warp_filter_test_util.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/warp_filter_test_util.h b/test/warp_filter_test_util.h index 364368ac0c..b2f370e614 100644 --- a/test/warp_filter_test_util.h +++ b/test/warp_filter_test_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/webm_video_source.h b/test/webm_video_source.h index 845abd6dce..71bf914dec 100644 --- a/test/webm_video_source.h +++ b/test/webm_video_source.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/webmenc_test.cc b/test/webmenc_test.cc index acd795f2ec..eba0760ee2 100644 --- a/test/webmenc_test.cc +++ b/test/webmenc_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/wiener_test.cc b/test/wiener_test.cc index 77d2769aaa..d0bd045e3d 100644 --- a/test/wiener_test.cc +++ b/test/wiener_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/test/y4m_test.cc b/test/y4m_test.cc index a4ed13f7c5..beec69c589 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h index 1369e4e280..2af61d9db9 100644 --- a/test/y4m_video_source.h +++ b/test/y4m_video_source.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h index 77d5dfa73c..daa45ca05d 100644 --- a/test/yuv_video_source.h +++ b/test/yuv_video_source.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/aom_entropy_optimizer.c b/tools/aom_entropy_optimizer.c index fa7bf7ea9e..2961db728b 100644 --- a/tools/aom_entropy_optimizer.c +++ b/tools/aom_entropy_optimizer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/auto_refactor/c_files/decl_status_code.c b/tools/auto_refactor/c_files/decl_status_code.c index a444553bb1..f8bd1ea277 100644 --- a/tools/auto_refactor/c_files/decl_status_code.c +++ b/tools/auto_refactor/c_files/decl_status_code.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/auto_refactor/c_files/func_in_out.c b/tools/auto_refactor/c_files/func_in_out.c index 7f37bbae7e..89a25e82e9 100644 --- a/tools/auto_refactor/c_files/func_in_out.c +++ b/tools/auto_refactor/c_files/func_in_out.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/auto_refactor/c_files/global_variable.c b/tools/auto_refactor/c_files/global_variable.c index 26d5385e97..5173966a59 100644 --- a/tools/auto_refactor/c_files/global_variable.c +++ b/tools/auto_refactor/c_files/global_variable.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/tools/auto_refactor/c_files/parse_lvalue.c b/tools/auto_refactor/c_files/parse_lvalue.c index fa44d72381..081e69f1fc 100644 --- a/tools/auto_refactor/c_files/parse_lvalue.c +++ b/tools/auto_refactor/c_files/parse_lvalue.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/auto_refactor/c_files/simple_code.c b/tools/auto_refactor/c_files/simple_code.c index 902cd1d826..09d7685476 100644 --- a/tools/auto_refactor/c_files/simple_code.c +++ b/tools/auto_refactor/c_files/simple_code.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/auto_refactor/c_files/struct_code.c b/tools/auto_refactor/c_files/struct_code.c index 7f24d41075..2c17553d24 100644 --- a/tools/auto_refactor/c_files/struct_code.c +++ b/tools/auto_refactor/c_files/struct_code.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/dump_obu.cc b/tools/dump_obu.cc index b9ff985c44..f1f6c62309 100644 --- a/tools/dump_obu.cc +++ b/tools/dump_obu.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/obu_parser.cc b/tools/obu_parser.cc index 4053615f11..224ebca9da 100644 --- a/tools/obu_parser.cc +++ b/tools/obu_parser.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/obu_parser.h b/tools/obu_parser.h index 1d7d2d794b..6c4cda4308 100644 --- a/tools/obu_parser.h +++ b/tools/obu_parser.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/txfm_analyzer/txfm_gen_code.cc b/tools/txfm_analyzer/txfm_gen_code.cc index 7c5400b91a..1f1ba4359c 100644 --- a/tools/txfm_analyzer/txfm_gen_code.cc +++ b/tools/txfm_analyzer/txfm_gen_code.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/tools/txfm_analyzer/txfm_graph.cc b/tools/txfm_analyzer/txfm_graph.cc index a249061008..f46cc8faa8 100644 --- a/tools/txfm_analyzer/txfm_graph.cc +++ b/tools/txfm_analyzer/txfm_graph.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/txfm_analyzer/txfm_graph.h b/tools/txfm_analyzer/txfm_graph.h index 8dc36146dd..5a66632fd0 100644 --- a/tools/txfm_analyzer/txfm_graph.h +++ b/tools/txfm_analyzer/txfm_graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -- GitLab From 782840ba6d8db0c4e6fb4a16f130106b15ee2871 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Fri, 28 May 2021 11:32:44 -0700 Subject: [PATCH 219/391] Set obu_extension_flag correctly for frame OBUs Add the has_nonzero_operating_point_idc boolean member to the SequenceHeader struct. Pass the is_layer_specific_obu and has_nonzero_operating_point_idc boolean parameters to av1_write_obu_header(). This CL changes only the encoder side of libaom and is backward compatible with current AV1 decoders. Bug: aomedia:3076, aomedia:3582 Change-Id: I47bfe94e9d3f18005f4d574a7a4fe9eb963ae31f --- av1/av1_cx_iface.c | 9 ++++--- av1/common/av1_common_int.h | 5 ++++ av1/decoder/obu.c | 5 ++++ av1/encoder/bitstream.c | 53 ++++++++++++++++++++++++++++--------- av1/encoder/bitstream.h | 7 ++++- av1/encoder/encoder.c | 13 ++++++--- 6 files changed, 72 insertions(+), 20 deletions(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 6f9125e6a5..6205257ad8 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <limits.h> +#include <stdbool.h> #include <stdint.h> #include <stdlib.h> #include <string.h> @@ -3352,9 +3353,11 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, const size_t move_offset = obu_header_size + length_field_size; memmove(ctx->cx_data + move_offset, ctx->cx_data, cpi_data.frame_size); - obu_header_size = - av1_write_obu_header(&ppi->level_params, &cpi->frame_header_count, - OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data); + obu_header_size = av1_write_obu_header( + &ppi->level_params, &cpi->frame_header_count, + OBU_TEMPORAL_DELIMITER, + /*is_layer_specific_obu=*/false, + ppi->seq_params.has_nonzero_operating_point_idc, 0, ctx->cx_data); // OBUs are preceded/succeeded by an unsigned leb128 coded integer. if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h index 857a2ea3db..17afbf40dd 100644 --- a/av1/common/av1_common_int.h +++ b/av1/common/av1_common_int.h @@ -12,6 +12,8 @@ #ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_ #define AOM_AV1_COMMON_AV1_COMMON_INT_H_ +#include <stdbool.h> + #include "config/aom_config.h" #include "config/av1_rtcd.h" @@ -317,6 +319,9 @@ typedef struct SequenceHeader { // Operating point info. 
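  // (Editorial annotation, a sketch rather than patch content:
  //  operating_point_idc[op] is the 12-bit mask signalled per operating
  //  point; the encoder.c hunk below builds it as
  //      (spatial_layer_mask << 8) | temporal_layer_mask
  //  so it is 0 only when there is a single operating point covering the
  //  whole stream. The boolean added just below is therefore true exactly
  //  for scalable streams, and it is what gates the OBU extension byte in
  //  av1_write_obu_header().)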
int operating_points_cnt_minus_1; int operating_point_idc[MAX_NUM_OPERATING_POINTS]; + // True if operating_point_idc[op] is not equal to 0 for any value of op from + // 0 to operating_points_cnt_minus_1. + bool has_nonzero_operating_point_idc; int timing_info_present; aom_timing_info_t timing_info; uint8_t decoder_model_info_present_flag; diff --git a/av1/decoder/obu.c b/av1/decoder/obu.c index fb1b0e8156..6d6aa41945 100644 --- a/av1/decoder/obu.c +++ b/av1/decoder/obu.c @@ -10,6 +10,7 @@ */ #include <assert.h> +#include <stdbool.h> #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" @@ -134,6 +135,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->display_model_info_present_flag = 0; seq_params->operating_points_cnt_minus_1 = 0; seq_params->operating_point_idc[0] = 0; + seq_params->has_nonzero_operating_point_idc = false; if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) { pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; @@ -155,9 +157,12 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->display_model_info_present_flag = aom_rb_read_bit(rb); seq_params->operating_points_cnt_minus_1 = aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); + seq_params->has_nonzero_operating_point_idc = false; for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { seq_params->operating_point_idc[i] = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); + if (seq_params->operating_point_idc[i] != 0) + seq_params->has_nonzero_operating_point_idc = true; if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) { pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index e485faa73e..4f5ec12d7d 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -11,6 +11,7 @@ #include <assert.h> #include <limits.h> +#include <stdbool.h> #include <stdio.h> #include "aom/aom_encoder.h" @@ -3355,21 +3356,28 @@ static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst, uint32_t av1_write_obu_header(AV1LevelParams *const level_params, int *frame_header_count, OBU_TYPE obu_type, + bool is_layer_specific_obu, + bool has_nonzero_operating_point_idc, int obu_extension, uint8_t *const dst) { + assert(IMPLIES(!is_layer_specific_obu, obu_extension == 0)); + assert(IMPLIES(!has_nonzero_operating_point_idc, obu_extension == 0)); + if (level_params->keep_level_stats && (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER)) ++(*frame_header_count); struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; + int obu_extension_flag = + has_nonzero_operating_point_idc && is_layer_specific_obu; aom_wb_write_literal(&wb, 0, 1); // forbidden bit. aom_wb_write_literal(&wb, (int)obu_type, 4); - aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1); + aom_wb_write_literal(&wb, obu_extension_flag, 1); aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field aom_wb_write_literal(&wb, 0, 1); // reserved - if (obu_extension) { + if (obu_extension_flag) { aom_wb_write_literal(&wb, obu_extension & 0xFF, 8); } @@ -3536,8 +3544,14 @@ static uint32_t init_large_scale_tile_obu_header( // For large_scale_tile case, we always have only one tile group, so it can // be written as an OBU_FRAME. const OBU_TYPE obu_type = OBU_FRAME; + // We pass obu_extension=0 to av1_write_obu_header(), so + // has_nonzero_operating_point_idc must be false. 
+ assert(!cpi->common.seq_params->has_nonzero_operating_point_idc); lst_obu->tg_hdr_size = av1_write_obu_header( - level_params, &cpi->frame_header_count, obu_type, 0, *data); + level_params, &cpi->frame_header_count, obu_type, + /*is_layer_specific_obu=*/true, + cpi->common.seq_params->has_nonzero_operating_point_idc, + /*obu_extension=*/0, *data); *data += lst_obu->tg_hdr_size; const uint32_t frame_header_size = @@ -3735,6 +3749,8 @@ void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd, const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP; *curr_tg_hdr_size = av1_write_obu_header( &cpi->ppi->level_params, &cpi->frame_header_count, obu_type, + /*is_layer_specific_obu=*/true, + cm->seq_params->has_nonzero_operating_point_idc, pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr); pack_bs_params->obu_header_size = *curr_tg_hdr_size; @@ -3833,9 +3849,12 @@ void av1_write_last_tile_info( // Rewrite the OBU header to change the OBU type to Redundant Frame // Header. - av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count, - OBU_REDUNDANT_FRAME_HEADER, obu_extn_header, - &curr_tg_start[fh_info->obu_header_byte_offset]); + av1_write_obu_header( + &cpi->ppi->level_params, &cpi->frame_header_count, + OBU_REDUNDANT_FRAME_HEADER, + /*is_layer_specific_obu=*/true, + cpi->common.seq_params->has_nonzero_operating_point_idc, + obu_extn_header, &curr_tg_start[fh_info->obu_header_byte_offset]); *curr_tg_data_size += (int)(fh_info->total_length); *total_size += (uint32_t)(fh_info->total_length); @@ -4134,9 +4153,13 @@ static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) { (cm->current_frame.frame_type != KEY_FRAME && current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) || current_metadata->insert_flag == AOM_MIF_ANY_FRAME) { - obu_header_size = av1_write_obu_header(&cpi->ppi->level_params, - &cpi->frame_header_count, - OBU_METADATA, 0, dst); + // Whether METADATA_TYPE_ITUT_T35 is layer-specific or not is + // payload-specific. Other metadata types are not layer-specific. + const bool is_layer_specific_obu = false; + obu_header_size = av1_write_obu_header( + &cpi->ppi->level_params, &cpi->frame_header_count, OBU_METADATA, + is_layer_specific_obu, + cm->seq_params->has_nonzero_operating_point_idc, 0, dst); obu_payload_size = av1_write_metadata_obu(current_metadata, dst + obu_header_size); length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst); @@ -4185,7 +4208,9 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, if (cm->current_frame.frame_type == INTRA_ONLY_FRAME || cm->current_frame.frame_type == KEY_FRAME) { obu_header_size = av1_write_obu_header( - level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data); + level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, + /*is_layer_specific_obu=*/false, + cm->seq_params->has_nonzero_operating_point_idc, 0, data); obu_payload_size = av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size); const size_t length_field_size = @@ -4208,9 +4233,11 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, if (write_frame_header) { // Write Frame Header OBU. 
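      // (Editorial annotation, a sketch rather than patch content: frame
      //  header OBUs are layer-specific, so after this change the writer
      //  computes
      //      obu_extension_flag = has_nonzero_operating_point_idc &&
      //                           is_layer_specific_obu;
      //  emitting the extension byte only for layer-specific OBUs in
      //  scalable streams, while sequence header and metadata OBUs keep
      //  the plain one-byte header.)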
fh_info.frame_header = data; - obu_header_size = - av1_write_obu_header(level_params, &cpi->frame_header_count, - OBU_FRAME_HEADER, obu_extension_header, data); + obu_header_size = av1_write_obu_header( + level_params, &cpi->frame_header_count, OBU_FRAME_HEADER, + /*is_layer_specific_obu=*/true, + cm->seq_params->has_nonzero_operating_point_idc, obu_extension_header, + data); obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb, data + obu_header_size, 1); diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h index f0b0fd0acb..232c43040e 100644 --- a/av1/encoder/bitstream.h +++ b/av1/encoder/bitstream.h @@ -16,6 +16,8 @@ extern "C" { #endif +#include <stdbool.h> + #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" @@ -89,9 +91,12 @@ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, uint8_t *const dst); // Writes the OBU header byte, and the OBU header extension byte when -// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'. +// has_nonzero_operating_point_idc is true and the OBU is layer-specific. +// Returns number of bytes written to 'dst'. uint32_t av1_write_obu_header(AV1LevelParams *const level_params, int *frame_header_count, OBU_TYPE obu_type, + bool is_layer_specific_obu, + bool has_nonzero_operating_point_idc, int obu_extension, uint8_t *const dst); int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index c4c333f624..1a132fe88e 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -9,12 +9,14 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include <limits.h> +#include <assert.h> #include <float.h> +#include <limits.h> #include <math.h> +#include <stdbool.h> #include <stdio.h> -#include <time.h> #include <stdlib.h> +#include <time.h> #include "av1/common/scale.h" #include "config/aom_config.h" @@ -547,6 +549,7 @@ void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, if (seq->operating_points_cnt_minus_1 == 0) { seq->operating_point_idc[0] = 0; + seq->has_nonzero_operating_point_idc = false; } else { // Set operating_point_idc[] such that the i=0 point corresponds to the // highest quality operating point (all layers), and subsequent @@ -555,11 +558,13 @@ void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, int i = 0; assert(seq->operating_points_cnt_minus_1 == (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1)); + seq->has_nonzero_operating_point_idc = true; for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) { for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) { seq->operating_point_idc[i] = (~(~0u << (ppi->number_spatial_layers - sl)) << 8) | ~(~0u << (ppi->number_temporal_layers - tl)); + assert(seq->operating_point_idc[i] != 0); i++; } } @@ -5387,7 +5392,9 @@ aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) { memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count, - OBU_SEQUENCE_HEADER, 0, + OBU_SEQUENCE_HEADER, + /*is_layer_specific_obu=*/false, + ppi->seq_params.has_nonzero_operating_point_idc, 0, &header_buf[0]) != obu_header_size) { return NULL; } -- GitLab From f4ad7fc3241f80b522481bb8a0aafb9ea1dbe8b2 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 12 Jun 2024 16:37:08 -0700 Subject: [PATCH 220/391] *.{asm,cmake,inc,pl,py,txt}: normalize copyright line 
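Add a trailing period to the first line of the AOMedia copyright notice in
the assembly, CMake, Perl, Python, and shell sources (plus the config and
doxygen templates), matching the form already used in the C and C++ files.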
Bug: aomedia:3525 Change-Id: Id6f0d40772e29fd330ad8466da42cd81f0b8d97a --- CMakeLists.txt | 2 +- aom_dsp/aom_dsp.cmake | 2 +- aom_dsp/aom_dsp_rtcd_defs.pl | 2 +- aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 2 +- aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm | 2 +- aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 2 +- aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm | 2 +- aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 2 +- aom_dsp/x86/highbd_intrapred_asm_sse2.asm | 2 +- aom_dsp/x86/highbd_sad4d_sse2.asm | 2 +- aom_dsp/x86/highbd_sad_sse2.asm | 2 +- aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 2 +- aom_dsp/x86/highbd_variance_impl_sse2.asm | 2 +- aom_dsp/x86/intrapred_asm_sse2.asm | 2 +- aom_dsp/x86/inv_wht_sse2.asm | 2 +- aom_dsp/x86/quantize_ssse3_x86_64.asm | 2 +- aom_dsp/x86/sad4d_sse2.asm | 2 +- aom_dsp/x86/sad_sse2.asm | 2 +- aom_dsp/x86/ssim_sse2_x86_64.asm | 2 +- aom_dsp/x86/subpel_variance_ssse3.asm | 2 +- aom_dsp/x86/subtract_sse2.asm | 2 +- aom_mem/aom_mem.cmake | 2 +- aom_ports/aom_ports.cmake | 2 +- aom_ports/float.asm | 2 +- aom_ports/x86_abi_support.asm | 2 +- aom_scale/aom_scale.cmake | 2 +- aom_scale/aom_scale_rtcd.pl | 2 +- aom_util/aom_util.cmake | 2 +- av1/av1.cmake | 2 +- av1/common/av1_rtcd_defs.pl | 2 +- av1/common/reconinter_template.inc | 2 +- av1/encoder/x86/av1_quantize_ssse3_x86_64.asm | 2 +- av1/encoder/x86/av1_ssim_opt_x86_64.asm | 2 +- av1/encoder/x86/dct_sse2.asm | 2 +- av1/encoder/x86/error_sse2.asm | 2 +- build/cmake/aom_config.c.template | 2 +- build/cmake/aom_config_defaults.cmake | 2 +- build/cmake/aom_configure.cmake | 2 +- build/cmake/aom_experiment_deps.cmake | 2 +- build/cmake/aom_install.cmake | 2 +- build/cmake/aom_optimization.cmake | 2 +- build/cmake/compiler_flags.cmake | 2 +- build/cmake/compiler_tests.cmake | 2 +- build/cmake/cpu.cmake | 2 +- build/cmake/dist.cmake | 2 +- build/cmake/exports.cmake | 2 +- build/cmake/exports_sources.cmake | 2 +- build/cmake/generate_aom_config_templates.cmake | 8 ++++---- build/cmake/generate_exports.cmake | 2 +- build/cmake/pkg_config.cmake | 2 +- build/cmake/rtcd.pl | 2 +- build/cmake/sanitizers.cmake | 2 +- build/cmake/toolchains/android.cmake | 2 +- build/cmake/toolchains/arm-ios-common.cmake | 2 +- build/cmake/toolchains/arm64-ios.cmake | 2 +- build/cmake/toolchains/arm64-linux-clang.cmake | 2 +- build/cmake/toolchains/arm64-linux-gcc.cmake | 2 +- build/cmake/toolchains/arm64-macos.cmake | 2 +- build/cmake/toolchains/arm64-mingw-gcc.cmake | 2 +- build/cmake/toolchains/armv7-ios.cmake | 2 +- build/cmake/toolchains/armv7-linux-gcc.cmake | 2 +- build/cmake/toolchains/armv7-mingw-gcc.cmake | 2 +- build/cmake/toolchains/armv7s-ios.cmake | 2 +- build/cmake/toolchains/i686-linux-gcc.cmake | 2 +- build/cmake/toolchains/ios-simulator-common.cmake | 2 +- build/cmake/toolchains/ppc-linux-gcc.cmake | 2 +- build/cmake/toolchains/riscv-linux-gcc.cmake | 2 +- build/cmake/toolchains/x86-ios-simulator.cmake | 2 +- build/cmake/toolchains/x86-linux.cmake | 2 +- build/cmake/toolchains/x86-macos.cmake | 2 +- build/cmake/toolchains/x86-mingw-gcc.cmake | 2 +- build/cmake/toolchains/x86_64-ios-simulator.cmake | 2 +- build/cmake/toolchains/x86_64-macos.cmake | 2 +- build/cmake/toolchains/x86_64-mingw-gcc.cmake | 2 +- build/cmake/util.cmake | 2 +- build/cmake/version.cmake | 2 +- build/cmake/version.pl | 4 ++-- common/ivf_dec.cmake | 2 +- docs.cmake | 2 +- examples/build_av1_dec_fuzzer.sh | 2 +- libs.doxy_template | 2 +- test/aomcx_set_ref.sh | 2 +- test/aomdec.sh | 2 +- test/aomenc.sh | 2 +- test/av1_c_vs_simd_encode.sh | 2 +- 
test/best_encode.sh | 2 +- test/decode_to_md5.sh | 2 +- test/decode_with_drops.sh | 2 +- test/dump_obu.sh | 2 +- test/examples.sh | 2 +- test/gviz_api.py | 2 +- test/lightfield_test.sh | 2 +- test/set_maps.sh | 2 +- test/simple_decoder.sh | 2 +- test/simple_encoder.sh | 2 +- test/svc_encoder_rtc.sh | 2 +- test/test.cmake | 2 +- test/test_data_download_worker.cmake | 2 +- test/test_data_util.cmake | 2 +- test/test_runner.cmake | 2 +- test/tools_common.sh | 2 +- test/twopass_encoder.sh | 2 +- test/visual_metrics.py | 2 +- tools/aggregate_entropy_stats.py | 2 +- tools/auto_refactor/auto_refactor.py | 2 +- tools/auto_refactor/av1_preprocess.py | 2 +- tools/auto_refactor/test_auto_refactor.py | 2 +- tools/diff.py | 2 +- tools/gen_constrained_tokenset.py | 2 +- tools/intersect-diffs.py | 2 +- tools/lint-hunks.py | 2 +- tools/ratectrl_log_analyzer/analyze_ratectrl_log.py | 2 +- tools/wrap-commit-msg.py | 2 +- 113 files changed, 117 insertions(+), 117 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 328b7242ce..47f2a8ccf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index 750df42641..85947c5a50 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index b75bdc5a19..ed81c58bab 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -1,5 +1,5 @@ ## -## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## Copyright (c) 2017, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm index f84f8fa1f7..82ce640f2b 100644 --- a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm +++ b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm index db4cad9bcb..4b3a33c833 100644 --- a/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm +++ b/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm index e5fafb0302..4e76160cd0 100644 --- a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm +++ b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm index 253bc26d38..fd4dad7616 100644 --- a/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm +++ b/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm index 06879040b0..00c0decaae 100644 --- a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_intrapred_asm_sse2.asm b/aom_dsp/x86/highbd_intrapred_asm_sse2.asm index 91b3d126ca..2a56be63e0 100644 --- a/aom_dsp/x86/highbd_intrapred_asm_sse2.asm +++ b/aom_dsp/x86/highbd_intrapred_asm_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_sad4d_sse2.asm b/aom_dsp/x86/highbd_sad4d_sse2.asm index 03839b493c..d9d25bb165 100644 --- a/aom_dsp/x86/highbd_sad4d_sse2.asm +++ b/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_sad_sse2.asm b/aom_dsp/x86/highbd_sad_sse2.asm index 3dc4e4e0a2..7aa44ea828 100644 --- a/aom_dsp/x86/highbd_sad_sse2.asm +++ b/aom_dsp/x86/highbd_sad_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm index c0ccc182b4..c9ebc902e1 100644 --- a/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/highbd_variance_impl_sse2.asm b/aom_dsp/x86/highbd_variance_impl_sse2.asm index ec6c7e9fa7..7fc811b912 100644 --- a/aom_dsp/x86/highbd_variance_impl_sse2.asm +++ b/aom_dsp/x86/highbd_variance_impl_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/intrapred_asm_sse2.asm b/aom_dsp/x86/intrapred_asm_sse2.asm index 0eb632326b..d6c3349b2c 100644 --- a/aom_dsp/x86/intrapred_asm_sse2.asm +++ b/aom_dsp/x86/intrapred_asm_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/inv_wht_sse2.asm b/aom_dsp/x86/inv_wht_sse2.asm index 0bc841a7a4..7ad3fbf3f5 100644 --- a/aom_dsp/x86/inv_wht_sse2.asm +++ b/aom_dsp/x86/inv_wht_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/quantize_ssse3_x86_64.asm b/aom_dsp/x86/quantize_ssse3_x86_64.asm index fa616a6f1a..f5998e7e01 100644 --- a/aom_dsp/x86/quantize_ssse3_x86_64.asm +++ b/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/sad4d_sse2.asm b/aom_dsp/x86/sad4d_sse2.asm index 6edad99516..33842150ee 100644 --- a/aom_dsp/x86/sad4d_sse2.asm +++ b/aom_dsp/x86/sad4d_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/sad_sse2.asm b/aom_dsp/x86/sad_sse2.asm index dbe8ca3161..0209add0e3 100644 --- a/aom_dsp/x86/sad_sse2.asm +++ b/aom_dsp/x86/sad_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/ssim_sse2_x86_64.asm b/aom_dsp/x86/ssim_sse2_x86_64.asm index 49bc655336..f95e834353 100644 --- a/aom_dsp/x86/ssim_sse2_x86_64.asm +++ b/aom_dsp/x86/ssim_sse2_x86_64.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/subpel_variance_ssse3.asm b/aom_dsp/x86/subpel_variance_ssse3.asm index f424ce01dd..ffe718f99c 100644 --- a/aom_dsp/x86/subpel_variance_ssse3.asm +++ b/aom_dsp/x86/subpel_variance_ssse3.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_dsp/x86/subtract_sse2.asm b/aom_dsp/x86/subtract_sse2.asm index fd508c0916..8502de15c3 100644 --- a/aom_dsp/x86/subtract_sse2.asm +++ b/aom_dsp/x86/subtract_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_mem/aom_mem.cmake b/aom_mem/aom_mem.cmake index 346588d2db..6a44958dec 100644 --- a/aom_mem/aom_mem.cmake +++ b/aom_mem/aom_mem.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/aom_ports/aom_ports.cmake b/aom_ports/aom_ports.cmake index 6df2bf020b..1746efa3cc 100644 --- a/aom_ports/aom_ports.cmake +++ b/aom_ports/aom_ports.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/aom_ports/float.asm b/aom_ports/float.asm index abff60a7a4..92172c52c9 100644 --- a/aom_ports/float.asm +++ b/aom_ports/float.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_ports/x86_abi_support.asm b/aom_ports/x86_abi_support.asm index f1a65f53e5..a65e760a80 100644 --- a/aom_ports/x86_abi_support.asm +++ b/aom_ports/x86_abi_support.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_scale/aom_scale.cmake b/aom_scale/aom_scale.cmake index ea94dbc063..3fe7fb752a 100644 --- a/aom_scale/aom_scale.cmake +++ b/aom_scale/aom_scale.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. 
# # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/aom_scale/aom_scale_rtcd.pl b/aom_scale/aom_scale_rtcd.pl index 0d545c2f3c..2988383c8d 100644 --- a/aom_scale/aom_scale_rtcd.pl +++ b/aom_scale/aom_scale_rtcd.pl @@ -1,5 +1,5 @@ ## -## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## Copyright (c) 2017, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/aom_util/aom_util.cmake b/aom_util/aom_util.cmake index d3da550485..89abf9de36 100644 --- a/aom_util/aom_util.cmake +++ b/aom_util/aom_util.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/av1/av1.cmake b/av1/av1.cmake index 99ce3fb68b..4ca5007c12 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 4700098fe1..b982d86bcc 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -1,5 +1,5 @@ ## -## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## Copyright (c) 2017, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/common/reconinter_template.inc b/av1/common/reconinter_template.inc index 863c13c112..2a6161a366 100644 --- a/av1/common/reconinter_template.inc +++ b/av1/common/reconinter_template.inc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm index ad4ae274e2..5fda639ef7 100644 --- a/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm +++ b/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/av1/encoder/x86/av1_ssim_opt_x86_64.asm index 618758105a..357da56c97 100644 --- a/av1/encoder/x86/av1_ssim_opt_x86_64.asm +++ b/av1/encoder/x86/av1_ssim_opt_x86_64.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/dct_sse2.asm b/av1/encoder/x86/dct_sse2.asm index b185548184..60504e4ed8 100644 --- a/av1/encoder/x86/dct_sse2.asm +++ b/av1/encoder/x86/dct_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/av1/encoder/x86/error_sse2.asm b/av1/encoder/x86/error_sse2.asm index 6407c106ab..d52c437a1a 100644 --- a/av1/encoder/x86/error_sse2.asm +++ b/av1/encoder/x86/error_sse2.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/build/cmake/aom_config.c.template b/build/cmake/aom_config.c.template index 93a6d8f1ad..1fad131b67 100644 --- a/build/cmake/aom_config.c.template +++ b/build/cmake/aom_config.c.template @@ -1,5 +1,5 @@ /* - * Copyright (c) @year@, Alliance for Open Media. All rights reserved + * Copyright (c) @year@, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index 980dfb9327..cb478d0b83 100644 --- a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake index ac3e1325b3..20859469c7 100644 --- a/build/cmake/aom_configure.cmake +++ b/build/cmake/aom_configure.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/aom_experiment_deps.cmake b/build/cmake/aom_experiment_deps.cmake index 3bbeb0c874..cc29c09c54 100644 --- a/build/cmake/aom_experiment_deps.cmake +++ b/build/cmake/aom_experiment_deps.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/aom_install.cmake b/build/cmake/aom_install.cmake index 2c263e96b9..d16df9766e 100644 --- a/build/cmake/aom_install.cmake +++ b/build/cmake/aom_install.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2018, Alliance for Open Media. All rights reserved +# Copyright (c) 2018, Alliance for Open Media. 
All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/aom_optimization.cmake b/build/cmake/aom_optimization.cmake index 0f93228eef..9cc34de362 100644 --- a/build/cmake/aom_optimization.cmake +++ b/build/cmake/aom_optimization.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/compiler_flags.cmake b/build/cmake/compiler_flags.cmake index 3afcd50b5c..dc3902a6da 100644 --- a/build/cmake/compiler_flags.cmake +++ b/build/cmake/compiler_flags.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/compiler_tests.cmake b/build/cmake/compiler_tests.cmake index 0402832253..76a2445aba 100644 --- a/build/cmake/compiler_tests.cmake +++ b/build/cmake/compiler_tests.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake index 6e6fdb8e24..acebe202fd 100644 --- a/build/cmake/cpu.cmake +++ b/build/cmake/cpu.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/dist.cmake b/build/cmake/dist.cmake index 5b9fc95d41..24db93e29d 100644 --- a/build/cmake/dist.cmake +++ b/build/cmake/dist.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/exports.cmake b/build/cmake/exports.cmake index 1cea2b52ab..2ce4601ba4 100644 --- a/build/cmake/exports.cmake +++ b/build/cmake/exports.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/exports_sources.cmake b/build/cmake/exports_sources.cmake index 46bf001d86..2387e341b9 100644 --- a/build/cmake/exports_sources.cmake +++ b/build/cmake/exports_sources.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. 
# # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/generate_aom_config_templates.cmake b/build/cmake/generate_aom_config_templates.cmake index 529daaf02a..c27e2f27aa 100644 --- a/build/cmake/generate_aom_config_templates.cmake +++ b/build/cmake/generate_aom_config_templates.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was @@ -12,7 +12,7 @@ cmake_minimum_required(VERSION 3.5) string(TIMESTAMP year "%Y") set(asm_file_header_block "\; -\; Copyright (c) ${year}, Alliance for Open Media. All rights reserved +\; Copyright (c) ${year}, Alliance for Open Media. All rights reserved. \; \; This source code is subject to the terms of the BSD 2 Clause License and \; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -23,7 +23,7 @@ set(asm_file_header_block "\; \; ") set(h_file_header_block "/* - * Copyright (c) ${year}, Alliance for Open Media. All rights reserved + * Copyright (c) ${year}, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -36,7 +36,7 @@ set(h_file_header_block "/* \#define AOM_CONFIG_H_ ") set(cmake_file_header_block "## -## Copyright (c) ${year}, Alliance for Open Media. All rights reserved +## Copyright (c) ${year}, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/build/cmake/generate_exports.cmake b/build/cmake/generate_exports.cmake index 3a5f67cea6..10a6a8fbe6 100644 --- a/build/cmake/generate_exports.cmake +++ b/build/cmake/generate_exports.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/pkg_config.cmake b/build/cmake/pkg_config.cmake index c4f94808a5..424b91119d 100644 --- a/build/cmake/pkg_config.cmake +++ b/build/cmake/pkg_config.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/rtcd.pl b/build/cmake/rtcd.pl index f4a70842d0..ac960960dc 100755 --- a/build/cmake/rtcd.pl +++ b/build/cmake/rtcd.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl ## -## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## Copyright (c) 2017, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/build/cmake/sanitizers.cmake b/build/cmake/sanitizers.cmake index bcb600ce4c..c1498229bd 100644 --- a/build/cmake/sanitizers.cmake +++ b/build/cmake/sanitizers.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/android.cmake b/build/cmake/toolchains/android.cmake index fb086856a7..67e0dbb046 100644 --- a/build/cmake/toolchains/android.cmake +++ b/build/cmake/toolchains/android.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, Alliance for Open Media. All rights reserved +# Copyright (c) 2019, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/arm-ios-common.cmake b/build/cmake/toolchains/arm-ios-common.cmake index 2c433befd9..f7069bc307 100644 --- a/build/cmake/toolchains/arm-ios-common.cmake +++ b/build/cmake/toolchains/arm-ios-common.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/arm64-ios.cmake b/build/cmake/toolchains/arm64-ios.cmake index 6feb1090f2..fc07640302 100644 --- a/build/cmake/toolchains/arm64-ios.cmake +++ b/build/cmake/toolchains/arm64-ios.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/arm64-linux-clang.cmake b/build/cmake/toolchains/arm64-linux-clang.cmake index b4645cc09e..fa4aa9ddb6 100644 --- a/build/cmake/toolchains/arm64-linux-clang.cmake +++ b/build/cmake/toolchains/arm64-linux-clang.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2023, Alliance for Open Media. All rights reserved +# Copyright (c) 2023, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/arm64-linux-gcc.cmake b/build/cmake/toolchains/arm64-linux-gcc.cmake index 3d0dff0252..9fc17fa53f 100644 --- a/build/cmake/toolchains/arm64-linux-gcc.cmake +++ b/build/cmake/toolchains/arm64-linux-gcc.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/arm64-macos.cmake b/build/cmake/toolchains/arm64-macos.cmake index 99f8d16e16..7d4d277bd4 100644 --- a/build/cmake/toolchains/arm64-macos.cmake +++ b/build/cmake/toolchains/arm64-macos.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, Alliance for Open Media. 
All rights reserved +# Copyright (c) 2022, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/arm64-mingw-gcc.cmake b/build/cmake/toolchains/arm64-mingw-gcc.cmake index 95b26d3ceb..4dd81be62e 100644 --- a/build/cmake/toolchains/arm64-mingw-gcc.cmake +++ b/build/cmake/toolchains/arm64-mingw-gcc.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2018, Alliance for Open Media. All rights reserved +# Copyright (c) 2018, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/armv7-ios.cmake b/build/cmake/toolchains/armv7-ios.cmake index 11f7e160df..9336b74f49 100644 --- a/build/cmake/toolchains/armv7-ios.cmake +++ b/build/cmake/toolchains/armv7-ios.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/armv7-linux-gcc.cmake b/build/cmake/toolchains/armv7-linux-gcc.cmake index aa0550574d..2935d10710 100644 --- a/build/cmake/toolchains/armv7-linux-gcc.cmake +++ b/build/cmake/toolchains/armv7-linux-gcc.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/armv7-mingw-gcc.cmake b/build/cmake/toolchains/armv7-mingw-gcc.cmake index 93f8c065c9..821d4f89fa 100644 --- a/build/cmake/toolchains/armv7-mingw-gcc.cmake +++ b/build/cmake/toolchains/armv7-mingw-gcc.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2018, Alliance for Open Media. All rights reserved +# Copyright (c) 2018, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/armv7s-ios.cmake b/build/cmake/toolchains/armv7s-ios.cmake index faa2933cf0..e443ec3d21 100644 --- a/build/cmake/toolchains/armv7s-ios.cmake +++ b/build/cmake/toolchains/armv7s-ios.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/i686-linux-gcc.cmake b/build/cmake/toolchains/i686-linux-gcc.cmake index c4f6ab9465..7f2fa596f9 100644 --- a/build/cmake/toolchains/i686-linux-gcc.cmake +++ b/build/cmake/toolchains/i686-linux-gcc.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2023, Alliance for Open Media. All rights reserved +# Copyright (c) 2023, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/ios-simulator-common.cmake b/build/cmake/toolchains/ios-simulator-common.cmake index 173c423c3d..1aa76634ed 100644 --- a/build/cmake/toolchains/ios-simulator-common.cmake +++ b/build/cmake/toolchains/ios-simulator-common.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/ppc-linux-gcc.cmake b/build/cmake/toolchains/ppc-linux-gcc.cmake index 3aa265254e..0c80c0afa4 100644 --- a/build/cmake/toolchains/ppc-linux-gcc.cmake +++ b/build/cmake/toolchains/ppc-linux-gcc.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2018, Alliance for Open Media. All rights reserved +# Copyright (c) 2018, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/riscv-linux-gcc.cmake b/build/cmake/toolchains/riscv-linux-gcc.cmake index 4133be68b3..ea4549e4dc 100644 --- a/build/cmake/toolchains/riscv-linux-gcc.cmake +++ b/build/cmake/toolchains/riscv-linux-gcc.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, Alliance for Open Media. All rights reserved +# Copyright (c) 2022, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/x86-ios-simulator.cmake b/build/cmake/toolchains/x86-ios-simulator.cmake index caacb8c38b..78c0e1e49c 100644 --- a/build/cmake/toolchains/x86-ios-simulator.cmake +++ b/build/cmake/toolchains/x86-ios-simulator.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/x86-linux.cmake b/build/cmake/toolchains/x86-linux.cmake index a9c4f8c6b4..4ff0ee2eb6 100644 --- a/build/cmake/toolchains/x86-linux.cmake +++ b/build/cmake/toolchains/x86-linux.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/x86-macos.cmake b/build/cmake/toolchains/x86-macos.cmake index 68e1bb07ff..243ea7b94e 100644 --- a/build/cmake/toolchains/x86-macos.cmake +++ b/build/cmake/toolchains/x86-macos.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/x86-mingw-gcc.cmake b/build/cmake/toolchains/x86-mingw-gcc.cmake index 2208333f37..b6c2af3431 100644 --- a/build/cmake/toolchains/x86-mingw-gcc.cmake +++ b/build/cmake/toolchains/x86-mingw-gcc.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/x86_64-ios-simulator.cmake b/build/cmake/toolchains/x86_64-ios-simulator.cmake index d4b40ed098..ed0ebd12be 100644 --- a/build/cmake/toolchains/x86_64-ios-simulator.cmake +++ b/build/cmake/toolchains/x86_64-ios-simulator.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/x86_64-macos.cmake b/build/cmake/toolchains/x86_64-macos.cmake index 899df6f353..583eddcf53 100644 --- a/build/cmake/toolchains/x86_64-macos.cmake +++ b/build/cmake/toolchains/x86_64-macos.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, Alliance for Open Media. All rights reserved +# Copyright (c) 2022, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/toolchains/x86_64-mingw-gcc.cmake b/build/cmake/toolchains/x86_64-mingw-gcc.cmake index 978146a4f2..d041f4cac0 100644 --- a/build/cmake/toolchains/x86_64-mingw-gcc.cmake +++ b/build/cmake/toolchains/x86_64-mingw-gcc.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/util.cmake b/build/cmake/util.cmake index 31de2e1702..3e87c1b7a8 100644 --- a/build/cmake/util.cmake +++ b/build/cmake/util.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/version.cmake b/build/cmake/version.cmake index f4377a13e1..24fbf9c337 100644 --- a/build/cmake/version.cmake +++ b/build/cmake/version.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/build/cmake/version.pl b/build/cmake/version.pl index 392815f81d..400107ce80 100755 --- a/build/cmake/version.pl +++ b/build/cmake/version.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl ## -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -74,7 +74,7 @@ my $version_packed = "((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_P my $year = (localtime)[5] + 1900; my $lic_block = << "EOF"; /* - * Copyright (c) $year, Alliance for Open Media. All rights reserved + * Copyright (c) $year, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/common/ivf_dec.cmake b/common/ivf_dec.cmake index fedeea7940..0714af6047 100644 --- a/common/ivf_dec.cmake +++ b/common/ivf_dec.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2021, Alliance for Open Media. All rights reserved +# Copyright (c) 2021, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/docs.cmake b/docs.cmake index 0d7b4cfde3..901e8c4a0a 100644 --- a/docs.cmake +++ b/docs.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/examples/build_av1_dec_fuzzer.sh b/examples/build_av1_dec_fuzzer.sh index 40355ea133..d2f52cc1cb 100755 --- a/examples/build_av1_dec_fuzzer.sh +++ b/examples/build_av1_dec_fuzzer.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2019, Alliance for Open Media. All rights reserved +# Copyright (c) 2019, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and # the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/libs.doxy_template b/libs.doxy_template index 01da81ac0c..34216b8535 100644 --- a/libs.doxy_template +++ b/libs.doxy_template @@ -1,4 +1,4 @@ -## Copyright (c) 2020, Alliance for Open Media. All rights reserved +## Copyright (c) 2020, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/aomcx_set_ref.sh b/test/aomcx_set_ref.sh index 237e2f319c..be02e383fe 100755 --- a/test/aomcx_set_ref.sh +++ b/test/aomcx_set_ref.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/aomdec.sh b/test/aomdec.sh index e9738a8e89..203dda22e3 100755 --- a/test/aomdec.sh +++ b/test/aomdec.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/test/aomenc.sh b/test/aomenc.sh index 0bb9fba3b8..0f422d9bb3 100755 --- a/test/aomenc.sh +++ b/test/aomenc.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/av1_c_vs_simd_encode.sh b/test/av1_c_vs_simd_encode.sh index 897ac081c1..a55db5e3a4 100755 --- a/test/av1_c_vs_simd_encode.sh +++ b/test/av1_c_vs_simd_encode.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2023, Alliance for Open Media. All rights reserved +## Copyright (c) 2023, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/best_encode.sh b/test/best_encode.sh index d29fdaed52..5e42544354 100755 --- a/test/best_encode.sh +++ b/test/best_encode.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and # the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh index 214755f216..316d33e4ae 100755 --- a/test/decode_to_md5.sh +++ b/test/decode_to_md5.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/decode_with_drops.sh b/test/decode_with_drops.sh index 1fc13ced35..1aad1c9ad7 100755 --- a/test/decode_with_drops.sh +++ b/test/decode_with_drops.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/dump_obu.sh b/test/dump_obu.sh index 933db64a6a..7ebd086235 100755 --- a/test/dump_obu.sh +++ b/test/dump_obu.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2018, Alliance for Open Media. All rights reserved +## Copyright (c) 2018, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/examples.sh b/test/examples.sh index 3e1612303c..6869d025ca 100755 --- a/test/examples.sh +++ b/test/examples.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/gviz_api.py b/test/gviz_api.py index d3a443dabf..8957b8c850 100755 --- a/test/gviz_api.py +++ b/test/gviz_api.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -# Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and # the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/lightfield_test.sh b/test/lightfield_test.sh index cf1ea73a84..1c42b6bc7a 100755 --- a/test/lightfield_test.sh +++ b/test/lightfield_test.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2018, Alliance for Open Media. All rights reserved +## Copyright (c) 2018, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/set_maps.sh b/test/set_maps.sh index b79357a2b8..18810ec41e 100755 --- a/test/set_maps.sh +++ b/test/set_maps.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/simple_decoder.sh b/test/simple_decoder.sh index 9b1aea1ed5..db23ab82a1 100755 --- a/test/simple_decoder.sh +++ b/test/simple_decoder.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/simple_encoder.sh b/test/simple_encoder.sh index dfb1a1b546..48f0f2fdfb 100755 --- a/test/simple_encoder.sh +++ b/test/simple_encoder.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/svc_encoder_rtc.sh b/test/svc_encoder_rtc.sh index 735166d6f6..d1a76f2e9e 100644 --- a/test/svc_encoder_rtc.sh +++ b/test/svc_encoder_rtc.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2023, Alliance for Open Media. All rights reserved +## Copyright (c) 2023, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/test.cmake b/test/test.cmake index 02e85f82fc..8090ad396b 100644 --- a/test/test.cmake +++ b/test/test.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/test/test_data_download_worker.cmake b/test/test_data_download_worker.cmake index a49038888d..3c644d0c3e 100644 --- a/test/test_data_download_worker.cmake +++ b/test/test_data_download_worker.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License was diff --git a/test/test_data_util.cmake b/test/test_data_util.cmake index 069e1ad526..2a99a56e14 100644 --- a/test/test_data_util.cmake +++ b/test/test_data_util.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/test/test_runner.cmake b/test/test_runner.cmake index f0648d16be..b0fad4e668 100644 --- a/test/test_runner.cmake +++ b/test/test_runner.cmake @@ -1,5 +1,5 @@ # -# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was diff --git a/test/tools_common.sh b/test/tools_common.sh index cb9eba1727..79270ecaa1 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/twopass_encoder.sh b/test/twopass_encoder.sh index 44e7327b8f..4daf15ddab 100755 --- a/test/twopass_encoder.sh +++ b/test/twopass_encoder.sh @@ -1,5 +1,5 @@ #!/bin/sh -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/test/visual_metrics.py b/test/visual_metrics.py index 9055feb334..c57d88ed82 100755 --- a/test/visual_metrics.py +++ b/test/visual_metrics.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and # the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/aggregate_entropy_stats.py b/tools/aggregate_entropy_stats.py index 0311681f2d..3352157759 100644 --- a/tools/aggregate_entropy_stats.py +++ b/tools/aggregate_entropy_stats.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## Copyright (c) 2017, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/auto_refactor/auto_refactor.py b/tools/auto_refactor/auto_refactor.py index dd0d4415f9..3261efc990 100644 --- a/tools/auto_refactor/auto_refactor.py +++ b/tools/auto_refactor/auto_refactor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, Alliance for Open Media. All rights reserved +# Copyright (c) 2021, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and # the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License diff --git a/tools/auto_refactor/av1_preprocess.py b/tools/auto_refactor/av1_preprocess.py index ea76912cf1..5559c1a489 100644 --- a/tools/auto_refactor/av1_preprocess.py +++ b/tools/auto_refactor/av1_preprocess.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, Alliance for Open Media. All rights reserved +# Copyright (c) 2021, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and # the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/auto_refactor/test_auto_refactor.py b/tools/auto_refactor/test_auto_refactor.py index 6b1e269efa..d6ee9ac4b1 100644 --- a/tools/auto_refactor/test_auto_refactor.py +++ b/tools/auto_refactor/test_auto_refactor.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (c) 2021, Alliance for Open Media. All rights reserved +# Copyright (c) 2021, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and # the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/diff.py b/tools/diff.py index 7bb6b7fcb4..5fc9d8d071 100644 --- a/tools/diff.py +++ b/tools/diff.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 ## -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/gen_constrained_tokenset.py b/tools/gen_constrained_tokenset.py index f5b0816dbf..f093b895af 100755 --- a/tools/gen_constrained_tokenset.py +++ b/tools/gen_constrained_tokenset.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 ## -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/intersect-diffs.py b/tools/intersect-diffs.py index 960183675d..5062ca289c 100755 --- a/tools/intersect-diffs.py +++ b/tools/intersect-diffs.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 ## -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/lint-hunks.py b/tools/lint-hunks.py index 8b3af972fc..ac90b7a509 100755 --- a/tools/lint-hunks.py +++ b/tools/lint-hunks.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 ## -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py b/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py index 9afb78cbf5..b8dd910391 100644 --- a/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py +++ b/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 ## -## Copyright (c) 2022, Alliance for Open Media. All rights reserved +## Copyright (c) 2022, Alliance for Open Media. All rights reserved. 
## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License diff --git a/tools/wrap-commit-msg.py b/tools/wrap-commit-msg.py index c51ed093d3..2e7efbe30d 100755 --- a/tools/wrap-commit-msg.py +++ b/tools/wrap-commit-msg.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 ## -## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## Copyright (c) 2016, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -- GitLab From 76b95d70f8e3eaf3324ff5cf6ca5c86ba18dddd8 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 12 Jun 2024 16:57:27 -0700 Subject: [PATCH 221/391] README.md: add example Copyright comment block Bug: aomedia:3525 Change-Id: I0ed14989717545d9b84fe3cd300d7d29bacfbef2 --- README.md | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f81e13e9bd..e6e822b8fb 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ README.md {#LREADME} - [Running tests directly](#running-test_libaom-directly) - [Running tests via CMake](#running-the-tests-via-the-cmake-build) 3. [Coding style](#coding-style) -4. [Submitting patches](#submitting-patches) +4. [License header](#license-header) +5. [Submitting patches](#submitting-patches) - [Login cookie](#login-cookie) - [Contributor agreement](#contributor-agreement) - [Testing your code](#testing-your-code) @@ -39,8 +40,8 @@ README.md {#LREADME} - [Incorporating Reviewer Comments](#incorporating-reviewer-comments) - [Submitting your change](#submitting-your-change) - [Viewing change status](#viewing-the-status-of-uploaded-changes) -5. [Support](#support) -6. [Bug reports](#bug-reports) +6. [Support](#support) +7. [Bug reports](#bug-reports) ## Building the library and applications {#building-the-library-and-applications} @@ -567,6 +568,25 @@ Some Git installations have clang-format integration. Here are some examples: $ git clang-format -f -p ~~~ +## License header {#license-header} + +Use the following comment block in new C/C++ source files, replacing "${year}" +with the current year. The same comment should be added to other file types, +adjusting the comment syntax as necessary. + +``` +/* + * Copyright (c) ${year}, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +``` + ## Submitting patches {#submitting-patches} We manage the submission of patches using the -- GitLab From 817973695d9f4ddb757a21c6a8aebacd7eb57d77 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 13 Jun 2024 11:51:43 -0700 Subject: [PATCH 222/391] rtcd.pl: add license header to generated files Bug: aomedia:3525 Change-Id: I614056558fb5439b448342e0c01e53bd8da85585 --- build/cmake/rtcd.pl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/build/cmake/rtcd.pl b/build/cmake/rtcd.pl index ac960960dc..dc827c2deb 100755 --- a/build/cmake/rtcd.pl +++ b/build/cmake/rtcd.pl @@ -219,7 +219,20 @@ sub filter { # sub common_top() { my $include_guard = uc($opts{sym})."_H_"; + my @time = localtime; + my $year = $time[5] + 1900; print <<EOF; +/* + * Copyright (c) ${year}, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + // This file is generated. Do not edit. #ifndef ${include_guard} #define ${include_guard} -- GitLab From 14190db24892bf457e21b846efb74351a28bbbf1 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 13 Jun 2024 11:55:05 -0700 Subject: [PATCH 223/391] Avoid the notion of layer-specific OBUs The notion of layer-specific OBUs was introduced in a proposed change to the AV1 spec: https://github.com/AOMediaCodec/av1-spec/commit/86fb0ac. Avoid mentioning a term that is not defined in the current version of the AV1 spec. (I also forgot to define the term in commit 782840b.) Remove a confusing assertion in init_large_scale_tile_obu_header() that was added in commit 782840b. I will fix the use of obu_extension=0 (the reason for adding that assertion) in the function in a separate CL. Bug: aomedia:3076, aomedia:3582 Change-Id: Ifbf534e4d2b57c18d46a955dc5b28e3e8e0a3f31 --- av1/av1_cx_iface.c | 1 - av1/encoder/bitstream.c | 43 ++++++++++++++++++++++++----------------- av1/encoder/bitstream.h | 3 +-- av1/encoder/encoder.c | 3 +-- 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 6205257ad8..f6ef0ca4e3 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -3356,7 +3356,6 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, obu_header_size = av1_write_obu_header( &ppi->level_params, &cpi->frame_header_count, OBU_TEMPORAL_DELIMITER, - /*is_layer_specific_obu=*/false, ppi->seq_params.has_nonzero_operating_point_idc, 0, ctx->cx_data); // OBUs are preceded/succeeded by an unsigned leb128 coded integer. 
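// (A leb128 value stores 7 payload bits per byte, with the most-significant
// bit of each byte acting as a continuation flag; the 32-bit sizes the AV1
// spec permits therefore occupy at most 5 bytes.)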
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index 4f5ec12d7d..7c02eeb429 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -3356,10 +3356,8 @@ static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst, uint32_t av1_write_obu_header(AV1LevelParams *const level_params, int *frame_header_count, OBU_TYPE obu_type, - bool is_layer_specific_obu, bool has_nonzero_operating_point_idc, int obu_extension, uint8_t *const dst) { - assert(IMPLIES(!is_layer_specific_obu, obu_extension == 0)); assert(IMPLIES(!has_nonzero_operating_point_idc, obu_extension == 0)); if (level_params->keep_level_stats && @@ -3368,8 +3366,30 @@ uint32_t av1_write_obu_header(AV1LevelParams *const level_params, struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; - int obu_extension_flag = - has_nonzero_operating_point_idc && is_layer_specific_obu; + + // The AV1 spec Version 1.0.0 with Errata 1 has the following requirements on + // the OBU extension header: + // + // 6.4.1. General sequence header OBU semantics: + // It is a requirement of bitstream conformance that if OperatingPointIdc + // is equal to 0, then obu_extension_flag is equal to 0 for all OBUs that + // follow this sequence header until the next sequence header. + // + // 7.5. Ordering of OBUs: + // If a coded video sequence contains at least one enhancement layer (OBUs + // with spatial_id greater than 0 or temporal_id greater than 0) then all + // frame headers and tile group OBUs associated with base (spatial_id + // equals 0 and temporal_id equals 0) and enhancement layer (spatial_id + // greater than 0 or temporal_id greater than 0) data must include the OBU + // extension header. + // + // Set obu_extension_flag to satisfy these requirements. + int obu_extension_flag = 0; + if (has_nonzero_operating_point_idc) { + obu_extension_flag = + (obu_type == OBU_FRAME_HEADER || obu_type == OBU_TILE_GROUP || + obu_type == OBU_FRAME || obu_type == OBU_REDUNDANT_FRAME_HEADER); + } aom_wb_write_literal(&wb, 0, 1); // forbidden bit. aom_wb_write_literal(&wb, (int)obu_type, 4); @@ -3544,14 +3564,9 @@ static uint32_t init_large_scale_tile_obu_header( // For large_scale_tile case, we always have only one tile group, so it can // be written as an OBU_FRAME. const OBU_TYPE obu_type = OBU_FRAME; - // We pass obu_extension=0 to av1_write_obu_header(), so - // has_nonzero_operating_point_idc must be false. - assert(!cpi->common.seq_params->has_nonzero_operating_point_idc); lst_obu->tg_hdr_size = av1_write_obu_header( level_params, &cpi->frame_header_count, obu_type, - /*is_layer_specific_obu=*/true, - cpi->common.seq_params->has_nonzero_operating_point_idc, - /*obu_extension=*/0, *data); + cpi->common.seq_params->has_nonzero_operating_point_idc, 0, *data); *data += lst_obu->tg_hdr_size; const uint32_t frame_header_size = @@ -3749,7 +3764,6 @@ void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd, const OBU_TYPE obu_type = (cpi->num_tg == 1) ? 
OBU_FRAME : OBU_TILE_GROUP; *curr_tg_hdr_size = av1_write_obu_header( &cpi->ppi->level_params, &cpi->frame_header_count, obu_type, - /*is_layer_specific_obu=*/true, cm->seq_params->has_nonzero_operating_point_idc, pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr); pack_bs_params->obu_header_size = *curr_tg_hdr_size; @@ -3852,7 +3866,6 @@ void av1_write_last_tile_info( av1_write_obu_header( &cpi->ppi->level_params, &cpi->frame_header_count, OBU_REDUNDANT_FRAME_HEADER, - /*is_layer_specific_obu=*/true, cpi->common.seq_params->has_nonzero_operating_point_idc, obu_extn_header, &curr_tg_start[fh_info->obu_header_byte_offset]); @@ -4153,12 +4166,8 @@ static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) { (cm->current_frame.frame_type != KEY_FRAME && current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) || current_metadata->insert_flag == AOM_MIF_ANY_FRAME) { - // Whether METADATA_TYPE_ITUT_T35 is layer-specific or not is - // payload-specific. Other metadata types are not layer-specific. - const bool is_layer_specific_obu = false; obu_header_size = av1_write_obu_header( &cpi->ppi->level_params, &cpi->frame_header_count, OBU_METADATA, - is_layer_specific_obu, cm->seq_params->has_nonzero_operating_point_idc, 0, dst); obu_payload_size = av1_write_metadata_obu(current_metadata, dst + obu_header_size); @@ -4209,7 +4218,6 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, cm->current_frame.frame_type == KEY_FRAME) { obu_header_size = av1_write_obu_header( level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, - /*is_layer_specific_obu=*/false, cm->seq_params->has_nonzero_operating_point_idc, 0, data); obu_payload_size = av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size); @@ -4235,7 +4243,6 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, fh_info.frame_header = data; obu_header_size = av1_write_obu_header( level_params, &cpi->frame_header_count, OBU_FRAME_HEADER, - /*is_layer_specific_obu=*/true, cm->seq_params->has_nonzero_operating_point_idc, obu_extension_header, data); obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb, diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h index 232c43040e..a8f3cc541c 100644 --- a/av1/encoder/bitstream.h +++ b/av1/encoder/bitstream.h @@ -91,11 +91,10 @@ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, uint8_t *const dst); // Writes the OBU header byte, and the OBU header extension byte when -// has_nonzero_operating_point_idc is true and the OBU is layer-specific. +// has_nonzero_operating_point_idc is true and the OBU is part of a frame. // Returns number of bytes written to 'dst'. 
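// Header byte layout, most-significant bit first (AV1 spec 5.3.2): forbidden
// bit (1), obu_type (4), obu_extension_flag (1), obu_has_size_field (1),
// reserved (1). The optional extension byte (spec 5.3.3) holds temporal_id
// (3), spatial_id (2) and 3 reserved bits.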
uint32_t av1_write_obu_header(AV1LevelParams *const level_params, int *frame_header_count, OBU_TYPE obu_type, - bool is_layer_specific_obu, bool has_nonzero_operating_point_idc, int obu_extension, uint8_t *const dst); diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 1a132fe88e..fd293b1940 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -558,7 +558,6 @@ void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, int i = 0; assert(seq->operating_points_cnt_minus_1 == (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1)); - seq->has_nonzero_operating_point_idc = true; for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) { for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) { seq->operating_point_idc[i] = @@ -568,6 +567,7 @@ void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, i++; } } + seq->has_nonzero_operating_point_idc = true; } } @@ -5393,7 +5393,6 @@ aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) { if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count, OBU_SEQUENCE_HEADER, - /*is_layer_specific_obu=*/false, ppi->seq_params.has_nonzero_operating_point_idc, 0, &header_buf[0]) != obu_header_size) { return NULL; -- GitLab From 49c02efb61e1aaf2108c704c7f542d0343069f16 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 13 Jun 2024 15:18:39 -0700 Subject: [PATCH 224/391] Pass obu ext to init_large_scale_tile_obu_header Pass obu_extension_header to init_large_scale_tile_obu_header(), so that it can pass the correct obu_extension argument (instead of the placeholder value 0) to av1_write_obu_header(). Change-Id: I1d70b3bafa9264c02d3d8818b6f60d3aae4c639e --- av1/encoder/bitstream.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index 7c02eeb429..a2497571ee 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -3558,7 +3558,7 @@ typedef struct { // Initialize OBU header for large scale tile case. static uint32_t init_large_scale_tile_obu_header( AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb, - LargeTileFrameOBU *lst_obu) { + uint8_t obu_extension_header, LargeTileFrameOBU *lst_obu) { AV1LevelParams *const level_params = &cpi->ppi->level_params; CurrentFrame *const current_frame = &cpi->common.current_frame; // For large_scale_tile case, we always have only one tile group, so it can @@ -3566,7 +3566,8 @@ static uint32_t init_large_scale_tile_obu_header( const OBU_TYPE obu_type = OBU_FRAME; lst_obu->tg_hdr_size = av1_write_obu_header( level_params, &cpi->frame_header_count, obu_type, - cpi->common.seq_params->has_nonzero_operating_point_idc, 0, *data); + cpi->common.seq_params->has_nonzero_operating_point_idc, + obu_extension_header, *data); *data += lst_obu->tg_hdr_size; const uint32_t frame_header_size = @@ -3724,7 +3725,8 @@ static void write_large_scale_tile_obu( // Packs information in the obu header for large scale tiles. 
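// The obu_extension_header argument below is the already-packed extension
// byte (in this encoder, temporal_id << 5 | spatial_id << 3); it is forwarded
// unchanged to av1_write_obu_header().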
static INLINE uint32_t pack_large_scale_tiles_in_tg_obus( AV1_COMP *const cpi, uint8_t *const dst, - struct aom_write_bit_buffer *saved_wb, int *const largest_tile_id) { + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, + int *const largest_tile_id) { AV1_COMMON *const cm = &cpi->common; const CommonTileParams *const tiles = &cm->tiles; uint32_t total_size = 0; @@ -3735,8 +3737,8 @@ static INLINE uint32_t pack_large_scale_tiles_in_tg_obus( LargeTileFrameOBU lst_obu; - total_size += - init_large_scale_tile_obu_header(cpi, &data, saved_wb, &lst_obu); + total_size += init_large_scale_tile_obu_header( + cpi, &data, saved_wb, obu_extension_header, &lst_obu); write_large_scale_tile_obu(cpi, dst, &lst_obu, largest_tile_id, &total_size, have_tiles, &max_tile_size, &max_tile_col_size); @@ -4128,8 +4130,8 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, } if (tiles->large_scale) - return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb, - largest_tile_id); + return pack_large_scale_tiles_in_tg_obus( + cpi, dst, saved_wb, obu_extension_header, largest_tile_id); return pack_tiles_in_tg_obus(cpi, dst, saved_wb, obu_extension_header, fh_info, largest_tile_id);
-- GitLab From 2a16696ea0f1640097d0d27b75322fd39ac808f9 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 4 Jun 2024 14:45:27 -0700 Subject: [PATCH 225/391] rtc: Speedup for dynamic screen content
For rtc screen content mode, speed >= 11: detect whether the content has high motion, based on source_sad and fast/coarse ME analysis in the scene detection. If detected, adjust some speed features and increase the base partition threshold to make the encoder faster. Coarse ME is done on the center superblock, to avoid setting the high_motion flag for scrolling content. This also contains a fix to bsize_select for the fixed partitioning.
This has a small effect on most clips in the rtc_screen set, except for very-high-motion content, where it reduces the instruction count by ~50-60%. The PSNR loss is ~0.4 dB on a high-motion clip.
Change-Id: I68071da2b40731cc81ac15a8010976e6ef234776 --- av1/encoder/aq_cyclicrefresh.c | 3 +- av1/encoder/encodeframe.c | 2 +- av1/encoder/mcomp.c | 12 ++-- av1/encoder/mcomp.h | 3 + av1/encoder/ratectrl.c | 118 +++++++++++++++++++++++++++++++++ av1/encoder/ratectrl.h | 1 + av1/encoder/speed_features.c | 16 +++-- av1/encoder/var_based_part.c | 35 +++------- 8 files changed, 154 insertions(+), 36 deletions(-)
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c index e9fd771071..4d8be31120 100644 --- a/av1/encoder/aq_cyclicrefresh.c +++ b/av1/encoder/aq_cyclicrefresh.c @@ -439,7 +439,8 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { // should we enable cyclic refresh on this frame.
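// high_motion_screen_content (computed during scene detection in ratectrl.c)
// joins the disabling conditions below: when most superblocks change every
// frame, segment-based refresh is presumably of little benefit.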
cr->apply_cyclic_refresh = 1; if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) || - scene_change_detected || svc->temporal_layer_id > 0 || + cpi->rc.high_motion_screen_content || scene_change_detected || + svc->temporal_layer_id > 0 || svc->prev_number_spatial_layers != svc->number_spatial_layers || p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || (svc->number_spatial_layers > 1 && diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index 788ac80826..cac8d81516 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -545,7 +545,7 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size; if (sf->rt_sf.use_fast_fixed_part && x->content_state_sb.source_sad_nonrd < kLowSad) { - bsize_select = BLOCK_64X64; + bsize_select = cm->seq_params->sb_size; } const BLOCK_SIZE bsize = seg_skip ? sb_size : bsize_select; av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c index cf44db760c..94fd17e4fa 100644 --- a/av1/encoder/mcomp.c +++ b/av1/encoder/mcomp.c @@ -1993,8 +1993,8 @@ int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd, return best_hash_cost; } -static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size, - int full_search, int *sad) { +int av1_vector_match(const int16_t *ref, const int16_t *src, int bwl, + int search_size, int full_search, int *sad) { int best_sad = INT_MAX; int this_sad; int d; @@ -2174,11 +2174,11 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, // Find the best match per 1-D search best_int_mv->as_fullmv.col = - vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width, - full_search, &best_sad_col); + av1_vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], + search_size_width, full_search, &best_sad_col); best_int_mv->as_fullmv.row = - vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height, - full_search, &best_sad_row); + av1_vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], + search_size_height, full_search, &best_sad_row); // For screen: select between horiz or vert motion. if (is_screen) { diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h index d6dc8cba29..7dd32e0a71 100644 --- a/av1/encoder/mcomp.h +++ b/av1/encoder/mcomp.h @@ -262,6 +262,9 @@ void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv); int av1_init_search_range(int size); +int av1_vector_match(const int16_t *ref, const int16_t *src, int bwl, + int search_size, int full_search, int *sad); + unsigned int av1_int_pro_motion_estimation( const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, const MV *ref_mv, unsigned int *y_sad_zero, diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index ec9ae10569..a34ce78757 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -33,6 +33,7 @@ #include "av1/encoder/encoder_utils.h" #include "av1/encoder/encode_strategy.h" #include "av1/encoder/gop_structure.h" +#include "av1/encoder/mcomp.h" #include "av1/encoder/random.h" #include "av1/encoder/ratectrl.h" @@ -3017,6 +3018,80 @@ static int set_block_is_active(unsigned char *const active_map_4x4, int mi_cols, return 0; } +// Returns the best sad for column or row motion of the superblock. 
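+// The estimate is projection-based: aom_int_pro_row()/aom_int_pro_col()
+// collapse the 2-D blocks into 1-D sums along each axis, then
+// av1_vector_match() slides the source profile over the reference profile to
+// find the offset with the smallest 1-D SAD per axis. Only the better of the
+// two axes is kept, which is cheap and matches pure horizontal or vertical
+// scrolling.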
+static unsigned int estimate_scroll_motion( + const AV1_COMP *cpi, uint8_t *src_buf, uint8_t *last_src_buf, + int src_stride, int ref_stride, BLOCK_SIZE bsize, int pos_col, int pos_row, + int *best_intmv_col, int *best_intmv_row) { + const AV1_COMMON *const cm = &cpi->common; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int full_search = 1; + // Keep border a multiple of 16. + const int border = (cpi->oxcf.border_in_pixels >> 4) << 4; + // Make search_size_height larger to capture more common vertical scroll. + // Increase the search if last two frames were dropped. + // Values set based on screen test set. + int search_size_width = 96; + int search_size_height = cpi->rc.drop_count_consec > 1 ? 224 : 192; + // Adjust based on boundary. + if ((pos_col - search_size_width < -border) || + (pos_col + search_size_width > cm->width + border)) + search_size_width = border; + if ((pos_row - search_size_height < -border) || + (pos_row + search_size_height > cm->height + border)) + search_size_height = border; + const uint8_t *ref_buf; + const int row_norm_factor = mi_size_high_log2[bsize] + 1; + const int col_norm_factor = 3 + (bw >> 5); + const int ref_buf_width = (search_size_width << 1) + bw; + const int ref_buf_height = (search_size_height << 1) + bh; + int16_t *hbuf = (int16_t *)aom_malloc(ref_buf_width * sizeof(*hbuf)); + int16_t *vbuf = (int16_t *)aom_malloc(ref_buf_height * sizeof(*vbuf)); + int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf)); + int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf)); + if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) { + aom_free(hbuf); + aom_free(vbuf); + aom_free(src_hbuf); + aom_free(src_vbuf); + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf"); + } + // Set up prediction 1-D reference set for rows. + ref_buf = last_src_buf - search_size_width; + aom_int_pro_row(hbuf, ref_buf, ref_stride, ref_buf_width, bh, + row_norm_factor); + // Set up prediction 1-D reference set for cols + ref_buf = last_src_buf - search_size_height * ref_stride; + aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, ref_buf_height, + col_norm_factor); + // Set up src 1-D reference set + aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor); + aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor); + unsigned int best_sad; + int best_sad_col, best_sad_row; + // Find the best match per 1-D search + *best_intmv_col = + av1_vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], + search_size_width, full_search, &best_sad_col); + *best_intmv_row = + av1_vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], + search_size_height, full_search, &best_sad_row); + if (best_sad_col < best_sad_row) { + *best_intmv_row = 0; + best_sad = best_sad_col; + } else { + *best_intmv_col = 0; + best_sad = best_sad_row; + } + aom_free(hbuf); + aom_free(vbuf); + aom_free(src_hbuf); + aom_free(src_vbuf); + return best_sad; +} + /*!\brief Check for scene detection, for 1 pass real-time mode. * * Compute average source sad (temporal sad: between current source and @@ -3184,6 +3259,49 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, if (num_samples > 0) rc->percent_blocks_with_motion = ((num_samples - num_zero_temp_sad) * 100) / num_samples; + // Update the high_motion_screen_content flag on TL0. Avoid the update + // if too many consecutive frame drops occurred. 
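+  // Note: 9 * 64 * 64 corresponds to an average source SAD of ~9 per pixel
+  // over a 64x64 block.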
+  const uint64_t thresh_high_motion = 9 * 64 * 64;
+  if (cpi->svc.temporal_layer_id == 0 && rc->drop_count_consec < 3) {
+    cpi->rc.high_motion_screen_content = 0;
+    if (cpi->oxcf.speed >= 11 &&
+        cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+        rc->percent_blocks_with_motion > 40 &&
+        rc->prev_avg_source_sad > thresh_high_motion &&
+        rc->avg_source_sad > thresh_high_motion &&
+        rc->avg_frame_low_motion < 60 && unscaled_src->y_width >= 1280 &&
+        unscaled_src->y_height >= 720) {
+      cpi->rc.high_motion_screen_content = 1;
+      // Compute fast coarse/global motion for the 128x128 superblock centered
+      // at the middle of the frame, to determine if the motion is scroll.
+      int pos_col = (unscaled_src->y_width >> 1) - 64;
+      int pos_row = (unscaled_src->y_height >> 1) - 64;
+      src_y = unscaled_src->y_buffer + pos_row * src_ystride + pos_col;
+      last_src_y =
+          unscaled_last_src->y_buffer + pos_row * last_src_ystride + pos_col;
+      int best_intmv_col = 0;
+      int best_intmv_row = 0;
+      unsigned int y_sad = estimate_scroll_motion(
+          cpi, src_y, last_src_y, src_ystride, last_src_ystride, BLOCK_128X128,
+          pos_col, pos_row, &best_intmv_col, &best_intmv_row);
+      if (y_sad < 100 &&
+          (abs(best_intmv_col) > 16 || abs(best_intmv_row) > 16))
+        cpi->rc.high_motion_screen_content = 0;
+    }
+    // Pass the flag value to all layer frames.
+    if (cpi->svc.number_spatial_layers > 1 ||
+        cpi->svc.number_temporal_layers > 1) {
+      SVC *svc = &cpi->svc;
+      for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+        for (int tl = 1; tl < svc->number_temporal_layers; ++tl) {
+          const int layer =
+              LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+          LAYER_CONTEXT *lc = &svc->layer_context[layer];
+          RATE_CONTROL *lrc = &lc->rc;
+          lrc->high_motion_screen_content = rc->high_motion_screen_content;
+        }
+      }
+    }
+  }
   // Scene detection is only on base SL0, and using full/original resolution.
   // Pass the state to the upper spatial layers.
if (cpi->svc.number_spatial_layers > 1) { diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h index 5fcb65e071..0a5cfbc178 100644 --- a/av1/encoder/ratectrl.h +++ b/av1/encoder/ratectrl.h @@ -190,6 +190,7 @@ typedef struct { int sframe_due; int high_source_sad; + int high_motion_screen_content; uint64_t avg_source_sad; uint64_t prev_avg_source_sad; uint64_t frame_source_sad; diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index 31fe03aebe..4b6ea16d7f 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -1608,10 +1608,18 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; } } - if (cpi->rc.max_block_source_sad > 20000 && - cpi->rc.frame_source_sad > 100 && speed >= 6 && - (cpi->rc.percent_blocks_with_motion > 1 || - cpi->svc.last_layer_dropped[0])) { + if (speed >= 11 && cpi->rc.high_motion_screen_content) { + sf->rt_sf.higher_thresh_scene_detection = 1; + sf->rt_sf.force_only_last_ref = 1; + sf->rt_sf.use_nonrd_filter_search = 0; + sf->part_sf.fixed_partition_size = BLOCK_32X32; + sf->rt_sf.use_fast_fixed_part = 1; + sf->rt_sf.increase_source_sad_thresh = 1; + sf->rt_sf.selective_cdf_update = 1; + } else if (cpi->rc.max_block_source_sad > 20000 && + cpi->rc.frame_source_sad > 100 && speed >= 6 && + (cpi->rc.percent_blocks_with_motion > 1 || + cpi->svc.last_layer_dropped[0])) { sf->mv_sf.search_method = NSTEP; sf->rt_sf.fullpel_search_step_param = 2; } diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c index e5908f41c2..0b449e89d5 100644 --- a/av1/encoder/var_based_part.c +++ b/av1/encoder/var_based_part.c @@ -627,14 +627,12 @@ static AOM_INLINE void tune_thresh_based_on_resolution( } } -// Increase partition thresholds for noisy content. Apply it only for -// superblocks where sumdiff is low, as we assume the sumdiff of superblock -// whose only change is due to noise will be low (i.e, noise will average -// out over large block). -static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi, - int64_t threshold_base, - int content_lowsumdiff, - int num_pixels) { +// Increase the base partition threshold, based on content and noise level. +static AOM_INLINE int64_t tune_base_thresh_content(AV1_COMP *cpi, + int64_t threshold_base, + int content_lowsumdiff, + int source_sad_nonrd, + int num_pixels) { AV1_COMMON *const cm = &cpi->common; int64_t updated_thresh_base = threshold_base; if (cpi->noise_estimate.enabled && content_lowsumdiff && @@ -647,23 +645,12 @@ static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi, !cpi->sf.rt_sf.prefer_large_partition_blocks) updated_thresh_base = (5 * updated_thresh_base) >> 2; } - // TODO(kyslov) Enable var based partition adjusment on temporal denoising -#if 0 // CONFIG_AV1_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && - cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow) - updated_thresh_base = - av1_scale_part_thresh(updated_thresh_base, cpi->denoiser.denoising_level, - content_state, cpi->svc.temporal_layer_id); - else - threshold_base = - scale_part_thresh_content(updated_thresh_base, cpi->oxcf.speed, cm->width, - cm->height, cpi->ppi->rtc_ref.non_reference_frame); -#else - // Increase base variance threshold based on content_state/sum_diff level. 
updated_thresh_base = scale_part_thresh_content( updated_thresh_base, cpi->oxcf.speed, cm->width, cm->height, cpi->ppi->rtc_ref.non_reference_frame); -#endif + if (cpi->oxcf.speed >= 11 && source_sad_nonrd > kLowSad && + cpi->rc.high_motion_screen_content) + updated_thresh_base = updated_thresh_base << 5; return updated_thresh_base; } @@ -686,8 +673,8 @@ static AOM_INLINE void set_vbp_thresholds( return; } - threshold_base = tune_thresh_noisy_content(cpi, threshold_base, - content_lowsumdiff, num_pixels); + threshold_base = tune_base_thresh_content( + cpi, threshold_base, content_lowsumdiff, source_sad_nonrd, num_pixels); thresholds[0] = threshold_base >> 1; thresholds[1] = threshold_base; thresholds[3] = threshold_base << threshold_left_shift; -- GitLab From 0e4de1bf730d63ac4a563afbe8272f3241528dc7 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 14 Jun 2024 13:24:49 -0700 Subject: [PATCH 226/391] av1_quantize: use optimized aom_quantize_b_helper() A NEON version was added in: 5951ee34e9 Add NEON version of few quantize functions It is used indirectly through aom_quantize_b_32x32_neon() and aom_quantize_b_64x64_neon() with NULL qm_ptr/iqm_ptr values. Change-Id: Ie35e61a0ba8cd25460a834f5624de9a97398030b --- av1/encoder/av1_quantize.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c index 382d07c5b0..de17a28323 100644 --- a/av1/encoder/av1_quantize.c +++ b/av1/encoder/av1_quantize.c @@ -343,10 +343,10 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif // !CONFIG_REALTIME_ONLY if (qm_ptr != NULL && iqm_ptr != NULL) { - aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, - dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, - sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + aom_quantize_b_helper(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: -- GitLab From e5a842a27b171f590749debcfdbf60a3b00ee790 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 14 Jun 2024 19:42:32 -0700 Subject: [PATCH 227/391] sad{,_av1}.c: exclude 1:4/4:1 fns w/CONFIG_REALTIME_ONLY This quiets some -Wmissing-prototypes warnings. 
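For context, a minimal sketch of how this warning arises (illustrative
names, not from this change): a function with external linkage that is
defined without a prior declaration in scope triggers it, while static
functions do not.

  /* compiled with -Wmissing-prototypes */
  int add_one(int x) { return x + 1; }       /* warning: no previous prototype */
  static int twice(int x) { return 2 * x; }  /* internal linkage: no warning */

In realtime-only builds the generated RTCD headers do not declare the
1:4/4:1 block-size functions, so guarding their definitions with
#if !CONFIG_REALTIME_ONLY compiles them out and the warnings disappear.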
Bug: aomedia:3416 Change-Id: Ibbfa872fb39104c4fdd9bdec570401925201f54e --- aom_dsp/sad.c | 2 +- aom_dsp/sad_av1.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c index 0e9d900e5e..72ed758370 100644 --- a/aom_dsp/sad.c +++ b/aom_dsp/sad.c @@ -365,6 +365,7 @@ HIGHBD_SADMXN(4, 4) HIGHBD_SAD_MXNX4D(4, 4) HIGHBD_SAD_MXNX3D(4, 4) +#if !CONFIG_REALTIME_ONLY HIGHBD_SADMXN(4, 16) HIGHBD_SAD_MXNX4D(4, 16) HIGHBD_SADMXN(16, 4) @@ -378,7 +379,6 @@ HIGHBD_SAD_MXNX4D(16, 64) HIGHBD_SADMXN(64, 16) HIGHBD_SAD_MXNX4D(64, 16) -#if !CONFIG_REALTIME_ONLY HIGHBD_SAD_MXNX3D(4, 16) HIGHBD_SAD_MXNX3D(16, 4) HIGHBD_SAD_MXNX3D(8, 32) diff --git a/aom_dsp/sad_av1.c b/aom_dsp/sad_av1.c index 43035d79bc..80d7c3fee4 100644 --- a/aom_dsp/sad_av1.c +++ b/aom_dsp/sad_av1.c @@ -83,12 +83,14 @@ MASKSADMxN(8, 8) MASKSADMxN(8, 4) MASKSADMxN(4, 8) MASKSADMxN(4, 4) +#if !CONFIG_REALTIME_ONLY MASKSADMxN(4, 16) MASKSADMxN(16, 4) MASKSADMxN(8, 32) MASKSADMxN(32, 8) MASKSADMxN(16, 64) MASKSADMxN(64, 16) +#endif // !CONFIG_REALTIME_ONLY /* clang-format on */ #if CONFIG_AV1_HIGHBITDEPTH @@ -148,12 +150,14 @@ HIGHBD_MASKSADMXN(8, 8) HIGHBD_MASKSADMXN(8, 4) HIGHBD_MASKSADMXN(4, 8) HIGHBD_MASKSADMXN(4, 4) +#if !CONFIG_REALTIME_ONLY HIGHBD_MASKSADMXN(4, 16) HIGHBD_MASKSADMXN(16, 4) HIGHBD_MASKSADMXN(8, 32) HIGHBD_MASKSADMXN(32, 8) HIGHBD_MASKSADMXN(16, 64) HIGHBD_MASKSADMXN(64, 16) +#endif // !CONFIG_REALTIME_ONLY #endif // CONFIG_AV1_HIGHBITDEPTH #if !CONFIG_REALTIME_ONLY -- GitLab From dccfa9a47f9ade750adf3b5e6274074fec6f415e Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 14 Jun 2024 19:43:52 -0700 Subject: [PATCH 228/391] av1_fwd_txfm2d.c: exclude 1:4/4:1 fns w/CONFIG_REALTIME_ONLY Except av1_fwd_txfm2d_16x4_c which is used by superres_scale.c. 
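A sketch of the resulting layout (abridged from the diff below):

  #if !CONFIG_REALTIME_ONLY
  void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
                             TX_TYPE tx_type, int bd) { /* ... */ }
  #endif  // !CONFIG_REALTIME_ONLY

  // Left unguarded: superres_scale.c calls it in realtime-only builds too.
  void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
                             TX_TYPE tx_type, int bd) { /* ... */ }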
Bug: aomedia:3416
Change-Id: Ifd1c5081042998885c3c3561c969c1f72fd3a26a
---
 av1/encoder/av1_fwd_txfm2d.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/av1/encoder/av1_fwd_txfm2d.c b/av1/encoder/av1_fwd_txfm2d.c
index 8012d771ca..625b6298e9 100644
--- a/av1/encoder/av1_fwd_txfm2d.c
+++ b/av1/encoder/av1_fwd_txfm2d.c
@@ -174,6 +174,7 @@ void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
   fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
 }

+#if !CONFIG_REALTIME_ONLY
 void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
                            TX_TYPE tx_type, int bd) {
   DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);
@@ -181,6 +182,7 @@ void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
   av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg);
   fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
 }
+#endif  // !CONFIG_REALTIME_ONLY

 void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
                            TX_TYPE tx_type, int bd) {
@@ -190,6 +192,7 @@ void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
   fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
 }

+#if !CONFIG_REALTIME_ONLY
 void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
                            TX_TYPE tx_type, int bd) {
   DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);
@@ -205,6 +208,7 @@ void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride,
   av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg);
   fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
 }
+#endif  // !CONFIG_REALTIME_ONLY

 void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
                           TX_TYPE tx_type, int bd) {
@@ -284,6 +288,7 @@ void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
   // Note: no repacking needed here.
 }

+#if !CONFIG_REALTIME_ONLY
 void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
                             TX_TYPE tx_type, int bd) {
   DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
@@ -310,6 +315,7 @@ void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride,
   memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
   // Note: no repacking needed here.
 }
+#endif  // !CONFIG_REALTIME_ONLY

 static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 };
 static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 };
-- 
GitLab


From 20465886b01a04ddc5ac7fed4ac690b125c7838e Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Wed, 12 Jun 2024 15:18:16 -0700
Subject: [PATCH 229/391] rtc: Add logic for setting sb_size for MT

This was removed here, as a fix to a PSNR regression:
https://aomedia-review.googlesource.com/c/aom/+/190624

But the underlying issue was resolved in:
https://aomedia-review.googlesource.com/c/aom/+/191005

So this adds the logic back in (with small tuning); we measured ~10%
speedup for 4 and 8 threads across some ~1080p screen clips.

Change-Id: I4e4ca9342544bfcc27635a0ac9569a26e81e9ff8
---
 av1/encoder/encoder_utils.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index 2a549f8944..3419c09ca2 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -837,7 +837,21 @@ BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
                ? BLOCK_128X128
                : BLOCK_64X64;
   } else if (oxcf->mode == REALTIME) {
-    return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
+    if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) {
+      const TileConfig *const tile_cfg = &oxcf->tile_cfg;
+      const int num_tiles =
+          (1 << tile_cfg->tile_columns) * (1 << tile_cfg->tile_rows);
+      // For multi-thread encode: if the number of (128x128) superblocks
+      // per tile is low, use a 64X64 superblock.
+      if (oxcf->row_mt == 1 && oxcf->max_threads >= 4 &&
+          oxcf->max_threads >= num_tiles && AOMMIN(width, height) >= 720 &&
+          (width * height) / (128 * 128 * num_tiles) < 40)
+        return BLOCK_64X64;
+      else
+        return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64;
+    } else {
+      return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
+    }
   }

   // TODO(any): Possibly could improve this with a heuristic.
-- 
GitLab


From 9db8f4c5c011199a7a0c2c01484c0c86e8c7363e Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 14 Jun 2024 16:26:13 -0700
Subject: [PATCH 230/391] rename simd_cmp_impl.h to simd_cmp_impl.inc

Given how this file is used (no include guard, with macro-based
definitions / implementation), it's more correct for this to be a .inc
file:
https://google.github.io/styleguide/cppguide.html#Self_contained_Headers

+ move some functions into an anonymous namespace

This fixes some -Wmissing-prototypes warnings.

Bug: aomedia:3416
Change-Id: I0a5781338d63c62efc437d1676cb7a54aeed1238
---
 test/simd_cmp_avx2.cc                       | 2 +-
 test/{simd_cmp_impl.h => simd_cmp_impl.inc} | 4 ++--
 test/simd_cmp_sse2.cc                       | 2 +-
 test/simd_cmp_sse4.cc                       | 2 +-
 test/simd_cmp_ssse3.cc                      | 2 +-
 test/test.cmake                             | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)
 rename test/{simd_cmp_impl.h => simd_cmp_impl.inc} (100%)

diff --git a/test/simd_cmp_avx2.cc b/test/simd_cmp_avx2.cc
index 53aa48c15e..fd9836023d 100644
--- a/test/simd_cmp_avx2.cc
+++ b/test/simd_cmp_avx2.cc
@@ -12,4 +12,4 @@
 #define ARCH AVX2
 #define ARCH_POSTFIX(name) name##_avx2
 #define SIMD_NAMESPACE simd_test_avx2
-#include "test/simd_cmp_impl.h"
+#include "test/simd_cmp_impl.inc"
diff --git a/test/simd_cmp_impl.h b/test/simd_cmp_impl.inc
similarity index 100%
rename from test/simd_cmp_impl.h
rename to test/simd_cmp_impl.inc
index 4bdf4975d3..0a9a195163 100644
--- a/test/simd_cmp_impl.h
+++ b/test/simd_cmp_impl.inc
@@ -398,6 +398,8 @@ c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
   return c_v256_align(a, b, shift);
 }

+namespace {
+
 // Wrappers around the SAD and SSD functions
 uint32_t v64_sad_u8(v64 a, v64 b) {
   return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b));
 }
@@ -461,8 +463,6 @@ uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
   return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
 }

-namespace {
-
 typedef void (*fptr)();

 typedef struct {
diff --git a/test/simd_cmp_sse2.cc b/test/simd_cmp_sse2.cc
index a5676d5e93..b73d712ab8 100644
--- a/test/simd_cmp_sse2.cc
+++ b/test/simd_cmp_sse2.cc
@@ -14,5 +14,5 @@
 #define ARCH SSE2
 #define ARCH_POSTFIX(name) name##_sse2
 #define SIMD_NAMESPACE simd_test_sse2
-#include "test/simd_cmp_impl.h"
+#include "test/simd_cmp_impl.inc"
 #endif
diff --git a/test/simd_cmp_sse4.cc b/test/simd_cmp_sse4.cc
index f66aa11646..9b413f36d2 100644
--- a/test/simd_cmp_sse4.cc
+++ b/test/simd_cmp_sse4.cc
@@ -14,5 +14,5 @@
 #define ARCH SSE4_1
 #define ARCH_POSTFIX(name) name##_sse4_1
 #define SIMD_NAMESPACE simd_test_sse4_1
-#include "test/simd_cmp_impl.h"
+#include "test/simd_cmp_impl.inc"
 #endif
diff --git a/test/simd_cmp_ssse3.cc b/test/simd_cmp_ssse3.cc
index 3775fdcc0b..c5ee015b6e 100644
--- a/test/simd_cmp_ssse3.cc
+++ b/test/simd_cmp_ssse3.cc
@@ -14,5 +14,5 @@
#define ARCH SSSE3 #define ARCH_POSTFIX(name) name##_ssse3 #define SIMD_NAMESPACE simd_test_ssse3 -#include "test/simd_cmp_impl.h" +#include "test/simd_cmp_impl.inc" #endif diff --git a/test/test.cmake b/test/test.cmake index 8090ad396b..4a6d9b2ae8 100644 --- a/test/test.cmake +++ b/test/test.cmake @@ -153,7 +153,7 @@ if(NOT BUILD_SHARED_LIBS) "${AOM_ROOT}/test/lpf_test.cc" "${AOM_ROOT}/test/scan_test.cc" "${AOM_ROOT}/test/selfguided_filter_test.cc" - "${AOM_ROOT}/test/simd_cmp_impl.h" + "${AOM_ROOT}/test/simd_cmp_impl.inc" "${AOM_ROOT}/test/simd_impl.h") if(HAVE_SSE2) -- GitLab From bf6cdb1eac9624e11361541b3beefa01b01da99f Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 18 Jun 2024 16:55:19 +0000 Subject: [PATCH 231/391] Revert "av1_quantize: use optimized aom_quantize_b_helper()" This reverts commit 0e4de1bf730d63ac4a563afbe8272f3241528dc7. Reason for revert: This causes mismatches in AV1/QMTest.TestNoMisMatchQM* Change-Id: I7d5c26ce8537c7c1e6ec1a5a06bccf4dcf4ed202 --- av1/encoder/av1_quantize.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c index de17a28323..382d07c5b0 100644 --- a/av1/encoder/av1_quantize.c +++ b/av1/encoder/av1_quantize.c @@ -343,10 +343,10 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif // !CONFIG_REALTIME_ONLY if (qm_ptr != NULL && iqm_ptr != NULL) { - aom_quantize_b_helper(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, - dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, - sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: -- GitLab From 3b9909b70b2a676d85c88df939642742733e5195 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 14 Jun 2024 15:50:05 -0700 Subject: [PATCH 232/391] move comp_avg_pred_test.h contents to cc This header isn't used elsewhere. This allows the classes and functions to be moved into an anonymous namespace fixing some -Wmissing-prototypes warnings. Bug: aomedia:3416 Change-Id: I95bef2a3a2ea2eaa10473527265214cb17c3dd94 --- test/comp_avg_pred_test.cc | 804 +++++++++++++++++++++++++++++++++++-- test/comp_avg_pred_test.h | 757 ---------------------------------- test/test.cmake | 1 - 3 files changed, 770 insertions(+), 792 deletions(-) delete mode 100644 test/comp_avg_pred_test.h diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index 94a888713a..cdeeea3266 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -9,26 +9,517 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "test/comp_avg_pred_test.h" +#include <tuple> + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/util.h" +#include "test/register_state_check.h" +#include "av1/common/common_data.h" +#include "aom_ports/aom_timer.h" using libaom_test::ACMRandom; -using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest; -using libaom_test::AV1DISTWTDCOMPAVG::DistWtdCompAvgParam; -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGTest); -using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest; -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGUPSAMPLEDTest); -using libaom_test::AV1DISTWTDCOMPAVG::DistWtdCompAvgTest; -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdCompAvgTest); -#if CONFIG_AV1_HIGHBITDEPTH -using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest; -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDISTWTDCOMPAVGTest); -using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest; -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST( - AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest); -#endif using std::make_tuple; using std::tuple; +namespace { + +const int kMaxSize = 128 + 32; // padding + +typedef void (*distwtdcompavg_func)(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef void (*distwtdcompavgupsampled_func)( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search); + +typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef std::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam; + +typedef std::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE> + DISTWTDCOMPAVGUPSAMPLEDParam; + +typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam; + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*highbddistwtdcompavgupsampled_func)( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search); + +typedef std::tuple<int, highbddistwtdcompavgupsampled_func, BLOCK_SIZE> + HighbdDISTWTDCOMPAVGUPSAMPLEDParam; + +typedef std::tuple<int, distwtdcompavg_func, BLOCK_SIZE> + HighbdDISTWTDCOMPAVGParam; + +#if HAVE_SSE2 || HAVE_NEON +::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGParam> BuildParams( + distwtdcompavg_func filter, int is_hbd) { + (void)is_hbd; + return ::testing::Combine(::testing::Range(8, 13, 2), + ::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> +BuildParams(highbddistwtdcompavgupsampled_func filter) { + return ::testing::Combine(::testing::Range(8, 13, 2), + ::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} +#endif // HAVE_SSE2 || HAVE_NEON +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_SSSE3 
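+// BuildParams() pairs the given implementation with every block size in
+// [BLOCK_4X4, BLOCK_SIZES_ALL) when instantiating the parameterized tests.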
+::testing::internal::ParamGenerator<DISTWTDCOMPAVGParam> BuildParams( + distwtdcompavg_func filter) { + return ::testing::Combine(::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} +#endif // HAVE_SSSE3 + +#if HAVE_SSSE3 || HAVE_NEON +::testing::internal::ParamGenerator<DISTWTDCOMPAVGUPSAMPLEDParam> BuildParams( + distwtdcompavgupsampled_func filter) { + return ::testing::Combine(::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} +#endif // HAVE_SSSE3 || HAVE_NEON + +class AV1DISTWTDCOMPAVGTest + : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> { + public: + ~AV1DISTWTDCOMPAVGTest() override = default; + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } + + protected: + void RunCheckOutput(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + uint8_t output[kMaxSize * kMaxSize]; + uint8_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + aom_dist_wtd_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, + in_w, in_h, ref8 + offset_r * w + offset_c, + in_w, &dist_wtd_comp_params); + test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h, + ref8 + offset_r * w + offset_c, in_w, &dist_wtd_comp_params); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " << idx + << " = (" << i << ", " << j << ")"; + } + } + } + } + } + void RunSpeedTest(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + uint8_t output[kMaxSize * kMaxSize]; + uint8_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; + + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + aom_dist_wtd_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w, + &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("distwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + 
aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(output2, pred8, in_w, in_h, ref8, in_w, &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); + printf("distwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1DISTWTDCOMPAVGTest + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGTest); + +class AV1DISTWTDCOMPAVGUPSAMPLEDTest + : public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> { + public: + ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() override = default; + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } + + protected: + void RunCheckOutput(distwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + int sub_x_q3, sub_y_q3; + int subpel_search; + for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) { + for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) { + for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + dist_wtd_comp_params.bck_offset = + quant_dist_lookup_table[jj][1 - ii]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + + aom_dist_wtd_comp_avg_upsampled_pred_c( + nullptr, nullptr, 0, 0, nullptr, output, + pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, + sub_y_q3, ref8 + offset_r * w + offset_c, in_w, + &dist_wtd_comp_params, subpel_search); + test_impl(nullptr, nullptr, 0, 0, nullptr, output2, + pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, + sub_y_q3, ref8 + offset_r * w + offset_c, in_w, + &dist_wtd_comp_params, subpel_search); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for " + "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << sub_y_q3 << ", " + << sub_x_q3 << ")"; + } + } + } + } + } + } + } + } + void RunSpeedTest(distwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = 
quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; + + int sub_x_q3 = 0; + int sub_y_q3 = 0; + + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. + + for (int i = 0; i < num_loops; ++i) + aom_dist_wtd_comp_avg_upsampled_pred_c( + nullptr, nullptr, 0, 0, nullptr, output, pred8, in_w, in_h, sub_x_q3, + sub_y_q3, ref8, in_w, &dist_wtd_comp_params, subpel_search); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(nullptr, nullptr, 0, 0, nullptr, output2, pred8, in_w, in_h, + sub_x_q3, sub_y_q3, ref8, in_w, &dist_wtd_comp_params, + subpel_search); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); + printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1DISTWTDCOMPAVGUPSAMPLEDTest + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGUPSAMPLEDTest); + +class DistWtdCompAvgTest + : public ::testing::WithParamInterface<DistWtdCompAvgParam>, + public ::testing::Test { + public: + DistWtdCompAvgTest() + : width_(GET_PARAM(0)), height_(GET_PARAM(1)), bd_(GET_PARAM(3)) {} + + static void SetUpTestSuite() { + reference_data8_ = reinterpret_cast<uint8_t *>( + aom_memalign(kDataAlignment, kDataBufferSize)); + ASSERT_NE(reference_data8_, nullptr); + second_pred8_ = + reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128)); + ASSERT_NE(second_pred8_, nullptr); + comp_pred8_ = + reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128)); + ASSERT_NE(comp_pred8_, nullptr); + comp_pred8_test_ = + reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128)); + ASSERT_NE(comp_pred8_test_, nullptr); + reference_data16_ = reinterpret_cast<uint16_t *>( + aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t))); + ASSERT_NE(reference_data16_, nullptr); + second_pred16_ = reinterpret_cast<uint16_t *>( + aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t))); + ASSERT_NE(second_pred16_, nullptr); + comp_pred16_ = reinterpret_cast<uint16_t *>( + aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t))); + ASSERT_NE(comp_pred16_, nullptr); + comp_pred16_test_ = reinterpret_cast<uint16_t *>( + aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t))); + ASSERT_NE(comp_pred16_test_, nullptr); + } + + static void TearDownTestSuite() { + aom_free(reference_data8_); + reference_data8_ = nullptr; + aom_free(second_pred8_); + second_pred8_ = nullptr; + aom_free(comp_pred8_); + comp_pred8_ = nullptr; + aom_free(comp_pred8_test_); + comp_pred8_test_ = nullptr; + aom_free(reference_data16_); + reference_data16_ = nullptr; + aom_free(second_pred16_); + second_pred16_ = nullptr; + aom_free(comp_pred16_); + comp_pred16_ = nullptr; + aom_free(comp_pred16_test_); + comp_pred16_test_ = nullptr; + } + + protected: + // Handle up to 4 128x128 blocks, with stride up to 256 + static const int kDataAlignment = 16; + static const int kDataBlockSize = 128 * 256; + static const int kDataBufferSize = 4 * kDataBlockSize; + + void SetUp() override { + if 
(bd_ == -1) { + use_high_bit_depth_ = false; + bit_depth_ = AOM_BITS_8; + reference_data_ = reference_data8_; + second_pred_ = second_pred8_; + comp_pred_ = comp_pred8_; + comp_pred_test_ = comp_pred8_test_; + } else { + use_high_bit_depth_ = true; + bit_depth_ = static_cast<aom_bit_depth_t>(bd_); + reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_); + second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_); + comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_); + comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_); + } + mask_ = (1 << bit_depth_) - 1; + reference_stride_ = width_ * 2; + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + virtual uint8_t *GetReference(int block_idx) { + if (use_high_bit_depth_) + return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) + + block_idx * kDataBlockSize); + return reference_data_ + block_idx * kDataBlockSize; + } + + void ReferenceDistWtdCompAvg(int block_idx) { + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const second_pred8 = second_pred_; + uint8_t *const comp_pred8 = comp_pred_; + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_); + uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + const int tmp = + second_pred8[h * width_ + w] * jcp_param_.bck_offset + + reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset; + comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4); + } else { + const int tmp = + second_pred16[h * width_ + w] * jcp_param_.bck_offset + + reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset; + comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4); + } + } + } + } + + void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) { + uint8_t *data8 = data; + uint16_t *data16 = CONVERT_TO_SHORTPTR(data); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + data8[h * stride + w] = static_cast<uint8_t>(fill_constant); + } else { + data16[h * stride + w] = fill_constant; + } + } + } + } + + void FillRandom(uint8_t *data, int stride) { + uint8_t *data8 = data; + uint16_t *data16 = CONVERT_TO_SHORTPTR(data); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + data8[h * stride + w] = rnd_.Rand8(); + } else { + data16[h * stride + w] = rnd_.Rand16() & mask_; + } + } + } + } + + void dist_wtd_comp_avg(int block_idx) { + const uint8_t *const reference = GetReference(block_idx); + + API_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_, + height_, reference, reference_stride_, + &jcp_param_)); + } + + void CheckCompAvg() { + for (int j = 0; j < 2; ++j) { + for (int i = 0; i < 4; ++i) { + jcp_param_.fwd_offset = quant_dist_lookup_table[i][j]; + jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j]; + + ReferenceDistWtdCompAvg(0); + dist_wtd_comp_avg(0); + + for (int y = 0; y < height_; ++y) + for (int x = 0; x < width_; ++x) + ASSERT_EQ(comp_pred_[y * width_ + x], + comp_pred_test_[y * width_ + x]); + } + } + } + + int width_, height_, mask_, bd_; + aom_bit_depth_t bit_depth_; + static uint8_t *reference_data_; + static uint8_t *second_pred_; + bool use_high_bit_depth_; + static uint8_t *reference_data8_; + static uint8_t *second_pred8_; + static uint16_t *reference_data16_; + static uint16_t *second_pred16_; + int 
reference_stride_; + static uint8_t *comp_pred_; + static uint8_t *comp_pred8_; + static uint16_t *comp_pred16_; + static uint8_t *comp_pred_test_; + static uint8_t *comp_pred8_test_; + static uint16_t *comp_pred16_test_; + DIST_WTD_COMP_PARAMS jcp_param_; + + ACMRandom rnd_; +}; + uint8_t *DistWtdCompAvgTest::reference_data_ = nullptr; uint8_t *DistWtdCompAvgTest::second_pred_ = nullptr; uint8_t *DistWtdCompAvgTest::comp_pred_ = nullptr; @@ -42,7 +533,253 @@ uint16_t *DistWtdCompAvgTest::second_pred16_ = nullptr; uint16_t *DistWtdCompAvgTest::comp_pred16_ = nullptr; uint16_t *DistWtdCompAvgTest::comp_pred16_test_ = nullptr; -namespace { +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdCompAvgTest); + +#if CONFIG_AV1_HIGHBITDEPTH +class AV1HighBDDISTWTDCOMPAVGTest + : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> { + public: + ~AV1HighBDDISTWTDCOMPAVGTest() override = default; + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } + + protected: + void RunCheckOutput(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + uint16_t output[kMaxSize * kMaxSize]; + uint16_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + aom_highbd_dist_wtd_comp_avg_pred_c( + CONVERT_TO_BYTEPTR(output), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h, + CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, + &dist_wtd_comp_params); + test_impl(CONVERT_TO_BYTEPTR(output2), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, + in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, + in_w, &dist_wtd_comp_params); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for AV1HighBDDISTWTDCOMPAVGTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " << idx + << " = (" << i << ", " << j << ")"; + } + } + } + } + } + void RunSpeedTest(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + uint16_t output[kMaxSize * kMaxSize]; + uint16_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = 
quant_dist_lookup_table[0][1]; + + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + aom_highbd_dist_wtd_comp_avg_pred_c( + CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h, + CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("highbddistwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w, + in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); + printf("highbddistwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1HighBDDISTWTDCOMPAVGTest + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDISTWTDCOMPAVGTest); + +class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest + : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> { + public: + ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() override = default; + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } + + protected: + void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]); + DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + int sub_x_q3, sub_y_q3; + int subpel_search; + for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) { + for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) { + for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + dist_wtd_comp_params.bck_offset = + quant_dist_lookup_table[jj][1 - ii]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + + aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(output), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, + in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd, + &dist_wtd_comp_params, subpel_search); + test_impl(nullptr, nullptr, 0, 0, nullptr, + CONVERT_TO_BYTEPTR(output2), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, + in_w, in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, + in_w, bd, &dist_wtd_comp_params, subpel_search); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for " + 
"AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << sub_y_q3 << ", " + << sub_x_q3 << ")"; + } + } + } + } + } + } + } + } + void RunSpeedTest(highbddistwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]); + DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; + int sub_x_q3 = 0; + int sub_y_q3 = 0; + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. + for (int i = 0; i < num_loops; ++i) + aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(output), + CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, + subpel_search); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("highbddistwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, + in_h, 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(output2), + CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, + subpel_search); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); + printf("highbddistwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, + in_h, 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST( + AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest); +#endif // CONFIG_AV1_HIGHBITDEPTH TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } @@ -50,8 +787,7 @@ TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGTest, - libaom_test::AV1DISTWTDCOMPAVG::BuildParams( - aom_dist_wtd_comp_avg_pred_ssse3)); + BuildParams(aom_dist_wtd_comp_avg_pred_ssse3)); #endif TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { @@ -63,15 +799,15 @@ TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { } #if HAVE_SSSE3 -INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest, - libaom_test::AV1DISTWTDCOMPAVG::BuildParams( - aom_dist_wtd_comp_avg_upsampled_pred_ssse3)); +INSTANTIATE_TEST_SUITE_P( + SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest, + BuildParams(aom_dist_wtd_comp_avg_upsampled_pred_ssse3)); #endif #if HAVE_NEON -INSTANTIATE_TEST_SUITE_P(NEON, AV1DISTWTDCOMPAVGUPSAMPLEDTest, - 
libaom_test::AV1DISTWTDCOMPAVG::BuildParams( - aom_dist_wtd_comp_avg_upsampled_pred_neon)); +INSTANTIATE_TEST_SUITE_P( + NEON, AV1DISTWTDCOMPAVGUPSAMPLEDTest, + BuildParams(aom_dist_wtd_comp_avg_upsampled_pred_neon)); #endif // HAVE_NEON TEST_P(DistWtdCompAvgTest, MaxRef) { @@ -214,14 +950,14 @@ TEST_P(AV1HighBDDISTWTDCOMPAVGTest, CheckOutput) { #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest, - libaom_test::AV1DISTWTDCOMPAVG::BuildParams( - aom_highbd_dist_wtd_comp_avg_pred_sse2, 1)); + BuildParams(aom_highbd_dist_wtd_comp_avg_pred_sse2, + 1)); #endif #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDISTWTDCOMPAVGTest, - libaom_test::AV1DISTWTDCOMPAVG::BuildParams( - aom_highbd_dist_wtd_comp_avg_pred_neon, 1)); + BuildParams(aom_highbd_dist_wtd_comp_avg_pred_neon, + 1)); #endif TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { @@ -233,15 +969,15 @@ TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { } #if HAVE_SSE2 -INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, - libaom_test::AV1DISTWTDCOMPAVG::BuildParams( - aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2)); +INSTANTIATE_TEST_SUITE_P( + SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, + BuildParams(aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2)); #endif #if HAVE_NEON -INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, - libaom_test::AV1DISTWTDCOMPAVG::BuildParams( - aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon)); +INSTANTIATE_TEST_SUITE_P( + NEON, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, + BuildParams(aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon)); #endif #endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/test/comp_avg_pred_test.h b/test/comp_avg_pred_test.h deleted file mode 100644 index fd8a8046a5..0000000000 --- a/test/comp_avg_pred_test.h +++ /dev/null @@ -1,757 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved. - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_TEST_COMP_AVG_PRED_TEST_H_ -#define AOM_TEST_COMP_AVG_PRED_TEST_H_ - -#include <tuple> - -#include "config/aom_dsp_rtcd.h" -#include "config/av1_rtcd.h" - -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" -#include "test/acm_random.h" -#include "test/util.h" -#include "test/register_state_check.h" -#include "av1/common/common_data.h" -#include "aom_ports/aom_timer.h" - -namespace libaom_test { -const int kMaxSize = 128 + 32; // padding - -namespace AV1DISTWTDCOMPAVG { - -typedef void (*distwtdcompavg_func)(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, - int ref_stride, - const DIST_WTD_COMP_PARAMS *jcp_param); - -typedef void (*distwtdcompavgupsampled_func)( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search); - -typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, - int ref_stride, - const DIST_WTD_COMP_PARAMS *jcp_param); - -typedef std::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam; - -typedef std::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE> - DISTWTDCOMPAVGUPSAMPLEDParam; - -typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam; - -#if CONFIG_AV1_HIGHBITDEPTH -typedef void (*highbddistwtdcompavgupsampled_func)( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, - int subpel_search); - -typedef std::tuple<int, highbddistwtdcompavgupsampled_func, BLOCK_SIZE> - HighbdDISTWTDCOMPAVGUPSAMPLEDParam; - -typedef std::tuple<int, distwtdcompavg_func, BLOCK_SIZE> - HighbdDISTWTDCOMPAVGParam; - -::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGParam> BuildParams( - distwtdcompavg_func filter, int is_hbd) { - (void)is_hbd; - return ::testing::Combine(::testing::Range(8, 13, 2), - ::testing::Values(filter), - ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); -} - -::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> -BuildParams(highbddistwtdcompavgupsampled_func filter) { - return ::testing::Combine(::testing::Range(8, 13, 2), - ::testing::Values(filter), - ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); -} -#endif // CONFIG_AV1_HIGHBITDEPTH - -::testing::internal::ParamGenerator<DISTWTDCOMPAVGParam> BuildParams( - distwtdcompavg_func filter) { - return ::testing::Combine(::testing::Values(filter), - ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); -} - -::testing::internal::ParamGenerator<DISTWTDCOMPAVGUPSAMPLEDParam> BuildParams( - distwtdcompavgupsampled_func filter) { - return ::testing::Combine(::testing::Values(filter), - ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); -} - -class AV1DISTWTDCOMPAVGTest - : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> { - public: - ~AV1DISTWTDCOMPAVGTest() override = default; - void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } - - protected: - void RunCheckOutput(distwtdcompavg_func test_impl) { - const int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(1); - - uint8_t pred8[kMaxSize * kMaxSize]; - uint8_t ref8[kMaxSize * kMaxSize]; - uint8_t 
output[kMaxSize * kMaxSize]; - uint8_t output2[kMaxSize * kMaxSize]; - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand8(); - ref8[i * w + j] = rnd_.Rand8(); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - - for (int ii = 0; ii < 2; ii++) { - for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; - - const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); - const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); - aom_dist_wtd_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, - in_w, in_h, ref8 + offset_r * w + offset_c, - in_w, &dist_wtd_comp_params); - test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h, - ref8 + offset_r * w + offset_c, in_w, &dist_wtd_comp_params); - - for (int i = 0; i < in_h; ++i) { - for (int j = 0; j < in_w; ++j) { - int idx = i * in_w + j; - ASSERT_EQ(output[idx], output2[idx]) - << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n" - << in_w << "x" << in_h << " Pixel mismatch at index " << idx - << " = (" << i << ", " << j << ")"; - } - } - } - } - } - void RunSpeedTest(distwtdcompavg_func test_impl) { - const int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(1); - - uint8_t pred8[kMaxSize * kMaxSize]; - uint8_t ref8[kMaxSize * kMaxSize]; - uint8_t output[kMaxSize * kMaxSize]; - uint8_t output2[kMaxSize * kMaxSize]; - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand8(); - ref8[i * w + j] = rnd_.Rand8(); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; - - const int num_loops = 1000000000 / (in_w + in_h); - aom_usec_timer timer; - aom_usec_timer_start(&timer); - - for (int i = 0; i < num_loops; ++i) - aom_dist_wtd_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w, - &dist_wtd_comp_params); - - aom_usec_timer_mark(&timer); - const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - printf("distwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, - 1000.0 * elapsed_time / num_loops); - - aom_usec_timer timer1; - aom_usec_timer_start(&timer1); - - for (int i = 0; i < num_loops; ++i) - test_impl(output2, pred8, in_w, in_h, ref8, in_w, &dist_wtd_comp_params); - - aom_usec_timer_mark(&timer1); - const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); - printf("distwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, - 1000.0 * elapsed_time1 / num_loops); - } - - libaom_test::ACMRandom rnd_; -}; // class AV1DISTWTDCOMPAVGTest - -class AV1DISTWTDCOMPAVGUPSAMPLEDTest - : public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> { - public: - ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() override = default; - void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } - - protected: - void RunCheckOutput(distwtdcompavgupsampled_func test_impl) { - const int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(1); - - uint8_t pred8[kMaxSize * kMaxSize]; - uint8_t ref8[kMaxSize * kMaxSize]; - DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); - 
DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand8(); - ref8[i * w + j] = rnd_.Rand8(); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - int sub_x_q3, sub_y_q3; - int subpel_search; - for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; - ++subpel_search) { - for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) { - for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { - for (int ii = 0; ii < 2; ii++) { - for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; - dist_wtd_comp_params.bck_offset = - quant_dist_lookup_table[jj][1 - ii]; - - const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); - const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); - - aom_dist_wtd_comp_avg_upsampled_pred_c( - nullptr, nullptr, 0, 0, nullptr, output, - pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, - sub_y_q3, ref8 + offset_r * w + offset_c, in_w, - &dist_wtd_comp_params, subpel_search); - test_impl(nullptr, nullptr, 0, 0, nullptr, output2, - pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, - sub_y_q3, ref8 + offset_r * w + offset_c, in_w, - &dist_wtd_comp_params, subpel_search); - - for (int i = 0; i < in_h; ++i) { - for (int j = 0; j < in_w; ++j) { - int idx = i * in_w + j; - ASSERT_EQ(output[idx], output2[idx]) - << "Mismatch at unit tests for " - "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n" - << in_w << "x" << in_h << " Pixel mismatch at index " - << idx << " = (" << i << ", " << j - << "), sub pixel offset = (" << sub_y_q3 << ", " - << sub_x_q3 << ")"; - } - } - } - } - } - } - } - } - void RunSpeedTest(distwtdcompavgupsampled_func test_impl) { - const int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(1); - - uint8_t pred8[kMaxSize * kMaxSize]; - uint8_t ref8[kMaxSize * kMaxSize]; - DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand8(); - ref8[i * w + j] = rnd_.Rand8(); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; - - int sub_x_q3 = 0; - int sub_y_q3 = 0; - - const int num_loops = 1000000000 / (in_w + in_h); - aom_usec_timer timer; - aom_usec_timer_start(&timer); - int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. 
- - for (int i = 0; i < num_loops; ++i) - aom_dist_wtd_comp_avg_upsampled_pred_c( - nullptr, nullptr, 0, 0, nullptr, output, pred8, in_w, in_h, sub_x_q3, - sub_y_q3, ref8, in_w, &dist_wtd_comp_params, subpel_search); - - aom_usec_timer_mark(&timer); - const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h, - 1000.0 * elapsed_time / num_loops); - - aom_usec_timer timer1; - aom_usec_timer_start(&timer1); - - for (int i = 0; i < num_loops; ++i) - test_impl(nullptr, nullptr, 0, 0, nullptr, output2, pred8, in_w, in_h, - sub_x_q3, sub_y_q3, ref8, in_w, &dist_wtd_comp_params, - subpel_search); - - aom_usec_timer_mark(&timer1); - const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); - printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h, - 1000.0 * elapsed_time1 / num_loops); - } - - libaom_test::ACMRandom rnd_; -}; // class AV1DISTWTDCOMPAVGUPSAMPLEDTest - -class DistWtdCompAvgTest - : public ::testing::WithParamInterface<DistWtdCompAvgParam>, - public ::testing::Test { - public: - DistWtdCompAvgTest() - : width_(GET_PARAM(0)), height_(GET_PARAM(1)), bd_(GET_PARAM(3)) {} - - static void SetUpTestSuite() { - reference_data8_ = reinterpret_cast<uint8_t *>( - aom_memalign(kDataAlignment, kDataBufferSize)); - ASSERT_NE(reference_data8_, nullptr); - second_pred8_ = - reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128)); - ASSERT_NE(second_pred8_, nullptr); - comp_pred8_ = - reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128)); - ASSERT_NE(comp_pred8_, nullptr); - comp_pred8_test_ = - reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128)); - ASSERT_NE(comp_pred8_test_, nullptr); - reference_data16_ = reinterpret_cast<uint16_t *>( - aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t))); - ASSERT_NE(reference_data16_, nullptr); - second_pred16_ = reinterpret_cast<uint16_t *>( - aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t))); - ASSERT_NE(second_pred16_, nullptr); - comp_pred16_ = reinterpret_cast<uint16_t *>( - aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t))); - ASSERT_NE(comp_pred16_, nullptr); - comp_pred16_test_ = reinterpret_cast<uint16_t *>( - aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t))); - ASSERT_NE(comp_pred16_test_, nullptr); - } - - static void TearDownTestSuite() { - aom_free(reference_data8_); - reference_data8_ = nullptr; - aom_free(second_pred8_); - second_pred8_ = nullptr; - aom_free(comp_pred8_); - comp_pred8_ = nullptr; - aom_free(comp_pred8_test_); - comp_pred8_test_ = nullptr; - aom_free(reference_data16_); - reference_data16_ = nullptr; - aom_free(second_pred16_); - second_pred16_ = nullptr; - aom_free(comp_pred16_); - comp_pred16_ = nullptr; - aom_free(comp_pred16_test_); - comp_pred16_test_ = nullptr; - } - - protected: - // Handle up to 4 128x128 blocks, with stride up to 256 - static const int kDataAlignment = 16; - static const int kDataBlockSize = 128 * 256; - static const int kDataBufferSize = 4 * kDataBlockSize; - - void SetUp() override { - if (bd_ == -1) { - use_high_bit_depth_ = false; - bit_depth_ = AOM_BITS_8; - reference_data_ = reference_data8_; - second_pred_ = second_pred8_; - comp_pred_ = comp_pred8_; - comp_pred_test_ = comp_pred8_test_; - } else { - use_high_bit_depth_ = true; - bit_depth_ = static_cast<aom_bit_depth_t>(bd_); - reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_); - second_pred_ = 
CONVERT_TO_BYTEPTR(second_pred16_); - comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_); - comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_); - } - mask_ = (1 << bit_depth_) - 1; - reference_stride_ = width_ * 2; - rnd_.Reset(ACMRandom::DeterministicSeed()); - } - - virtual uint8_t *GetReference(int block_idx) { - if (use_high_bit_depth_) - return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) + - block_idx * kDataBlockSize); - return reference_data_ + block_idx * kDataBlockSize; - } - - void ReferenceDistWtdCompAvg(int block_idx) { - const uint8_t *const reference8 = GetReference(block_idx); - const uint8_t *const second_pred8 = second_pred_; - uint8_t *const comp_pred8 = comp_pred_; - const uint16_t *const reference16 = - CONVERT_TO_SHORTPTR(GetReference(block_idx)); - const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_); - uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_); - for (int h = 0; h < height_; ++h) { - for (int w = 0; w < width_; ++w) { - if (!use_high_bit_depth_) { - const int tmp = - second_pred8[h * width_ + w] * jcp_param_.bck_offset + - reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset; - comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4); - } else { - const int tmp = - second_pred16[h * width_ + w] * jcp_param_.bck_offset + - reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset; - comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4); - } - } - } - } - - void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) { - uint8_t *data8 = data; - uint16_t *data16 = CONVERT_TO_SHORTPTR(data); - for (int h = 0; h < height_; ++h) { - for (int w = 0; w < width_; ++w) { - if (!use_high_bit_depth_) { - data8[h * stride + w] = static_cast<uint8_t>(fill_constant); - } else { - data16[h * stride + w] = fill_constant; - } - } - } - } - - void FillRandom(uint8_t *data, int stride) { - uint8_t *data8 = data; - uint16_t *data16 = CONVERT_TO_SHORTPTR(data); - for (int h = 0; h < height_; ++h) { - for (int w = 0; w < width_; ++w) { - if (!use_high_bit_depth_) { - data8[h * stride + w] = rnd_.Rand8(); - } else { - data16[h * stride + w] = rnd_.Rand16() & mask_; - } - } - } - } - - void dist_wtd_comp_avg(int block_idx) { - const uint8_t *const reference = GetReference(block_idx); - - API_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_, - height_, reference, reference_stride_, - &jcp_param_)); - } - - void CheckCompAvg() { - for (int j = 0; j < 2; ++j) { - for (int i = 0; i < 4; ++i) { - jcp_param_.fwd_offset = quant_dist_lookup_table[i][j]; - jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j]; - - ReferenceDistWtdCompAvg(0); - dist_wtd_comp_avg(0); - - for (int y = 0; y < height_; ++y) - for (int x = 0; x < width_; ++x) - ASSERT_EQ(comp_pred_[y * width_ + x], - comp_pred_test_[y * width_ + x]); - } - } - } - - int width_, height_, mask_, bd_; - aom_bit_depth_t bit_depth_; - static uint8_t *reference_data_; - static uint8_t *second_pred_; - bool use_high_bit_depth_; - static uint8_t *reference_data8_; - static uint8_t *second_pred8_; - static uint16_t *reference_data16_; - static uint16_t *second_pred16_; - int reference_stride_; - static uint8_t *comp_pred_; - static uint8_t *comp_pred8_; - static uint16_t *comp_pred16_; - static uint8_t *comp_pred_test_; - static uint8_t *comp_pred8_test_; - static uint16_t *comp_pred16_test_; - DIST_WTD_COMP_PARAMS jcp_param_; - - ACMRandom rnd_; -}; - -#if CONFIG_AV1_HIGHBITDEPTH -class AV1HighBDDISTWTDCOMPAVGTest - : public 
::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> { - public: - ~AV1HighBDDISTWTDCOMPAVGTest() override = default; - void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } - - protected: - void RunCheckOutput(distwtdcompavg_func test_impl) { - const int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(2); - const int bd = GET_PARAM(0); - uint16_t pred8[kMaxSize * kMaxSize]; - uint16_t ref8[kMaxSize * kMaxSize]; - uint16_t output[kMaxSize * kMaxSize]; - uint16_t output2[kMaxSize * kMaxSize]; - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); - ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - - for (int ii = 0; ii < 2; ii++) { - for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; - - const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); - const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); - aom_highbd_dist_wtd_comp_avg_pred_c( - CONVERT_TO_BYTEPTR(output), - CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h, - CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, - &dist_wtd_comp_params); - test_impl(CONVERT_TO_BYTEPTR(output2), - CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, - in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, - in_w, &dist_wtd_comp_params); - - for (int i = 0; i < in_h; ++i) { - for (int j = 0; j < in_w; ++j) { - int idx = i * in_w + j; - ASSERT_EQ(output[idx], output2[idx]) - << "Mismatch at unit tests for AV1HighBDDISTWTDCOMPAVGTest\n" - << in_w << "x" << in_h << " Pixel mismatch at index " << idx - << " = (" << i << ", " << j << ")"; - } - } - } - } - } - void RunSpeedTest(distwtdcompavg_func test_impl) { - const int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(2); - const int bd = GET_PARAM(0); - uint16_t pred8[kMaxSize * kMaxSize]; - uint16_t ref8[kMaxSize * kMaxSize]; - uint16_t output[kMaxSize * kMaxSize]; - uint16_t output2[kMaxSize * kMaxSize]; - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); - ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; - - const int num_loops = 1000000000 / (in_w + in_h); - aom_usec_timer timer; - aom_usec_timer_start(&timer); - - for (int i = 0; i < num_loops; ++i) - aom_highbd_dist_wtd_comp_avg_pred_c( - CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h, - CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params); - - aom_usec_timer_mark(&timer); - const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - printf("highbddistwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, - 1000.0 * elapsed_time / num_loops); - - aom_usec_timer timer1; - aom_usec_timer_start(&timer1); - - for (int i = 0; i < num_loops; ++i) - test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w, - in_h, CONVERT_TO_BYTEPTR(ref8), in_w, 
&dist_wtd_comp_params); - - aom_usec_timer_mark(&timer1); - const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); - printf("highbddistwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, - 1000.0 * elapsed_time1 / num_loops); - } - - libaom_test::ACMRandom rnd_; -}; // class AV1HighBDDISTWTDCOMPAVGTest - -class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest - : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> { - public: - ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() override = default; - void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } - - protected: - void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) { - const int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(2); - const int bd = GET_PARAM(0); - uint16_t pred8[kMaxSize * kMaxSize]; - uint16_t ref8[kMaxSize * kMaxSize]; - DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]); - DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]); - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); - ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - int sub_x_q3, sub_y_q3; - int subpel_search; - for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; - ++subpel_search) { - for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) { - for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { - for (int ii = 0; ii < 2; ii++) { - for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; - dist_wtd_comp_params.bck_offset = - quant_dist_lookup_table[jj][1 - ii]; - - const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); - const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); - - aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( - nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(output), - CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, - in_h, sub_x_q3, sub_y_q3, - CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd, - &dist_wtd_comp_params, subpel_search); - test_impl(nullptr, nullptr, 0, 0, nullptr, - CONVERT_TO_BYTEPTR(output2), - CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, - in_w, in_h, sub_x_q3, sub_y_q3, - CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, - in_w, bd, &dist_wtd_comp_params, subpel_search); - - for (int i = 0; i < in_h; ++i) { - for (int j = 0; j < in_w; ++j) { - int idx = i * in_w + j; - ASSERT_EQ(output[idx], output2[idx]) - << "Mismatch at unit tests for " - "AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest\n" - << in_w << "x" << in_h << " Pixel mismatch at index " - << idx << " = (" << i << ", " << j - << "), sub pixel offset = (" << sub_y_q3 << ", " - << sub_x_q3 << ")"; - } - } - } - } - } - } - } - } - void RunSpeedTest(highbddistwtdcompavgupsampled_func test_impl) { - const int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(2); - const int bd = GET_PARAM(0); - uint16_t pred8[kMaxSize * kMaxSize]; - uint16_t ref8[kMaxSize * kMaxSize]; - DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]); - DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]); - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); - ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = 
block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; - int sub_x_q3 = 0; - int sub_y_q3 = 0; - const int num_loops = 1000000000 / (in_w + in_h); - aom_usec_timer timer; - aom_usec_timer_start(&timer); - int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. - for (int i = 0; i < num_loops; ++i) - aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( - nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(output), - CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, - CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, - subpel_search); - - aom_usec_timer_mark(&timer); - const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - printf("highbddistwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, - in_h, 1000.0 * elapsed_time / num_loops); - - aom_usec_timer timer1; - aom_usec_timer_start(&timer1); - - for (int i = 0; i < num_loops; ++i) - test_impl(nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(output2), - CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, - CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, - subpel_search); - - aom_usec_timer_mark(&timer1); - const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); - printf("highbddistwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, - in_h, 1000.0 * elapsed_time1 / num_loops); - } - - libaom_test::ACMRandom rnd_; -}; // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest -#endif // CONFIG_AV1_HIGHBITDEPTH - -} // namespace AV1DISTWTDCOMPAVG -} // namespace libaom_test - -#endif // AOM_TEST_COMP_AVG_PRED_TEST_H_ diff --git a/test/test.cmake b/test/test.cmake index 4a6d9b2ae8..a3e0d6abec 100644 --- a/test/test.cmake +++ b/test/test.cmake @@ -198,7 +198,6 @@ if(NOT BUILD_SHARED_LIBS) "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc" "${AOM_ROOT}/test/blend_a64_mask_test.cc" "${AOM_ROOT}/test/comp_avg_pred_test.cc" - "${AOM_ROOT}/test/comp_avg_pred_test.h" "${AOM_ROOT}/test/comp_mask_pred_test.cc" "${AOM_ROOT}/test/disflow_test.cc" "${AOM_ROOT}/test/encodemb_test.cc" -- GitLab From 790bfcb7de692d78755934ec95d8e1cf6989d379 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 14 Jun 2024 16:21:00 -0700 Subject: [PATCH 233/391] comp_avg_pred_test.cc: fix class names Along with the parameter names used in the tests. Use standard mixed case rather than all capital characters. 
Change-Id: I016867c20eabaaefd63ec68763ff98abddfb8395 --- test/comp_avg_pred_test.cc | 96 +++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index cdeeea3266..b9375d14b6 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -45,10 +45,10 @@ typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param); -typedef std::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam; +typedef std::tuple<distwtdcompavg_func, BLOCK_SIZE> AV1DistWtdCompAvgParam; typedef std::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE> - DISTWTDCOMPAVGUPSAMPLEDParam; + AV1DistWtdCompAvgUpsampledParam; typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam; @@ -61,13 +61,13 @@ typedef void (*highbddistwtdcompavgupsampled_func)( int subpel_search); typedef std::tuple<int, highbddistwtdcompavgupsampled_func, BLOCK_SIZE> - HighbdDISTWTDCOMPAVGUPSAMPLEDParam; + HighbdDistWtdCompAvgUpsampledParam; typedef std::tuple<int, distwtdcompavg_func, BLOCK_SIZE> - HighbdDISTWTDCOMPAVGParam; + HighbdDistWtdCompAvgParam; #if HAVE_SSE2 || HAVE_NEON -::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGParam> BuildParams( +::testing::internal::ParamGenerator<HighbdDistWtdCompAvgParam> BuildParams( distwtdcompavg_func filter, int is_hbd) { (void)is_hbd; return ::testing::Combine(::testing::Range(8, 13, 2), @@ -75,7 +75,7 @@ typedef std::tuple<int, distwtdcompavg_func, BLOCK_SIZE> ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); } -::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> +::testing::internal::ParamGenerator<HighbdDistWtdCompAvgUpsampledParam> BuildParams(highbddistwtdcompavgupsampled_func filter) { return ::testing::Combine(::testing::Range(8, 13, 2), ::testing::Values(filter), @@ -85,7 +85,7 @@ BuildParams(highbddistwtdcompavgupsampled_func filter) { #endif // CONFIG_AV1_HIGHBITDEPTH #if HAVE_SSSE3 -::testing::internal::ParamGenerator<DISTWTDCOMPAVGParam> BuildParams( +::testing::internal::ParamGenerator<AV1DistWtdCompAvgParam> BuildParams( distwtdcompavg_func filter) { return ::testing::Combine(::testing::Values(filter), ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); @@ -93,17 +93,17 @@ BuildParams(highbddistwtdcompavgupsampled_func filter) { #endif // HAVE_SSSE3 #if HAVE_SSSE3 || HAVE_NEON -::testing::internal::ParamGenerator<DISTWTDCOMPAVGUPSAMPLEDParam> BuildParams( - distwtdcompavgupsampled_func filter) { +::testing::internal::ParamGenerator<AV1DistWtdCompAvgUpsampledParam> +BuildParams(distwtdcompavgupsampled_func filter) { return ::testing::Combine(::testing::Values(filter), ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); } #endif // HAVE_SSSE3 || HAVE_NEON -class AV1DISTWTDCOMPAVGTest - : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> { +class AV1DistWtdCompAvgTest + : public ::testing::TestWithParam<AV1DistWtdCompAvgParam> { public: - ~AV1DISTWTDCOMPAVGTest() override = default; + ~AV1DistWtdCompAvgTest() override = default; void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } protected: @@ -144,7 +144,7 @@ class AV1DISTWTDCOMPAVGTest for (int j = 0; j < in_w; ++j) { int idx = i * in_w + j; ASSERT_EQ(output[idx], output2[idx]) - << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n" + << "Mismatch at unit tests for AV1DistWtdCompAvgTest\n" << in_w << "x" << in_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << ")"; } @@ -201,14 +201,14 @@ 
class AV1DISTWTDCOMPAVGTest } libaom_test::ACMRandom rnd_; -}; // class AV1DISTWTDCOMPAVGTest +}; // class AV1DistWtdCompAvgTest -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGTest); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DistWtdCompAvgTest); -class AV1DISTWTDCOMPAVGUPSAMPLEDTest - : public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> { +class AV1DistWtdCompAvgUpsampledTest + : public ::testing::TestWithParam<AV1DistWtdCompAvgUpsampledParam> { public: - ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() override = default; + ~AV1DistWtdCompAvgUpsampledTest() override = default; void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } protected: @@ -261,7 +261,7 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest int idx = i * in_w + j; ASSERT_EQ(output[idx], output2[idx]) << "Mismatch at unit tests for " - "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n" + "AV1DistWtdCompAvgUpsampledTest\n" << in_w << "x" << in_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << "), sub pixel offset = (" << sub_y_q3 << ", " @@ -330,9 +330,9 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest } libaom_test::ACMRandom rnd_; -}; // class AV1DISTWTDCOMPAVGUPSAMPLEDTest +}; // class AV1DistWtdCompAvgUpsampledTest -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGUPSAMPLEDTest); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DistWtdCompAvgUpsampledTest); class DistWtdCompAvgTest : public ::testing::WithParamInterface<DistWtdCompAvgParam>, @@ -536,10 +536,10 @@ uint16_t *DistWtdCompAvgTest::comp_pred16_test_ = nullptr; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdCompAvgTest); #if CONFIG_AV1_HIGHBITDEPTH -class AV1HighBDDISTWTDCOMPAVGTest - : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> { +class AV1HighBDDistWtdCompAvgTest + : public ::testing::TestWithParam<HighbdDistWtdCompAvgParam> { public: - ~AV1HighBDDISTWTDCOMPAVGTest() override = default; + ~AV1HighBDDistWtdCompAvgTest() override = default; void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } protected: @@ -584,7 +584,7 @@ class AV1HighBDDISTWTDCOMPAVGTest for (int j = 0; j < in_w; ++j) { int idx = i * in_w + j; ASSERT_EQ(output[idx], output2[idx]) - << "Mismatch at unit tests for AV1HighBDDISTWTDCOMPAVGTest\n" + << "Mismatch at unit tests for AV1HighBDDistWtdCompAvgTest\n" << in_w << "x" << in_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << ")"; } @@ -643,14 +643,14 @@ class AV1HighBDDISTWTDCOMPAVGTest } libaom_test::ACMRandom rnd_; -}; // class AV1HighBDDISTWTDCOMPAVGTest +}; // class AV1HighBDDistWtdCompAvgTest -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDISTWTDCOMPAVGTest); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDistWtdCompAvgTest); -class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest - : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> { +class AV1HighBDDistWtdCompAvgUpsampledTest + : public ::testing::TestWithParam<HighbdDistWtdCompAvgUpsampledParam> { public: - ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() override = default; + ~AV1HighBDDistWtdCompAvgUpsampledTest() override = default; void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } protected: @@ -706,7 +706,7 @@ class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest int idx = i * in_w + j; ASSERT_EQ(output[idx], output2[idx]) << "Mismatch at unit tests for " - "AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest\n" + "AV1HighBDDistWtdCompAvgUpsampledTest\n" << in_w << "x" << in_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << "), sub pixel offset = (" << 
sub_y_q3 << ", " @@ -775,38 +775,38 @@ class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest } libaom_test::ACMRandom rnd_; -}; // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest +}; // class AV1HighBDDistWtdCompAvgUpsampledTest GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST( - AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest); + AV1HighBDDistWtdCompAvgUpsampledTest); #endif // CONFIG_AV1_HIGHBITDEPTH -TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } +TEST_P(AV1DistWtdCompAvgTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } -TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } +TEST_P(AV1DistWtdCompAvgTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } #if HAVE_SSSE3 -INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGTest, +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DistWtdCompAvgTest, BuildParams(aom_dist_wtd_comp_avg_pred_ssse3)); #endif -TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { +TEST_P(AV1DistWtdCompAvgUpsampledTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } -TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { +TEST_P(AV1DistWtdCompAvgUpsampledTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( - SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest, + SSSE3, AV1DistWtdCompAvgUpsampledTest, BuildParams(aom_dist_wtd_comp_avg_upsampled_pred_ssse3)); #endif #if HAVE_NEON INSTANTIATE_TEST_SUITE_P( - NEON, AV1DISTWTDCOMPAVGUPSAMPLEDTest, + NEON, AV1DistWtdCompAvgUpsampledTest, BuildParams(aom_dist_wtd_comp_avg_upsampled_pred_neon)); #endif // HAVE_NEON @@ -940,43 +940,43 @@ INSTANTIATE_TEST_SUITE_P(NEON, DistWtdCompAvgTest, #endif // HAVE_NEON #if CONFIG_AV1_HIGHBITDEPTH -TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) { +TEST_P(AV1HighBDDistWtdCompAvgTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(1)); } -TEST_P(AV1HighBDDISTWTDCOMPAVGTest, CheckOutput) { +TEST_P(AV1HighBDDistWtdCompAvgTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); } #if HAVE_SSE2 -INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest, +INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDistWtdCompAvgTest, BuildParams(aom_highbd_dist_wtd_comp_avg_pred_sse2, 1)); #endif #if HAVE_NEON -INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDISTWTDCOMPAVGTest, +INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDistWtdCompAvgTest, BuildParams(aom_highbd_dist_wtd_comp_avg_pred_neon, 1)); #endif -TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { +TEST_P(AV1HighBDDistWtdCompAvgUpsampledTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(1)); } -TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { +TEST_P(AV1HighBDDistWtdCompAvgUpsampledTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); } #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P( - SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, + SSE2, AV1HighBDDistWtdCompAvgUpsampledTest, BuildParams(aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2)); #endif #if HAVE_NEON INSTANTIATE_TEST_SUITE_P( - NEON, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, + NEON, AV1HighBDDistWtdCompAvgUpsampledTest, BuildParams(aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon)); #endif -- GitLab From 3ca0c74cd7cb632362f5da3979b593262ce32689 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 18 Jun 2024 15:33:04 -0700 Subject: [PATCH 234/391] add av1/encoder/blockiness.h This adds a prototype for av1_get_blockiness() and fixes a -Wmissing-prototypes warning. 
+ removed unused headers from blockiness.c Bug: aomedia:3416 Change-Id: If1fa1a5d6b978c929ec28f613ff87ff97a1e083e --- av1/av1.cmake | 3 ++- av1/encoder/blockiness.c | 11 +++-------- av1/encoder/blockiness.h | 19 +++++++++++++++++++ av1/encoder/encoder.c | 7 +++---- 4 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 av1/encoder/blockiness.h diff --git a/av1/av1.cmake b/av1/av1.cmake index 4ca5007c12..1955255640 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -525,7 +525,8 @@ if(CONFIG_INSPECTION) endif() if(CONFIG_INTERNAL_STATS) - list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c") + list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c" + "${AOM_ROOT}/av1/encoder/blockiness.h") endif() if(CONFIG_REALTIME_ONLY) diff --git a/av1/encoder/blockiness.c b/av1/encoder/blockiness.c index 8c93df38d1..6945592d18 100644 --- a/av1/encoder/blockiness.c +++ b/av1/encoder/blockiness.c @@ -9,15 +9,10 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "config/av1_rtcd.h" -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" +#include "av1/encoder/blockiness.h" -#include "av1/common/common.h" -#include "av1/common/filter.h" -#include "aom/aom_integer.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" +#include <stdint.h> +#include <stdlib.h> static int horizontal_filter(const uint8_t *s) { return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; diff --git a/av1/encoder/blockiness.h b/av1/encoder/blockiness.h new file mode 100644 index 0000000000..e381a421a7 --- /dev/null +++ b/av1/encoder/blockiness.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_BLOCKINESS_H_ +#define AOM_AV1_ENCODER_BLOCKINESS_H_ + +double av1_get_blockiness(const unsigned char *img1, int img1_pitch, + const unsigned char *img2, int img2_pitch, int width, + int height); + +#endif // AOM_AV1_ENCODER_BLOCKINESS_H_ diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index fd293b1940..f8fca381d9 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -56,6 +56,9 @@ #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/bitstream.h" +#if CONFIG_INTERNAL_STATS +#include "av1/encoder/blockiness.h" +#endif #include "av1/encoder/context_tree.h" #include "av1/encoder/dwt.h" #include "av1/encoder/encodeframe.h" @@ -4202,10 +4205,6 @@ void print_entropy_stats(AV1_PRIMARY *const ppi) { #endif // CONFIG_ENTROPY_STATS #if CONFIG_INTERNAL_STATS -extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch, - const unsigned char *img2, int img2_pitch, - int width, int height); - static void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { s->stat[STAT_Y] += y; -- GitLab From 6db5fabf42c124d97d0a3593bca2e2f69d15d43e Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Thu, 13 Jun 2024 13:52:58 +0100 Subject: [PATCH 235/391] Halve filter values in Armv8.0 Neon convolve8 filter functions All filter values that can be used in aom_convolve8_(horiz|vert)_neon are even. Halve these filters to remove the need for saturating arithmetic in the convolution kernels. Change-Id: I604d1eeb420ded5936d72f36a9f301fb0d09cecd --- aom_dsp/arm/aom_convolve8_neon.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index 229d58c483..0928b93275 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c @@ -37,12 +37,12 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); sum = vmla_lane_s16(sum, s1, filter_lo, 1); sum = vmla_lane_s16(sum, s2, filter_lo, 2); + sum = vmla_lane_s16(sum, s3, filter_lo, 3); + sum = vmla_lane_s16(sum, s4, filter_hi, 0); sum = vmla_lane_s16(sum, s5, filter_hi, 1); sum = vmla_lane_s16(sum, s6, filter_hi, 2); sum = vmla_lane_s16(sum, s7, filter_hi, 3); - sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); - sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); return sum; } @@ -57,13 +57,14 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); - sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); - sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); - return vqrshrun_n_s16(sum, FILTER_BITS); + // We halved the filter values so -1 from right shift. 
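+  // Halving is lossless because every usable tap value is even; the halved
+  // taps also shrink the intermediate sums enough that the plain vmla
+  // accumulation above can no longer overflow int16_t, which is why the
+  // saturating vqadd steps could be removed.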
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, @@ -71,7 +72,9 @@ static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { - const int16x8_t filter = vld1q_s16(filter_x); + // All filter values are even so halve them to reduce intermediate precision + // requirements. + const int16x8_t filter = vshrq_n_s16(vld1q_s16(filter_x), 1); if (h == 4) { uint8x8_t t0, t1, t2, t3; @@ -101,8 +104,9 @@ static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); transpose_elems_inplace_u8_4x4(&d01, &d23); @@ -334,7 +338,9 @@ static INLINE void convolve8_vert_8tap_neon(const uint8_t *src, ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { - const int16x8_t filter = vld1q_s16(filter_y); + // All filter values are even so halve them to reduce intermediate precision + // requirements. + const int16x8_t filter = vshrq_n_s16(vld1q_s16(filter_y), 1); if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6; @@ -362,8 +368,9 @@ static INLINE void convolve8_vert_8tap_neon(const uint8_t *src, int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); -- GitLab From 7a30de497ab44b3d7c6abc1a38d9aa5fe8804211 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Tue, 11 Jun 2024 22:50:26 +0100 Subject: [PATCH 236/391] Add unit tests for aom_scaled_2d_(c|ssse3|neon) Add unit tests for C, SSSE3 and Neon implementations of aom_scaled_2d as none currently exist. This is a precursor to adding further Arm optimizations in later changes. 
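The core of the new checks, sketched here as a simplified illustration
(buffer setup and the gtest plumbing are elided, and the real test also
sweeps every filter bank, sub-pel phase and block size):

    // Run the C reference and the optimized kernel on identical input;
    // the scaled output must match the reference bit-exactly.
    for (int frac = 0; frac < 16; ++frac) {
      for (int step = 1; step <= 32; ++step) {
        aom_scaled_2d_c(in, kInputStride, ref, kOutputStride, filters,
                        frac, step, frac, step, width, height);
        aom_scaled_2d_neon(in, kInputStride, out, kOutputStride, filters,
                           frac, step, frac, step, width, height);
        for (int y = 0; y < height; ++y)
          for (int x = 0; x < width; ++x)
            assert(ref[y * kOutputStride + x] == out[y * kOutputStride + x]);
      }
    }

Step values up to 32 cover scaling factors down to x1/2, the smallest this
path needs to support.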
Change-Id: I2e07e0c06df8c37e90863c4f5589cab561d606cd --- test/convolve_test.cc | 203 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 195 insertions(+), 8 deletions(-) diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 18cfc71170..1895ec5108 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -30,7 +30,12 @@ namespace { static const unsigned int kMaxDimension = MAX_SB_SIZE; - +static const int kDataAlignment = 16; +static const int kOuterBlockSize = 4 * kMaxDimension; +static const int kInputStride = kOuterBlockSize; +static const int kOutputStride = kOuterBlockSize; +static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize; +static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize; static const int16_t kInvalidFilter[8] = {}; static const int kNumFilterBanks = SWITCHABLE_FILTERS; static const int kNumFilters = 16; @@ -321,13 +326,6 @@ class ConvolveTestBase : public ::testing::TestWithParam<ConvolveParam> { } protected: - static const int kDataAlignment = 16; - static const int kOuterBlockSize = 4 * kMaxDimension; - static const int kInputStride = kOuterBlockSize; - static const int kOutputStride = kOuterBlockSize; - static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize; - static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize; - int Width() const { return GET_PARAM(0); } int Height() const { return GET_PARAM(1); } int BorderLeft() const { @@ -942,4 +940,193 @@ INSTANTIATE_TEST_SUITE_P(SVE, HighbdConvolveTest, #endif #endif // HAVE_SVE +typedef void (*ConvolveScale2DFunc)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + +typedef std::tuple<int, int, ConvolveScale2DFunc> ConvolveScale2DParam; + +class ConvolveScale2DTest + : public ::testing::TestWithParam<ConvolveScale2DParam> { + public: + int Width() const { return GET_PARAM(0); } + int Height() const { return GET_PARAM(1); } + int BorderLeft() const { + const int center = (kOuterBlockSize - Width()) / 2; + return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1); + } + int BorderTop() const { return (kOuterBlockSize - Height()) / 2; } + + bool IsIndexInBorder(int i) { + return (i < BorderTop() * kOuterBlockSize || + i >= (BorderTop() + Height()) * kOuterBlockSize || + i % kOuterBlockSize < BorderLeft() || + i % kOuterBlockSize >= (BorderLeft() + Width())); + } + + void SetUp() override { + // Force input_ to be unaligned, output to be 16 byte aligned. 
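+    // (Mis-aligning the source pointer by one byte deliberately exercises
+    // SIMD implementations that might otherwise assume aligned loads; the
+    // matching "- 1" in TearDown() restores the original allocation.)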
+ input_ = reinterpret_cast<uint8_t *>( + aom_memalign(kDataAlignment, kInputBufferSize + 1)) + + 1; + output_ = reinterpret_cast<uint8_t *>( + aom_memalign(kDataAlignment, kOutputBufferSize)); + output_ref_ = reinterpret_cast<uint8_t *>( + aom_memalign(kDataAlignment, kOutputBufferSize)); + + ASSERT_NE(input_, nullptr); + ASSERT_NE(output_, nullptr); + ASSERT_NE(output_ref_, nullptr); + + test_func_ = GET_PARAM(2); + /* Set up guard blocks for an inner block centered in the outer block */ + for (int i = 0; i < kOutputBufferSize; ++i) { + if (IsIndexInBorder(i)) { + output_[i] = 255; + } else { + output_[i] = 0; + } + } + + ::libaom_test::ACMRandom prng; + for (int i = 0; i < kInputBufferSize; ++i) { + if (i & 1) { + input_[i] = 255; + } else { + input_[i] = prng.Rand8Extremes(); + } + } + } + + void TearDown() override { + aom_free(input_ - 1); + input_ = nullptr; + aom_free(output_); + output_ = nullptr; + aom_free(output_ref_); + output_ref_ = nullptr; + } + + void SetConstantInput(int value) { memset(input_, value, kInputBufferSize); } + + void CopyOutputToRef() { memcpy(output_ref_, output_, kOutputBufferSize); } + + void CheckGuardBlocks() { + for (int i = 0; i < kOutputBufferSize; ++i) { + if (IsIndexInBorder(i)) { + EXPECT_EQ(255, output_[i]); + } + } + } + + uint8_t *input() const { + const int offset = BorderTop() * kOuterBlockSize + BorderLeft(); + return input_ + offset; + } + + uint8_t *output() const { + const int offset = BorderTop() * kOuterBlockSize + BorderLeft(); + return output_ + offset; + } + + uint8_t *output_ref() const { + const int offset = BorderTop() * kOuterBlockSize + BorderLeft(); + return output_ref_ + offset; + } + + uint16_t lookup(uint8_t *list, int index) const { return list[index]; } + + void assign_val(uint8_t *list, int index, uint16_t val) const { + list[index] = (uint8_t)val; + } + + ConvolveScale2DFunc test_func_; + uint8_t *input_; + uint8_t *output_; + uint8_t *output_ref_; +}; + +TEST_P(ConvolveScale2DTest, DISABLED_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const filter = + (const InterpKernel *)av1_get_interp_filter_kernel(EIGHTTAP_REGULAR, + USE_8_TAPS); + const int kNumTests = 10000; + const int width = Width(); + const int height = Height(); + const int frac = 8; + const int step = 16; + aom_usec_timer timer; + + aom_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + test_func_(in, kInputStride, out, kOutputStride, filter, frac, step, frac, + step, width, height); + } + aom_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("convolve_scale_2d_%dx%d_%d: %d us\n", width, height, 8, elapsed_time); +} + +TEST_P(ConvolveScale2DTest, Correctness) { + uint8_t *const in = input(); + uint8_t *const out = output(); + uint8_t ref[kOutputStride * kMaxDimension]; + + ::libaom_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + const uint16_t r = prng.Rand8Extremes(); + assign_val(in, y * kInputStride + x, r); + } + } + + for (int subpel_search = USE_2_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) { + for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { + const InterpFilter filter = static_cast<InterpFilter>(filter_bank); + const InterpKernel *filters = + (const InterpKernel *)av1_get_interp_filter_kernel(filter, + subpel_search); + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + aom_scaled_2d_c(in, 
kInputStride, ref, kOutputStride, filters, frac, + step, frac, step, Width(), Height()); + API_REGISTER_STATE_CHECK( + test_func_(in, kInputStride, out, kOutputStride, filters, frac, + step, frac, step, Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "x == " << x << ", y == " << y << ", frac == " << frac + << ", step == " << step; + } + } + } + } + } + } +} + +INSTANTIATE_TEST_SUITE_P(C, ConvolveScale2DTest, + ::testing::Values(ALL_SIZES_64(aom_scaled_2d_c))); + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScale2DTest, + ::testing::Values(ALL_SIZES_64(aom_scaled_2d_neon))); +#endif // HAVE_NEON + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveScale2DTest, + ::testing::Values(ALL_SIZES_64(aom_scaled_2d_ssse3))); +#endif // HAVE_SSSE3 + } // namespace -- GitLab From 0ba83c722b09c2c5a173cd66f32ef7d1b7b893b4 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Tue, 18 Jun 2024 11:01:14 +0100 Subject: [PATCH 237/391] Move aom_scaled_2d_neon to aom_dsp/arm Move aom_scaled_2d_neon from av1/common/arm/ to aom_dsp/arm to be consistent with the rtcd definition and the SSSE3 implementation. Change-Id: I985bfb77b60163902e5689d8c45f2bc734b6f046 --- aom_dsp/aom_dsp.cmake | 1 + aom_dsp/arm/aom_scaled_convolve8_neon.c | 359 ++++++++++++++++++++++++ av1/common/arm/resize_neon.c | 341 +--------------------- 3 files changed, 372 insertions(+), 329 deletions(-) create mode 100644 aom_dsp/arm/aom_scaled_convolve8_neon.c diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index 85947c5a50..da49a6d972 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -109,6 +109,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c" "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_scaled_convolve8_neon.c" "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" diff --git a/aom_dsp/arm/aom_scaled_convolve8_neon.c b/aom_dsp/arm/aom_scaled_convolve8_neon.c new file mode 100644 index 0000000000..f81a06be98 --- /dev/null +++ b/aom_dsp/arm/aom_scaled_convolve8_neon.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); + sum = vmla_lane_s16(sum, s1, filter_lo, 1); + sum = vmla_lane_s16(sum, s2, filter_lo, 2); + sum = vmla_lane_s16(sum, s5, filter_hi, 1); + sum = vmla_lane_s16(sum, s6, filter_hi, 2); + sum = vmla_lane_s16(sum, s7, filter_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); + return vqrshrun_n_s16(sum, 7); +} + +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filter) { + int16x8_t ss0 = vreinterpretq_s16_u16(vmovl_u8(s[0])); + int16x8_t ss1 = vreinterpretq_s16_u16(vmovl_u8(s[1])); + int16x8_t ss2 = vreinterpretq_s16_u16(vmovl_u8(s[2])); + int16x8_t ss3 = vreinterpretq_s16_u16(vmovl_u8(s[3])); + int16x8_t ss4 = vreinterpretq_s16_u16(vmovl_u8(s[4])); + int16x8_t ss5 = vreinterpretq_s16_u16(vmovl_u8(s[5])); + int16x8_t ss6 = vreinterpretq_s16_u16(vmovl_u8(s[6])); + int16x8_t ss7 = vreinterpretq_s16_u16(vmovl_u8(s[7])); + + return convolve8_8(ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, filter); +} + +static INLINE void scaledconvolve_horiz_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + + src -= SUBPEL_TAPS / 2 - 1; + + y = h; + do { + int x_q4 = x0_q4; + x = 0; + do { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + int16x8_t ss[4]; + int16x4_t t[8], tt; + + load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); + transpose_elems_inplace_u8_8x4(&s[0], &s[1], &s[2], &s[3]); + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + t[0] = vget_low_s16(ss[0]); + t[1] = vget_low_s16(ss[1]); + t[2] = vget_low_s16(ss[2]); + t[3] = vget_low_s16(ss[3]); + t[4] = vget_high_s16(ss[0]); + t[5] = vget_high_s16(ss[1]); + t[6] = vget_high_s16(ss[2]); + t[7] = 
vget_high_s16(ss[3]); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], + filters); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + store_u8_4x1(&temp[4 * z], d); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + { + const uint8x8x4_t d4 = vld4_u8(temp); + store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0]); + store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1]); + store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2]); + store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3]); + } + x += 4; + } while (x < w); + + src += src_stride * 4; + dst += dst_stride * 4; + y -= 4; + } while (y > 0); +} + +static INLINE void scaledconvolve_horiz_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = (h + 7) & ~7; + + do { + int x_q4 = x0_q4; + x = 0; + do { + uint8x8_t d[8]; + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8]; + load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + d[0] = scale_filter_8(s, filters); + vst1_u8(&temp[8 * z], d[0]); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], + &d[6], &d[7]); + store_u8_8x8(dst + x, dst_stride, d[0], d[1], d[2], d[3], d[4], d[5], + d[6], d[7]); + x += 8; + } while (x < w); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static INLINE void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + int16x4_t t[8], tt; + + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); + t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); + t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); + t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); + t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); + t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); + t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); + t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); + d = 
vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + store_u8_4x1(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + d = scale_filter_8(s, filters); + vst1_u8(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int x, y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + x = 0; + do { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x16_t ss[8]; + uint8x8_t s[8], d[2]; + load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], + &ss[5], &ss[6], &ss[7]); + s[0] = vget_low_u8(ss[0]); + s[1] = vget_low_u8(ss[1]); + s[2] = vget_low_u8(ss[2]); + s[3] = vget_low_u8(ss[3]); + s[4] = vget_low_u8(ss[4]); + s[5] = vget_low_u8(ss[5]); + s[6] = vget_low_u8(ss[6]); + s[7] = vget_low_u8(ss[7]); + d[0] = scale_filter_8(s, filters); + + s[0] = vget_high_u8(ss[0]); + s[1] = vget_high_u8(ss[1]); + s[2] = vget_high_u8(ss[2]); + s[3] = vget_high_u8(ss[3]); + s[4] = vget_high_u8(ss[4]); + s[5] = vget_high_u8(ss[5]); + s[6] = vget_high_u8(ss[6]); + s[7] = vget_high_u8(ss[7]); + d[1] = scale_filter_8(s, filters); + vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); + src_y += 16; + x += 16; + } while (x < w); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. 
Since w and h are at most 16, the temp buffer is still + // big enough. + DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c index ae0da3d018..898bd5a54e 100644 --- a/av1/common/arm/resize_neon.c +++ b/av1/common/arm/resize_neon.c @@ -16,49 +16,13 @@ #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/resize.h" #include "config/av1_rtcd.h" -#include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" -static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, - const int16x8_t filter) { - const int16x4_t filter_lo = vget_low_s16(filter); - const int16x4_t filter_hi = vget_high_s16(filter); - - int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); - sum = vmla_lane_s16(sum, s1, filter_lo, 1); - sum = vmla_lane_s16(sum, s2, filter_lo, 2); - sum = vmla_lane_s16(sum, s5, filter_hi, 1); - sum = vmla_lane_s16(sum, s6, filter_hi, 2); - sum = vmla_lane_s16(sum, s7, filter_hi, 3); - sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); - sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); - return sum; -} - -static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - const int16x8_t filter) { +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); - int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); - sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); - sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); - sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); - sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); - sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); - sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); - sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); - return vqrshrun_n_s16(sum, 7); -} - -static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, - const int16x8_t filter) { int16x8_t ss0 = vreinterpretq_s16_u16(vmovl_u8(s[0])); int16x8_t ss1 = vreinterpretq_s16_u16(vmovl_u8(s[1])); int16x8_t ss2 = vreinterpretq_s16_u16(vmovl_u8(s[2])); @@ -68,7 +32,16 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, int16x8_t ss6 = vreinterpretq_s16_u16(vmovl_u8(s[6])); int16x8_t ss7 = vreinterpretq_s16_u16(vmovl_u8(s[7])); - return convolve8_8(ss0, ss1, 
ss2, ss3, ss4, ss5, ss6, ss7, filter); + int16x8_t sum = vmulq_lane_s16(ss0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, ss1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, ss2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, ss5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, ss6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, ss7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(ss3, filter_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(ss4, filter_hi, 0)); + + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, @@ -887,293 +860,3 @@ void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, aom_extend_frame_borders(dst, num_planes); } } - -static INLINE void scaledconvolve_horiz_w4( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const x_filters, - const int x0_q4, const int x_step_q4, const int w, const int h) { - DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); - int x, y, z; - - src -= SUBPEL_TAPS / 2 - 1; - - y = h; - do { - int x_q4 = x0_q4; - x = 0; - do { - // process 4 src_x steps - for (z = 0; z < 4; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - if (x_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - int16x8_t ss[4]; - int16x4_t t[8], tt; - - load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); - transpose_elems_inplace_u8_8x4(&s[0], &s[1], &s[2], &s[3]); - - ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); - ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); - ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); - ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); - t[0] = vget_low_s16(ss[0]); - t[1] = vget_low_s16(ss[1]); - t[2] = vget_low_s16(ss[2]); - t[3] = vget_low_s16(ss[3]); - t[4] = vget_high_s16(ss[0]); - t[5] = vget_high_s16(ss[1]); - t[6] = vget_high_s16(ss[2]); - t[7] = vget_high_s16(ss[3]); - - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], - filters); - d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - store_u8_4x1(&temp[4 * z], d); - } else { - int i; - for (i = 0; i < 4; ++i) { - temp[z * 4 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 4x4 filters values back to dst - { - const uint8x8x4_t d4 = vld4_u8(temp); - store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0]); - store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1]); - store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2]); - store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3]); - } - x += 4; - } while (x < w); - - src += src_stride * 4; - dst += dst_stride * 4; - y -= 4; - } while (y > 0); -} - -static INLINE void scaledconvolve_horiz_w8( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const x_filters, - const int x0_q4, const int x_step_q4, const int w, const int h) { - DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - // This function processes 8x8 areas. The intermediate height is not always - // a multiple of 8, so force it to be a multiple of 8 here. 
- y = (h + 7) & ~7; - - do { - int x_q4 = x0_q4; - x = 0; - do { - uint8x8_t d[8]; - // process 8 src_x steps - for (z = 0; z < 8; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - - if (x_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - uint8x8_t s[8]; - load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], - &s[5], &s[6], &s[7]); - transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], - &s[5], &s[6], &s[7]); - d[0] = scale_filter_8(s, filters); - vst1_u8(&temp[8 * z], d[0]); - } else { - int i; - for (i = 0; i < 8; ++i) { - temp[z * 8 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 8x8 filters values back to dst - load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], - &d[7]); - transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], - &d[6], &d[7]); - store_u8_8x8(dst + x, dst_stride, d[0], d[1], d[2], d[3], d[4], d[5], - d[6], d[7]); - x += 8; - } while (x < w); - - src += src_stride * 8; - dst += dst_stride * 8; - } while (y -= 8); -} - -static INLINE void scaledconvolve_vert_w4( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; - do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - - if (y_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - int16x4_t t[8], tt; - - load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], - &s[6], &s[7]); - t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); - t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); - t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); - t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); - t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); - t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); - t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); - t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); - - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); - d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - store_u8_4x1(dst, d); - } else { - memcpy(dst, &src_y[3 * src_stride], w); - } - - dst += dst_stride; - y_q4 += y_step_q4; - } while (--y); -} - -static INLINE void scaledconvolve_vert_w8( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; - do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - if (y_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], - &s[6], &s[7]); - d = scale_filter_8(s, filters); - vst1_u8(dst, d); - } else { - memcpy(dst, &src_y[3 * src_stride], w); - } - dst += dst_stride; - y_q4 += y_step_q4; - } while (--y); -} - -static INLINE void scaledconvolve_vert_w16( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const 
int h) { - int x, y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; - do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - if (y_q4 & SUBPEL_MASK) { - x = 0; - do { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x16_t ss[8]; - uint8x8_t s[8], d[2]; - load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], - &ss[5], &ss[6], &ss[7]); - s[0] = vget_low_u8(ss[0]); - s[1] = vget_low_u8(ss[1]); - s[2] = vget_low_u8(ss[2]); - s[3] = vget_low_u8(ss[3]); - s[4] = vget_low_u8(ss[4]); - s[5] = vget_low_u8(ss[5]); - s[6] = vget_low_u8(ss[6]); - s[7] = vget_low_u8(ss[7]); - d[0] = scale_filter_8(s, filters); - - s[0] = vget_high_u8(ss[0]); - s[1] = vget_high_u8(ss[1]); - s[2] = vget_high_u8(ss[2]); - s[3] = vget_high_u8(ss[3]); - s[4] = vget_high_u8(ss[4]); - s[5] = vget_high_u8(ss[5]); - s[6] = vget_high_u8(ss[6]); - s[7] = vget_high_u8(ss[7]); - d[1] = scale_filter_8(s, filters); - vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); - src_y += 16; - x += 16; - } while (x < w); - } else { - memcpy(dst, &src_y[3 * src_stride], w); - } - dst += dst_stride; - y_q4 += y_step_q4; - } while (--y); -} - -void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *filter, - int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - // --Require an additional 8 rows for the horiz_w8 transpose tail. - // When calling in frame scaling function, the smallest scaling factor is x1/4 - // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still - // big enough. 
- DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= 64); - assert(h <= 64); - assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); - assert(x_step_q4 <= 64); - - if (w >= 8) { - scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, filter, x0_q4, x_step_q4, w, - intermediate_height); - } else { - scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, filter, x0_q4, x_step_q4, w, - intermediate_height); - } - - if (w >= 16) { - scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } else if (w == 8) { - scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } else { - scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } -} -- GitLab From 98cbdb3e0cc28ce66bf5e332e7d1c57d4dbca281 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Tue, 11 Jun 2024 00:15:12 +0100 Subject: [PATCH 238/391] Refactor and optimize aom_scaled_2d_neon Tidy up the standard bitdepth Armv8.0 Neon implementation of aom_scaled_2d. Also halve the filter values (since they're all even) to avoid saturating arithmetic in convolution kernels. Change-Id: I6485e609bf667f4517dc480470eca8b1025ac278 --- aom_dsp/arm/aom_convolve8_neon.c | 41 -- aom_dsp/arm/aom_convolve8_neon.h | 44 +- aom_dsp/arm/aom_scaled_convolve8_neon.c | 516 +++++++++++------------- 3 files changed, 289 insertions(+), 312 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index 0928b93275..d2f13ff13e 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c @@ -26,47 +26,6 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, - const int16x8_t filter) { - const int16x4_t filter_lo = vget_low_s16(filter); - const int16x4_t filter_hi = vget_high_s16(filter); - - int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); - sum = vmla_lane_s16(sum, s1, filter_lo, 1); - sum = vmla_lane_s16(sum, s2, filter_lo, 2); - sum = vmla_lane_s16(sum, s3, filter_lo, 3); - sum = vmla_lane_s16(sum, s4, filter_hi, 0); - sum = vmla_lane_s16(sum, s5, filter_hi, 1); - sum = vmla_lane_s16(sum, s6, filter_hi, 2); - sum = vmla_lane_s16(sum, s7, filter_hi, 3); - - return sum; -} - -static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - const int16x8_t filter) { - const int16x4_t filter_lo = vget_low_s16(filter); - const int16x4_t filter_hi = vget_high_s16(filter); - - int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); - sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); - sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); - sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); - sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); - sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); - sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); - sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); - - // We halved the filter values so -1 from right shift. 
- return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, diff --git a/aom_dsp/arm/aom_convolve8_neon.h b/aom_dsp/arm/aom_convolve8_neon.h index 0b6e5245a4..d1384a76ef 100644 --- a/aom_dsp/arm/aom_convolve8_neon.h +++ b/aom_dsp/arm/aom_convolve8_neon.h @@ -14,8 +14,50 @@ #include <arm_neon.h> -#include "config/aom_config.h" +#include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" +#include "config/aom_config.h" + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); + sum = vmla_lane_s16(sum, s1, filter_lo, 1); + sum = vmla_lane_s16(sum, s2, filter_lo, 2); + sum = vmla_lane_s16(sum, s3, filter_lo, 3); + sum = vmla_lane_s16(sum, s4, filter_hi, 0); + sum = vmla_lane_s16(sum, s5, filter_hi, 1); + sum = vmla_lane_s16(sum, s6, filter_hi, 2); + sum = vmla_lane_s16(sum, s7, filter_hi, 3); + + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + + // We halved the filter values so -1 from right shift. 
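+  // Halved 8-tap filters sum to 1 << (FILTER_BITS - 1) rather than
+  // 1 << FILTER_BITS, so the narrower rounding shift keeps unit DC gain.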
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, diff --git a/aom_dsp/arm/aom_scaled_convolve8_neon.c b/aom_dsp/arm/aom_scaled_convolve8_neon.c index f81a06be98..3c11133b87 100644 --- a/aom_dsp/arm/aom_scaled_convolve8_neon.c +++ b/aom_dsp/arm/aom_scaled_convolve8_neon.c @@ -12,310 +12,294 @@ #include <arm_neon.h> #include <assert.h> +#include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "config/aom_dsp_rtcd.h" -static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, - const int16x8_t filter) { - const int16x4_t filter_lo = vget_low_s16(filter); - const int16x4_t filter_hi = vget_high_s16(filter); - - int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); - sum = vmla_lane_s16(sum, s1, filter_lo, 1); - sum = vmla_lane_s16(sum, s2, filter_lo, 2); - sum = vmla_lane_s16(sum, s5, filter_hi, 1); - sum = vmla_lane_s16(sum, s6, filter_hi, 2); - sum = vmla_lane_s16(sum, s7, filter_hi, 3); - sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); - sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); - return sum; -} - -static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - const int16x8_t filter) { - const int16x4_t filter_lo = vget_low_s16(filter); - const int16x4_t filter_hi = vget_high_s16(filter); - - int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); - sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); - sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); - sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); - sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); - sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); - sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); - sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); - return vqrshrun_n_s16(sum, 7); -} - -static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, - const int16x8_t filter) { - int16x8_t ss0 = vreinterpretq_s16_u16(vmovl_u8(s[0])); - int16x8_t ss1 = vreinterpretq_s16_u16(vmovl_u8(s[1])); - int16x8_t ss2 = vreinterpretq_s16_u16(vmovl_u8(s[2])); - int16x8_t ss3 = vreinterpretq_s16_u16(vmovl_u8(s[3])); - int16x8_t ss4 = vreinterpretq_s16_u16(vmovl_u8(s[4])); - int16x8_t ss5 = vreinterpretq_s16_u16(vmovl_u8(s[5])); - int16x8_t ss6 = vreinterpretq_s16_u16(vmovl_u8(s[6])); - int16x8_t ss7 = vreinterpretq_s16_u16(vmovl_u8(s[7])); - - return convolve8_8(ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, filter); -} - -static INLINE void scaledconvolve_horiz_w4( +static INLINE void scaled_convolve_horiz_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const x_filters, - const int x0_q4, const int x_step_q4, const int w, const int h) { - DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); - int x, y, z; - - src -= SUBPEL_TAPS / 2 - 1; + const ptrdiff_t dst_stride, const InterpKernel *const x_filter, + const int x0_q4, const int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - y = h; - do { - int x_q4 = x0_q4; - x = 0; + if (w == 4) { do { - // process 4 src_x steps - for (z = 0; z < 4; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + int x_q4 = x0_q4; + + // Process a 4x4 tile. 
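+      // Each loop iteration filters one column of 4 output pixels at the
+      // current sub-pixel phase; the tile is transposed back to row order
+      // once all 4 columns are done.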
+ for (int r = 0; r < 4; ++r) { + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - int16x8_t ss[4]; - int16x4_t t[8], tt; - - load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); - transpose_elems_inplace_u8_8x4(&s[0], &s[1], &s[2], &s[3]); - - ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); - ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); - ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); - ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); - t[0] = vget_low_s16(ss[0]); - t[1] = vget_low_s16(ss[1]); - t[2] = vget_low_s16(ss[2]); - t[3] = vget_low_s16(ss[3]); - t[4] = vget_high_s16(ss[0]); - t[5] = vget_high_s16(ss[1]); - t[6] = vget_high_s16(ss[2]); - t[7] = vget_high_s16(ss[3]); - - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], - filters); - d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - store_u8_4x1(&temp[4 * z], d); + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int16x8_t filter = + vshrq_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + // We halved the filter values so -1 from right shift. + uint8x8_t d0 = + vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8_4x1(&temp[4 * r], d0); } else { - int i; - for (i = 0; i < 4; ++i) { - temp[z * 4 + i] = src_x[i * src_stride + 3]; + // Memcpy for non-subpel locations. + s += SUBPEL_TAPS / 2 - 1; + + for (int c = 0; c < 4; ++c) { + temp[r * 4 + c] = s[c * src_stride]; } } x_q4 += x_step_q4; } - // transpose the 4x4 filters values back to dst - { - const uint8x8x4_t d4 = vld4_u8(temp); - store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0]); - store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1]); - store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2]); - store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3]); - } - x += 4; - } while (x < w); + // Transpose the 4x4 result tile and store. + uint8x8_t d01 = vld1_u8(temp + 0); + uint8x8_t d23 = vld1_u8(temp + 8); - src += src_stride * 4; - dst += dst_stride * 4; - y -= 4; - } while (y > 0); -} + transpose_elems_inplace_u8_4x4(&d01, &d23); -static INLINE void scaledconvolve_horiz_w8( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const x_filters, - const int x0_q4, const int x_step_q4, const int w, const int h) { - DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; + store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); - // This function processes 8x8 areas. The intermediate height is not always - // a multiple of 8, so force it to be a multiple of 8 here. 
- y = (h + 7) & ~7; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + return; + } + // w >= 8 do { int x_q4 = x0_q4; - x = 0; + uint8_t *d = dst; + int width = w; + do { - uint8x8_t d[8]; - // process 8 src_x steps - for (z = 0; z < 8; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + // Process an 8x8 tile. + for (int r = 0; r < 8; ++r) { + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - uint8x8_t s[8]; - load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], - &s[5], &s[6], &s[7]); - transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], - &s[5], &s[6], &s[7]); - d[0] = scale_filter_8(s, filters); - vst1_u8(&temp[8 * z], d[0]); + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int16x8_t filter = + vshrq_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(&temp[r * 8], d0); } else { - int i; - for (i = 0; i < 8; ++i) { - temp[z * 8 + i] = src_x[i * src_stride + 3]; + // Memcpy for non-subpel locations. + s += SUBPEL_TAPS / 2 - 1; + + for (int c = 0; c < 8; ++c) { + temp[r * 8 + c] = s[c * src_stride]; } } x_q4 += x_step_q4; } - // transpose the 8x8 filters values back to dst - load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], - &d[7]); - transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], - &d[6], &d[7]); - store_u8_8x8(dst + x, dst_stride, d[0], d[1], d[2], d[3], d[4], d[5], - d[6], d[7]); - x += 8; - } while (x < w); - - src += src_stride * 8; - dst += dst_stride * 8; - } while (y -= 8); -} + // Transpose the 8x8 result tile and store. 
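+      // The source rows were transposed before filtering, so this
+      // transpose restores dst row order.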
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); -static INLINE void scaledconvolve_vert_w4( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int y; - int y_q4 = y0_q4; + transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; - do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - if (y_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - int16x4_t t[8], tt; - - load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], - &s[6], &s[7]); - t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); - t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); - t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); - t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); - t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); - t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); - t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); - t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); - - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); - d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - store_u8_4x1(dst, d); - } else { - memcpy(dst, &src_y[3 * src_stride], w); - } + d += 8; + width -= 8; + } while (width != 0); - dst += dst_stride; - y_q4 += y_step_q4; - } while (--y); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); } -static INLINE void scaledconvolve_vert_w8( +static INLINE void scaled_convolve_vert_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int y; + const ptrdiff_t dst_stride, const InterpKernel *const y_filter, + const int y0_q4, const int y_step_q4, int w, int h) { int y_q4 = y0_q4; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; - do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - if (y_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], - &s[6], &s[7]); - d = scale_filter_8(s, filters); - vst1_u8(dst, d); - } else { - memcpy(dst, &src_y[3 * src_stride], w); - } - dst += dst_stride; - y_q4 += y_step_q4; - } while (--y); -} + if (w == 4) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. 
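+        // The taps are all even, so the halving shift loses no precision.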
+ const int16x8_t filter = + vshrq_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + + int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + // We halved the filter values so -1 from right shift. + uint8x8_t d0 = + vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8_4x1(dst, d0); + } else { + // Memcpy for non-subpel locations. + memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); + } -static INLINE void scaledconvolve_vert_w16( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int x, y; - int y_q4 = y0_q4; + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; + if (w == 8) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int16x8_t filter = + vshrq_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(dst, d0); + } else { + // Memcpy for non-subpel locations. 
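+      // A zero phase is the identity filter, so copy the centre source row.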
+ memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } + + // w >= 16 do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + if (y_q4 & SUBPEL_MASK) { - x = 0; do { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x16_t ss[8]; - uint8x8_t s[8], d[2]; - load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], - &ss[5], &ss[6], &ss[7]); - s[0] = vget_low_u8(ss[0]); - s[1] = vget_low_u8(ss[1]); - s[2] = vget_low_u8(ss[2]); - s[3] = vget_low_u8(ss[3]); - s[4] = vget_low_u8(ss[4]); - s[5] = vget_low_u8(ss[5]); - s[6] = vget_low_u8(ss[6]); - s[7] = vget_low_u8(ss[7]); - d[0] = scale_filter_8(s, filters); - - s[0] = vget_high_u8(ss[0]); - s[1] = vget_high_u8(ss[1]); - s[2] = vget_high_u8(ss[2]); - s[3] = vget_high_u8(ss[3]); - s[4] = vget_high_u8(ss[4]); - s[5] = vget_high_u8(ss[5]); - s[6] = vget_high_u8(ss[6]); - s[7] = vget_high_u8(ss[7]); - d[1] = scale_filter_8(s, filters); - vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); - src_y += 16; - x += 16; - } while (x < w); + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int16x8_t filter = + vshrq_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); + + uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); + s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); + s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); + s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4))); + s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5))); + s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6))); + s7[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7))); + + s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); + s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); + s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); + s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4))); + s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5))); + s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6))); + s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7))); + + uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], + s6[0], s7[0], filter); + uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], + s6[1], s7[1], filter); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); } else { - memcpy(dst, &src_y[3 * src_stride], w); + // Memcpy for non-subpel locations. + s += (SUBPEL_TAPS / 2 - 1) * src_stride; + + do { + uint8x16_t s0 = vld1q_u8(s); + vst1q_u8(d, s0); + s += 16; + d += 16; + width -= 16; + } while (width != 0); } - dst += dst_stride; + y_q4 += y_step_q4; - } while (--y); + dst += dst_stride; + } while (--h != 0); } void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. 
+ // Fixed size intermediate buffer, im_block, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): + // Deriving the maximum number of rows in the im_block buffer (135): // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). // --Largest block size is 64x64 pixels. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the @@ -327,33 +311,25 @@ void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // When calling in frame scaling function, the smallest scaling factor is x1/4 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still // big enough. - DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); - const int intermediate_height = + DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); + const int im_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + const ptrdiff_t im_stride = 64; assert(w <= 64); assert(h <= 64); assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); assert(x_step_q4 <= 64); - if (w >= 8) { - scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, filter, x0_q4, x_step_q4, w, - intermediate_height); - } else { - scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, filter, x0_q4, x_step_q4, w, - intermediate_height); - } + // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; + const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; - if (w >= 16) { - scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } else if (w == 8) { - scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } else { - scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } + scaled_convolve_horiz_neon(src - horiz_offset - vert_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, w, + im_height); + + scaled_convolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); } -- GitLab From a3f6185242f9a28f9e67ad99d2f80192fcc1ce74 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Sat, 15 Jun 2024 21:33:07 +0100 Subject: [PATCH 239/391] Add Arm Neon DotProd implementation of aom_scaled_2d Add an Armv8.4 DotProd implementation of aom_scaled_2d and associated tests. 
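
The SDOT instructions operate on signed 8-bit inputs, so the unsigned
pixel samples are biased down by 128 and the lost term is restored by
pre-loading the accumulator. A scalar model of that range transform is
sketched below; the function name is illustrative only, and it assumes
FILTER_BITS == 7 with the filter taps already halved (summing to 64):

  #include <stdint.h>

  // Model of one 8-tap convolution as computed by the SDOT kernels.
  static int32_t convolve8_sdot_model(const uint8_t *s,
                                      const int8_t *halved_filter) {
    // dot(s - 128, f) == dot(s, f) - 128 * sum(f), and sum(f) == 64,
    // so seeding the accumulator with 128 << (FILTER_BITS - 1), i.e.
    // 128 * 64, cancels the range transform exactly.
    int32_t acc = 128 << 6;
    for (int k = 0; k < 8; ++k) {
      acc += (s[k] - 128) * halved_filter[k];
    }
    // Equal to dot(s, f); the kernels then narrow with a rounding
    // shift by FILTER_BITS - 1 to produce the output pixel.
    return acc;
  }
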
Change-Id: Ia3267afc2c16655fda727d4efefd5e2c613db621 --- aom_dsp/aom_dsp.cmake | 3 +- aom_dsp/aom_dsp_rtcd_defs.pl | 2 +- .../arm/aom_scaled_convolve8_neon_dotprod.c | 359 ++++++++++++++++++ test/convolve_test.cc | 6 + 4 files changed, 368 insertions(+), 2 deletions(-) create mode 100644 aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index da49a6d972..c53624b95a 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -118,7 +118,8 @@ list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c") list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c") + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c" + "${AOM_ROOT}/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c") list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c") diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index ed81c58bab..421c8bd752 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -502,7 +502,7 @@ specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm ssse3/, "$avx2 specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/aom_scaled_2d ssse3 neon/; +specialize qw/aom_scaled_2d ssse3 neon neon_dotprod/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h"; diff --git a/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c new file mode 100644 index 0000000000..dd3431d883 --- /dev/null +++ b/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom_dsp/arm/aom_convolve8_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, + uint8x8_t s3, int8x8_t filter) { + int8x16_t filter_x2 = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); + int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); + + // Accumulate into 128 << (FILTER_BITS - 1) / 2 to account for range + // transform. 
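+  // Each output lane is later formed by pairwise-adding two accumulator
+  // lanes, so the bias is seeded at half strength here and sums to the
+  // full 128 * (1 << (FILTER_BITS - 1)) after the vpaddq.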
+ const int32x4_t acc = vdupq_n_s32((128 << (FILTER_BITS - 1)) / 2); + int32x4_t sum01 = vdotq_s32(acc, s01_128, filter_x2); + int32x4_t sum23 = vdotq_s32(acc, s23_128, filter_x2); + + int32x4_t sum0123 = vpaddq_s32(sum01, sum23); + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vdup_n_s16(0)); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, + uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, + uint8x8_t s6, uint8x8_t s7, + int8x8_t filter) { + int8x16_t filter_x2 = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + uint8x16_t s45 = vcombine_u8(s4, s5); + uint8x16_t s67 = vcombine_u8(s6, s7); + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); + int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); + int8x16_t s45_128 = vreinterpretq_s8_u8(vsubq_u8(s45, vdupq_n_u8(128))); + int8x16_t s67_128 = vreinterpretq_s8_u8(vsubq_u8(s67, vdupq_n_u8(128))); + + // Accumulate into 128 << (FILTER_BITS - 1) / 2 to account for range + // transform. + const int32x4_t acc = vdupq_n_s32((128 << (FILTER_BITS - 1)) / 2); + int32x4_t sum01 = vdotq_s32(acc, s01_128, filter_x2); + int32x4_t sum23 = vdotq_s32(acc, s23_128, filter_x2); + int32x4_t sum45 = vdotq_s32(acc, s45_128, filter_x2); + int32x4_t sum67 = vdotq_s32(acc, s67_128, filter_x2); + + int32x4_t sum0123 = vpaddq_s32(sum01, sum23); + int32x4_t sum4567 = vpaddq_s32(sum45, sum67); + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void scaled_convolve_horiz_neon_dotprod( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filter, + const int x0_q4, const int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + + if (w == 4) { + do { + int x_q4 = x0_q4; + + // Process a 4x4 tile. + for (int r = 0; r < 4; ++r) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); + + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_4_h(s0, s1, s2, s3, filter); + + store_u8_4x1(&temp[4 * r], d0); + + x_q4 += x_step_q4; + } + + // Transpose the 4x4 result tile and store. + uint8x8_t d01 = vld1_u8(temp + 0); + uint8x8_t d23 = vld1_u8(temp + 8); + + transpose_elems_inplace_u8_4x4(&d01, &d23); + + store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + return; + } + + // w >= 8 + do { + int x_q4 = x0_q4; + uint8_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; ++r) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. 
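+        // Halving also brings the phase-0 centre tap (128) into int8
+        // range for the narrowing shift and the 8-bit dot product.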
+ const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); + + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(&temp[r * 8], d0); + + x_q4 += x_step_q4; + } + + // Transpose the 8x8 result tile and store. + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); +} + +static INLINE uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, + uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, + uint8x8_t s6, uint8x8_t s7, + int8x8_t filter) { + uint8x16_t s01 = vcombine_u8(vzip1_u8(s0, s1), vdup_n_u8(0)); + uint8x16_t s23 = vcombine_u8(vzip1_u8(s2, s3), vdup_n_u8(0)); + uint8x16_t s45 = vcombine_u8(vzip1_u8(s4, s5), vdup_n_u8(0)); + uint8x16_t s67 = vcombine_u8(vzip1_u8(s6, s7), vdup_n_u8(0)); + + uint8x16_t s0123 = vreinterpretq_u8_u16( + vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))); + uint8x16_t s4567 = vreinterpretq_u8_u16( + vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))); + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t s0123_128 = vreinterpretq_s8_u8(vsubq_u8(s0123, vdupq_n_u8(128))); + int8x16_t s4567_128 = vreinterpretq_s8_u8(vsubq_u8(s4567, vdupq_n_u8(128))); + + // Accumulate into 128 << (FILTER_BITS - 1) to account for range transform. + int32x4_t sum = vdupq_n_s32(128 << (FILTER_BITS - 1)); + sum = vdotq_lane_s32(sum, s0123_128, filter, 0); + sum = vdotq_lane_s32(sum, s4567_128, filter, 1); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(vcombine_s16(vmovn_s32(sum), vdup_n_s16(0)), + FILTER_BITS - 1); +} + +static INLINE uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, + uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, + uint8x8_t s6, uint8x8_t s7, + int8x8_t filter) { + uint8x16_t s01 = + vzip1q_u8(vcombine_u8(s0, vdup_n_u8(0)), vcombine_u8(s1, vdup_n_u8(0))); + uint8x16_t s23 = + vzip1q_u8(vcombine_u8(s2, vdup_n_u8(0)), vcombine_u8(s3, vdup_n_u8(0))); + uint8x16_t s45 = + vzip1q_u8(vcombine_u8(s4, vdup_n_u8(0)), vcombine_u8(s5, vdup_n_u8(0))); + uint8x16_t s67 = + vzip1q_u8(vcombine_u8(s6, vdup_n_u8(0)), vcombine_u8(s7, vdup_n_u8(0))); + + uint8x16_t s0123[2] = { + vreinterpretq_u8_u16( + vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))), + vreinterpretq_u8_u16( + vzip2q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))) + }; + uint8x16_t s4567[2] = { + vreinterpretq_u8_u16( + vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))), + vreinterpretq_u8_u16( + vzip2q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))) + }; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t s0123_128[2] = { + vreinterpretq_s8_u8(vsubq_u8(s0123[0], vdupq_n_u8(128))), + vreinterpretq_s8_u8(vsubq_u8(s0123[1], vdupq_n_u8(128))) + }; + int8x16_t s4567_128[2] = { + vreinterpretq_s8_u8(vsubq_u8(s4567[0], vdupq_n_u8(128))), + vreinterpretq_s8_u8(vsubq_u8(s4567[1], vdupq_n_u8(128))) + }; + + // Accumulate into 128 << (FILTER_BITS - 1) to account for range transform. 
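+  // Unlike the horizontal kernels there is no pairwise add here, so the
+  // full bias is seeded once per accumulator chain.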
+ const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1)); + + int32x4_t sum0123 = vdotq_lane_s32(acc, s0123_128[0], filter, 0); + sum0123 = vdotq_lane_s32(sum0123, s4567_128[0], filter, 1); + + int32x4_t sum4567 = vdotq_lane_s32(acc, s0123_128[1], filter, 0); + sum4567 = vdotq_lane_s32(sum4567, s4567_128[1], filter, 1); + + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void scaled_convolve_vert_neon_dotprod( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filter, + const int y0_q4, const int y_step_q4, int w, int h) { + int y_q4 = y0_q4; + + if (w == 4) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); + + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d0 = convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + store_u8_4x1(dst, d0); + } else { + // Memcpy for non-subpel locations. + memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } + + // w >= 8 + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + + if (y_q4 & SUBPEL_MASK) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); + + do { + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d0 = convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } else { + // Memcpy for non-subpel locations. + s += (SUBPEL_TAPS / 2 - 1) * src_stride; + + do { + uint8x8_t s0 = vld1_u8(s); + vst1_u8(d, s0); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); +} + +void aom_scaled_2d_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Fixed size intermediate buffer, im_block, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the im_block buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. 
Since w and h are at most 16, the temp buffer is still + // big enough. + DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); + const int im_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + const ptrdiff_t im_stride = 64; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; + const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; + + scaled_convolve_horiz_neon_dotprod(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, filter, + x0_q4, x_step_q4, w, im_height); + + scaled_convolve_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); +} diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 1895ec5108..3ae2219adf 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1124,6 +1124,12 @@ INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScale2DTest, ::testing::Values(ALL_SIZES_64(aom_scaled_2d_neon))); #endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, ConvolveScale2DTest, + ::testing::Values(ALL_SIZES_64(aom_scaled_2d_neon_dotprod))); +#endif // HAVE_NEON_DOTPROD + #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveScale2DTest, ::testing::Values(ALL_SIZES_64(aom_scaled_2d_ssse3))); -- GitLab From afedaf9da5a13c372b8c7a645ab1bf18f80b56cd Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Mon, 17 Jun 2024 15:08:50 +0100 Subject: [PATCH 240/391] Add Arm Neon I8MM implementation of aom_scaled_2d Add an Armv8.6 I8MM implementation of aom_scaled_2d and associated tests. 
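
Unlike SDOT, the USDOT instruction multiplies unsigned 8-bit samples by
signed 8-bit filter taps directly, so no sample range transform or
accumulator bias is required. A scalar model is sketched below; the
function name is illustrative only, and the taps are assumed already
halved as in the Neon and DotProd versions:

  #include <stdint.h>

  // Model of one 8-tap convolution as computed by the USDOT kernels.
  static int32_t convolve8_usdot_model(const uint8_t *s,
                                       const int8_t *halved_filter) {
    int32_t acc = 0;  // No bias needed for the mixed-sign multiply.
    for (int k = 0; k < 8; ++k) {
      acc += s[k] * halved_filter[k];
    }
    return acc;  // Narrow with a rounding shift by FILTER_BITS - 1.
  }
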
Change-Id: I8ac85a7f53631d705e7fc0996520af86bf52ec8b --- aom_dsp/aom_dsp.cmake | 3 +- aom_dsp/aom_dsp_rtcd_defs.pl | 2 +- aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c | 324 +++++++++++++++++++ test/convolve_test.cc | 6 + 4 files changed, 333 insertions(+), 2 deletions(-) create mode 100644 aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index c53624b95a..1cf4a2efe3 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -122,7 +122,8 @@ list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD "${AOM_ROOT}/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c") list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c") + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c" + "${AOM_ROOT}/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c") if(CONFIG_AV1_HIGHBITDEPTH) list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 421c8bd752..1a68f1b7a6 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -502,7 +502,7 @@ specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm ssse3/, "$avx2 specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/aom_scaled_2d ssse3 neon neon_dotprod/; +specialize qw/aom_scaled_2d ssse3 neon neon_dotprod neon_i8mm/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h"; diff --git a/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c new file mode 100644 index 0000000000..6bf4396b07 --- /dev/null +++ b/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom_dsp/arm/aom_convolve8_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, + uint8x8_t s3, int8x8_t filter) { + int8x16_t filter_x2 = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + + int32x4_t sum01 = vusdotq_s32(vdupq_n_s32(0), s01, filter_x2); + int32x4_t sum23 = vusdotq_s32(vdupq_n_s32(0), s23, filter_x2); + + int32x4_t sum0123 = vpaddq_s32(sum01, sum23); + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vdup_n_s16(0)); + + // We halved the filter values so -1 from right shift. 
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, + uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, + uint8x8_t s6, uint8x8_t s7, + int8x8_t filter) { + int8x16_t filter_x2 = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + uint8x16_t s45 = vcombine_u8(s4, s5); + uint8x16_t s67 = vcombine_u8(s6, s7); + + int32x4_t sum01 = vusdotq_s32(vdupq_n_s32(0), s01, filter_x2); + int32x4_t sum23 = vusdotq_s32(vdupq_n_s32(0), s23, filter_x2); + int32x4_t sum45 = vusdotq_s32(vdupq_n_s32(0), s45, filter_x2); + int32x4_t sum67 = vusdotq_s32(vdupq_n_s32(0), s67, filter_x2); + + int32x4_t sum0123 = vpaddq_s32(sum01, sum23); + int32x4_t sum4567 = vpaddq_s32(sum45, sum67); + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void scaled_convolve_horiz_neon_i8mm( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filter, + const int x0_q4, const int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + + if (w == 4) { + do { + int x_q4 = x0_q4; + + // Process a 4x4 tile. + for (int r = 0; r < 4; ++r) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); + + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_4_h(s0, s1, s2, s3, filter); + + store_u8_4x1(&temp[4 * r], d0); + + x_q4 += x_step_q4; + } + + // Transpose the 4x4 result tile and store. + uint8x8_t d01 = vld1_u8(temp + 0); + uint8x8_t d23 = vld1_u8(temp + 8); + + transpose_elems_inplace_u8_4x4(&d01, &d23); + + store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + return; + } + + // w >= 8 + do { + int x_q4 = x0_q4; + uint8_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; ++r) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); + + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(&temp[r * 8], d0); + + x_q4 += x_step_q4; + } + + // Transpose the 8x8 result tile and store. 
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); +} + +static INLINE uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, + uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, + uint8x8_t s6, uint8x8_t s7, + int8x8_t filter) { + uint8x16_t s01 = vcombine_u8(vzip1_u8(s0, s1), vdup_n_u8(0)); + uint8x16_t s23 = vcombine_u8(vzip1_u8(s2, s3), vdup_n_u8(0)); + uint8x16_t s45 = vcombine_u8(vzip1_u8(s4, s5), vdup_n_u8(0)); + uint8x16_t s67 = vcombine_u8(vzip1_u8(s6, s7), vdup_n_u8(0)); + + uint8x16_t s0123 = vreinterpretq_u8_u16( + vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))); + uint8x16_t s4567 = vreinterpretq_u8_u16( + vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))); + + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0123, filter, 0); + sum = vusdotq_lane_s32(sum, s4567, filter, 1); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(vcombine_s16(vmovn_s32(sum), vdup_n_s16(0)), + FILTER_BITS - 1); +} + +static INLINE uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, + uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, + uint8x8_t s6, uint8x8_t s7, + int8x8_t filter) { + uint8x16_t s01 = + vzip1q_u8(vcombine_u8(s0, vdup_n_u8(0)), vcombine_u8(s1, vdup_n_u8(0))); + uint8x16_t s23 = + vzip1q_u8(vcombine_u8(s2, vdup_n_u8(0)), vcombine_u8(s3, vdup_n_u8(0))); + uint8x16_t s45 = + vzip1q_u8(vcombine_u8(s4, vdup_n_u8(0)), vcombine_u8(s5, vdup_n_u8(0))); + uint8x16_t s67 = + vzip1q_u8(vcombine_u8(s6, vdup_n_u8(0)), vcombine_u8(s7, vdup_n_u8(0))); + + uint8x16_t s0123[2] = { + vreinterpretq_u8_u16( + vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))), + vreinterpretq_u8_u16( + vzip2q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))) + }; + uint8x16_t s4567[2] = { + vreinterpretq_u8_u16( + vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))), + vreinterpretq_u8_u16( + vzip2q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))) + }; + + int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0123[0], filter, 0); + sum0123 = vusdotq_lane_s32(sum0123, s4567[0], filter, 1); + + int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0123[1], filter, 0); + sum4567 = vusdotq_lane_s32(sum4567, s4567[1], filter, 1); + + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void scaled_convolve_vert_neon_i8mm( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filter, + const int y0_q4, const int y_step_q4, int w, int h) { + int y_q4 = y0_q4; + + if (w == 4) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. 
+ const int8x8_t filter = + vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); + + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d0 = convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + store_u8_4x1(dst, d0); + } else { + // Memcpy for non-subpel locations. + memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } + + // w >= 8 + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + + if (y_q4 & SUBPEL_MASK) { + // Halve filter values (all even) to avoid the need for saturating + // arithmetic in convolution kernels. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); + + do { + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d0 = convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } else { + // Memcpy for non-subpel locations. + s += (SUBPEL_TAPS / 2 - 1) * src_stride; + + do { + uint8x8_t s0 = vld1_u8(s); + vst1_u8(d, s0); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); +} + +void aom_scaled_2d_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Fixed size intermediate buffer, im_block, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the im_block buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); + const int im_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + const ptrdiff_t im_stride = 64; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 + // lines post both horizontally and vertically. 
+ const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; + const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; + + scaled_convolve_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, + w, im_height); + + scaled_convolve_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); +} diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 3ae2219adf..9d3fa452b2 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1130,6 +1130,12 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ALL_SIZES_64(aom_scaled_2d_neon_dotprod))); #endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM +INSTANTIATE_TEST_SUITE_P( + NEON_I8MM, ConvolveScale2DTest, + ::testing::Values(ALL_SIZES_64(aom_scaled_2d_neon_i8mm))); +#endif // HAVE_NEON_I8MM + #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveScale2DTest, ::testing::Values(ALL_SIZES_64(aom_scaled_2d_ssse3))); -- GitLab From 869ca4eb6dcd6f22777c1d9a765582cbe57a3602 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 18 Jun 2024 15:53:58 -0700 Subject: [PATCH 241/391] cmake: rm subtract*.c w/CONFIG_AV1_ENCODER=0 Fixes a -Wmissing-prototypes warning. Bug: aomedia:3416 Change-Id: Ic1975d2efc104d48a638e606f1cfc32c2d0e1a60 --- aom_dsp/aom_dsp.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index 1cf4a2efe3..ff1c6de4aa 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -45,7 +45,6 @@ list(APPEND AOM_DSP_COMMON_SOURCES "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" - "${AOM_ROOT}/aom_dsp/subtract.c" "${AOM_ROOT}/aom_dsp/txfm_common.h" "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h") @@ -113,7 +112,6 @@ list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" - "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c" "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c") @@ -181,6 +179,7 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/quantize.h" "${AOM_ROOT}/aom_dsp/sad.c" "${AOM_ROOT}/aom_dsp/sad_av1.c" + "${AOM_ROOT}/aom_dsp/subtract.c" "${AOM_ROOT}/aom_dsp/sse.c" "${AOM_ROOT}/aom_dsp/ssim.c" "${AOM_ROOT}/aom_dsp/ssim.h" @@ -291,6 +290,7 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/sse_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c" "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_neon.c") -- GitLab From 49b4e03a80d896ddb827b3c7b51b5a2448dfc274 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 18 Jun 2024 16:16:15 -0700 Subject: [PATCH 242/391] cmake: rm fft* w/CONFIG_AV1_ENCODER=0 This fixes a -Wmissing-prototypes warning. 
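For context, the run of cmake patches that starts here all targets the same warning class. A minimal reproduction, not from the tree:

  /* foo.c, compiled with -Wmissing-prototypes */
  int square(int x) { return x * x; }
  /* warning: no previous prototype for 'square' */

The usual remedies are to declare the function in a header included by the defining file, to mark it static, or, as these patches do, to stop compiling the definition in configurations where the declaring header and every caller are compiled out.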
Bug: aomedia:3416 Change-Id: Ibd48bcc202416ec24029d815a7ec7bbd34fbcc19 --- aom_dsp/aom_dsp.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index ff1c6de4aa..05d0da1a19 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -29,8 +29,6 @@ list(APPEND AOM_DSP_COMMON_SOURCES "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" "${AOM_ROOT}/aom_dsp/entcode.c" "${AOM_ROOT}/aom_dsp/entcode.h" - "${AOM_ROOT}/aom_dsp/fft.c" - "${AOM_ROOT}/aom_dsp/fft_common.h" "${AOM_ROOT}/aom_dsp/grain_params.h" "${AOM_ROOT}/aom_dsp/intrapred.c" "${AOM_ROOT}/aom_dsp/intrapred_common.h" @@ -59,7 +57,6 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c" "${AOM_ROOT}/aom_dsp/x86/convolve.h" "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h" @@ -94,7 +91,6 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h" "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h" - "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c" "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c" @@ -166,6 +162,8 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/blk_sse_sum.c" "${AOM_ROOT}/aom_dsp/entenc.c" "${AOM_ROOT}/aom_dsp/entenc.h" + "${AOM_ROOT}/aom_dsp/fft.c" + "${AOM_ROOT}/aom_dsp/fft_common.h" "${AOM_ROOT}/aom_dsp/fwd_txfm.c" "${AOM_ROOT}/aom_dsp/grain_table.c" "${AOM_ROOT}/aom_dsp/grain_table.h" @@ -220,6 +218,7 @@ if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" @@ -240,6 +239,7 @@ if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" -- GitLab From 57c8c9c8727b42b318ed09c518ba37703f95d848 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 18 Jun 2024 17:02:25 -0700 Subject: [PATCH 243/391] cmake: rm highbd_adaptive_quantize_* w/CONFIG_REALTIME_ONLY=1 Fixes -Wmissing-prototypes warnings with this config. 
Bug: aomedia:3416
Change-Id: I0335415ad5218970f3975f9d8692cd3a7405163e
---
 aom_dsp/aom_dsp.cmake | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 05d0da1a19..ef50d4d38f 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -364,6 +364,7 @@ if(CONFIG_AV1_ENCODER)
   if(CONFIG_REALTIME_ONLY)
     list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_AVX2
          "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c"
+         "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c"
          "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
          "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c")

@@ -372,7 +373,8 @@ if(CONFIG_AV1_ENCODER)
          "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")

     list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2
-         "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c")
+         "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c"
+         "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c")

     list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON
          "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
-- 
GitLab

From 479f178c521868198f01953e7207334c92d9ef1b Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Tue, 18 Jun 2024 17:41:50 -0700
Subject: [PATCH 244/391] cmake: rm temporal_filter*.c w/CONFIG_REALTIME_ONLY=1

This fixes some -Wmissing-prototypes warnings.

Bug: aomedia:3416
Change-Id: Iad8597845e86bbe138acaf756e043cf0eb155f1c
---
 av1/av1.cmake | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/av1/av1.cmake b/av1/av1.cmake
index 1955255640..43af8f8450 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -530,15 +530,25 @@ if(CONFIG_INTERNAL_STATS)
 endif()

 if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE2
+       "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c"
+       "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c")

   list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE4_1
        "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")

   list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2
+       "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c"
        "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c"
+       "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
        "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")

   list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON
-       "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c")
+       "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c"
+       "${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c"
+       "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c")
+
+  list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD
+       "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon_dotprod.c")

   list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES
        "${AOM_ROOT}/av1/encoder/cnn.c"
-- 
GitLab

From a09d9740cc5fac1b0693928de75cab30398c6a7d Mon Sep 17 00:00:00 2001
From: Gerda Zsejke More <gerdazsejke.more@arm.com>
Date: Thu, 13 Jun 2024 10:54:33 +0300
Subject: [PATCH 245/391] Add Neon implementation for av1_convolve_horiz_rs

Add a Neon implementation of av1_convolve_horiz_rs and the corresponding
tests.
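The _qn values here are fixed-point source positions. A scalar sketch of the per-column stepping the new kernel performs is below; the constants are assumptions based on av1/common/resize.h (RS_SCALE_SUBPEL_BITS = 14, RS_SCALE_EXTRA_BITS = 8, UPSCALE_NORMATIVE_TAPS = 8), not restated from this patch:

  // Assumed constants, see note above.
  enum { kScaleBits = 14, kExtraBits = 8, kTaps = 8 };

  // For each output column, derive the source column and the filter phase
  // from the fixed-point accumulator, then step by x_step_qn.
  void walk_columns(int w, int x0_qn, int x_step_qn) {
    for (int x = 0, x_qn = x0_qn; x < w; ++x, x_qn += x_step_qn) {
      int src_col = x_qn >> kScaleBits;                            // sample index
      int phase = (x_qn & ((1 << kScaleBits) - 1)) >> kExtraBits;  // 0..63
      int filter_offset = kTaps * phase;  // row offset into the x_filter table
      (void)src_col;
      (void)filter_offset;
    }
  }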
Change-Id: Ia6d977ea90217546bca8d392b843e0bdb495895d --- av1/av1.cmake | 1 + av1/common/arm/av1_convolve_horiz_rs_neon.c | 180 ++++++++++++++++++++ av1/common/av1_rtcd_defs.pl | 2 +- test/av1_horz_only_frame_superres_test.cc | 5 + 4 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 av1/common/arm/av1_convolve_horiz_rs_neon.c diff --git a/av1/av1.cmake b/av1/av1.cmake index 43af8f8450..bed6ab9220 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -384,6 +384,7 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 "${AOM_ROOT}/av1/encoder/arm/hash_arm_crc32.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON + "${AOM_ROOT}/av1/common/arm/av1_convolve_horiz_rs_neon.c" "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h" diff --git a/av1/common/arm/av1_convolve_horiz_rs_neon.c b/av1/common/arm/av1_convolve_horiz_rs_neon.c new file mode 100644 index 0000000000..53353927a3 --- /dev/null +++ b/av1/common/arm/av1_convolve_horiz_rs_neon.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/resize.h" + +static INLINE uint8x8_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); + sum = vmla_lane_s16(sum, s1, filter_lo, 1); + sum = vmla_lane_s16(sum, s2, filter_lo, 2); + sum = vmla_lane_s16(sum, s5, filter_hi, 1); + sum = vmla_lane_s16(sum, s6, filter_hi, 2); + sum = vmla_lane_s16(sum, s7, filter_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); + + return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS); +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); + + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +void av1_convolve_horiz_rs_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int 
w, int h,
+                                const int16_t *x_filter, int x0_qn,
+                                int x_step_qn) {
+  if ((w == 4 && h % 4 != 0) || (w % 8 == 0 && h % 8 != 0) ||
+      (w != 4 && w % 8 != 0)) {
+    av1_convolve_horiz_rs_c(src, src_stride, dst, dst_stride, w, h, x_filter,
+                            x0_qn, x_step_qn);
+    return;
+  }
+
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+  if (w == 4) {
+    do {
+      int x_qn = x0_qn;
+
+      // Process a 4x4 tile.
+      for (int r = 0; r < 4; ++r) {
+        const uint8_t *const s = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+
+        const ptrdiff_t filter_offset =
+            UPSCALE_NORMATIVE_TAPS *
+            ((x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS);
+        const int16x8_t filter = vld1q_s16(x_filter + filter_offset);
+
+        uint8x8_t t0, t1, t2, t3;
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+
+        transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+        int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+        uint8x8_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+        store_u8_4x1(&temp[r * 4], d0);
+
+        x_qn += x_step_qn;
+      }
+
+      // Transpose the 4x4 result tile and store.
+      uint8x8_t d01 = vld1_u8(temp + 0);
+      uint8x8_t d23 = vld1_u8(temp + 8);
+
+      transpose_elems_inplace_u8_4x4(&d01, &d23);
+
+      store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01);
+      store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23);
+
+      dst += 4 * dst_stride;
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    do {
+      int x_qn = x0_qn;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        // Process an 8x8 tile.
+        for (int r = 0; r < 8; ++r) {
+          const uint8_t *const s = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+
+          const ptrdiff_t filter_offset =
+              UPSCALE_NORMATIVE_TAPS *
+              ((x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS);
+          const int16x8_t filter = vld1q_s16(x_filter + filter_offset);
+
+          uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+          transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
+                                 &t3, &t4, &t5, &t6, &t7);
+
+          int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+          int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+          int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+          int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+          int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+          int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+          int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+          vst1_u8(&temp[r * 8], d0);
+
+          x_qn += x_step_qn;
+        }
+
+        // Transpose the 8x8 result tile and store.
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + + } while (h > 0); + } +} diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index b982d86bcc..d08d2194d3 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -98,7 +98,7 @@ if ($opts{arch} eq "x86_64") { } add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn"; -specialize qw/av1_convolve_horiz_rs sse4_1/; +specialize qw/av1_convolve_horiz_rs sse4_1 neon/; if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd"; diff --git a/test/av1_horz_only_frame_superres_test.cc b/test/av1_horz_only_frame_superres_test.cc index a337211f64..d72ce1ea1b 100644 --- a/test/av1_horz_only_frame_superres_test.cc +++ b/test/av1_horz_only_frame_superres_test.cc @@ -301,6 +301,11 @@ TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); } INSTANTIATE_TEST_SUITE_P(C, LowBDConvolveHorizRSTest, ::testing::Values(av1_convolve_horiz_rs_c)); +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, LowBDConvolveHorizRSTest, + ::testing::Values(av1_convolve_horiz_rs_neon)); +#endif + #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P(SSE4_1, LowBDConvolveHorizRSTest, ::testing::Values(av1_convolve_horiz_rs_sse4_1)); -- GitLab From e357367005ffa0bfba4a33d689e3b6ce34995f1c Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 18 Jun 2024 16:49:19 -0700 Subject: [PATCH 246/391] inspect.c: fix -Wdangling-pointer warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Untested. examples/inspect.c|822 col 9| warning: storing the address of local variable 'ref_dec' in ‘img’ [-Wdangling-pointer=] Change-Id: I142e2d31c4c71410f71ed94806c45bf8a9739b47 --- examples/inspect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspect.c b/examples/inspect.c index dca274f1ea..9b2aceaecb 100644 --- a/examples/inspect.c +++ b/examples/inspect.c @@ -773,6 +773,7 @@ int have_frame = 0; const unsigned char *frame; const unsigned char *end_frame; size_t frame_size = 0; +struct av1_ref_frame ref_dec; EMSCRIPTEN_KEEPALIVE int read_frame(void) { @@ -801,7 +802,6 @@ int read_frame(void) { int got_any_frames = 0; aom_image_t *frame_img; - struct av1_ref_frame ref_dec; ref_dec.idx = adr.idx; // ref_dec.idx is the index to the reference buffer idx to AV1_GET_REFERENCE -- GitLab From 54b7027954b752e2de58abe2873cfdf8a17d4160 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 18 Jun 2024 16:50:33 -0700 Subject: [PATCH 247/391] inspect.c: make some functions static This fixes some -Wmissing-prototypes warnings. Since this is untested the unused function put_reference_frame() is left for reference. 
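Two distinct fixes are applied in the diff below: helpers used only inside inspect.c become static, while functions that must keep external linkage for the Emscripten/JavaScript side gain a self-prototype immediately before their definition. A condensed sketch of both, with EMSCRIPTEN_KEEPALIVE stubbed out so it compiles standalone and with hypothetical function names:

  #ifndef EMSCRIPTEN_KEEPALIVE
  #define EMSCRIPTEN_KEEPALIVE /* no-op outside emscripten builds */
  #endif

  static int internal_helper(int x) { return x + 1; } /* internal: static */

  EMSCRIPTEN_KEEPALIVE int exported_fn(void);  /* exported: self-prototype, */
  EMSCRIPTEN_KEEPALIVE int exported_fn(void) { /* then the definition.      */
    return internal_helper(41);
  }

Either form satisfies -Wmissing-prototypes: static functions are exempt, and the self-prototype puts a declaration in scope before the definition.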
Bug: aomedia:3416 Change-Id: I7df1162f82434f9adca472b30cc0e27e871e7a99 --- examples/inspect.c | 61 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/examples/inspect.c b/examples/inspect.c index 9b2aceaecb..28d3f74ebb 100644 --- a/examples/inspect.c +++ b/examples/inspect.c @@ -280,7 +280,8 @@ struct parm_offset parm_offsets[] = { }; int parm_count = sizeof(parm_offsets) / sizeof(parm_offsets[0]); -int convert_to_indices(char *str, int *indices, int maxCount, int *count) { +static int convert_to_indices(char *str, int *indices, int maxCount, + int *count) { *count = 0; do { char *comma = strchr(str, ','); @@ -307,7 +308,7 @@ AvxVideoReader *reader = NULL; const AvxVideoInfo *info = NULL; aom_image_t *img = NULL; -void on_frame_decoded_dump(char *json) { +static void on_frame_decoded_dump(char *json) { #ifdef __EMSCRIPTEN__ EM_ASM_({ Module.on_frame_decoded_json($0); }, json); #else @@ -317,7 +318,7 @@ void on_frame_decoded_dump(char *json) { // Writing out the JSON buffer using snprintf is very slow, especially when // compiled with emscripten, these functions speed things up quite a bit. -int put_str(char *buffer, const char *str) { +static int put_str(char *buffer, const char *str) { int i; for (i = 0; str[i] != '\0'; i++) { buffer[i] = str[i]; @@ -325,7 +326,7 @@ int put_str(char *buffer, const char *str) { return i; } -int put_str_with_escape(char *buffer, const char *str) { +static int put_str_with_escape(char *buffer, const char *str) { int i; int j = 0; for (i = 0; str[i] != '\0'; i++) { @@ -339,7 +340,7 @@ int put_str_with_escape(char *buffer, const char *str) { return j; } -int put_num(char *buffer, char prefix, int num, char suffix) { +static int put_num(char *buffer, char prefix, int num, char suffix) { int i = 0; char *buf = buffer; int is_neg = 0; @@ -376,7 +377,7 @@ int put_num(char *buffer, char prefix, int num, char suffix) { return i; } -int put_map(char *buffer, const map_entry *map) { +static int put_map(char *buffer, const map_entry *map) { char *buf = buffer; const map_entry *entry = map; while (entry->name != NULL) { @@ -392,7 +393,8 @@ int put_map(char *buffer, const map_entry *map) { return (int)(buf - buffer); } -int put_reference_frame(char *buffer) { +#if 0 +static int put_reference_frame(char *buffer) { const int mi_rows = frame_data.mi_rows; const int mi_cols = frame_data.mi_cols; char *buf = buffer; @@ -429,8 +431,9 @@ int put_reference_frame(char *buffer) { buf += put_str(buf, "],\n"); return (int)(buf - buffer); } +#endif -int put_motion_vectors(char *buffer) { +static int put_motion_vectors(char *buffer) { const int mi_rows = frame_data.mi_rows; const int mi_cols = frame_data.mi_cols; char *buf = buffer; @@ -469,7 +472,7 @@ int put_motion_vectors(char *buffer) { return (int)(buf - buffer); } -int put_combined(char *buffer) { +static int put_combined(char *buffer) { const int mi_rows = frame_data.mi_rows; const int mi_cols = frame_data.mi_cols; char *buf = buffer; @@ -501,8 +504,8 @@ int put_combined(char *buffer) { return (int)(buf - buffer); } -int put_block_info(char *buffer, const map_entry *map, const char *name, - size_t offset, int len) { +static int put_block_info(char *buffer, const map_entry *map, const char *name, + size_t offset, int len) { const int mi_rows = frame_data.mi_rows; const int mi_cols = frame_data.mi_cols; char *buf = buffer; @@ -568,7 +571,7 @@ int put_block_info(char *buffer, const map_entry *map, const char *name, } #if CONFIG_ACCOUNTING -int put_accounting(char *buffer) 
{ +static int put_accounting(char *buffer) { char *buf = buffer; int i; const Accounting *accounting = frame_data.accounting; @@ -610,7 +613,7 @@ int put_accounting(char *buffer) { int skip_non_transform = 0; -void inspect(void *pbi, void *data) { +static void inspect(void *pbi, void *data) { /* Fetch frame data. */ ifd_inspect(&frame_data, pbi, skip_non_transform); @@ -742,13 +745,15 @@ void inspect(void *pbi, void *data) { aom_free(buffer); } -void ifd_init_cb(void) { +static void ifd_init_cb(void) { aom_inspect_init ii; ii.inspect_cb = inspect; ii.inspect_ctx = NULL; aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii); } +EMSCRIPTEN_KEEPALIVE int open_file(char *file); + EMSCRIPTEN_KEEPALIVE int open_file(char *file) { if (file == NULL) { @@ -775,6 +780,8 @@ const unsigned char *end_frame; size_t frame_size = 0; struct av1_ref_frame ref_dec; +EMSCRIPTEN_KEEPALIVE int read_frame(void); + EMSCRIPTEN_KEEPALIVE int read_frame(void) { img = NULL; @@ -823,35 +830,55 @@ int read_frame(void) { return EXIT_SUCCESS; } +EMSCRIPTEN_KEEPALIVE const char *get_aom_codec_build_config(void); + EMSCRIPTEN_KEEPALIVE const char *get_aom_codec_build_config(void) { return aom_codec_build_config(); } +EMSCRIPTEN_KEEPALIVE int get_bit_depth(void); + EMSCRIPTEN_KEEPALIVE int get_bit_depth(void) { return img->bit_depth; } +EMSCRIPTEN_KEEPALIVE int get_bits_per_sample(void); + EMSCRIPTEN_KEEPALIVE int get_bits_per_sample(void) { return img->bps; } +EMSCRIPTEN_KEEPALIVE int get_image_format(void); + EMSCRIPTEN_KEEPALIVE int get_image_format(void) { return img->fmt; } +EMSCRIPTEN_KEEPALIVE unsigned char *get_plane(int plane); + EMSCRIPTEN_KEEPALIVE unsigned char *get_plane(int plane) { return img->planes[plane]; } +EMSCRIPTEN_KEEPALIVE int get_plane_stride(int plane); + EMSCRIPTEN_KEEPALIVE int get_plane_stride(int plane) { return img->stride[plane]; } +EMSCRIPTEN_KEEPALIVE int get_plane_width(int plane); + EMSCRIPTEN_KEEPALIVE int get_plane_width(int plane) { return aom_img_plane_width(img, plane); } +EMSCRIPTEN_KEEPALIVE int get_plane_height(int plane); + EMSCRIPTEN_KEEPALIVE int get_plane_height(int plane) { return aom_img_plane_height(img, plane); } +EMSCRIPTEN_KEEPALIVE int get_frame_width(void); + EMSCRIPTEN_KEEPALIVE int get_frame_width(void) { return info->frame_width; } +EMSCRIPTEN_KEEPALIVE int get_frame_height(void); + EMSCRIPTEN_KEEPALIVE int get_frame_height(void) { return info->frame_height; } @@ -950,14 +977,20 @@ int main(int argc, char **argv) { } } +EMSCRIPTEN_KEEPALIVE void quit(void); + EMSCRIPTEN_KEEPALIVE void quit(void) { if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); aom_video_reader_close(reader); } +EMSCRIPTEN_KEEPALIVE void set_layers(LayerType v); + EMSCRIPTEN_KEEPALIVE void set_layers(LayerType v) { layers = v; } +EMSCRIPTEN_KEEPALIVE void set_compress(int v); + EMSCRIPTEN_KEEPALIVE void set_compress(int v) { compress = v; } -- GitLab From 3076bbe33897ed332a12ca54d7addb3b9ce9d0ed Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 18 Jun 2024 17:43:42 -0700 Subject: [PATCH 248/391] x86/*: exclude 1:4/4:1 fns w/CONFIG_REALTIME_ONLY This fixes some -Wmissing-prototypes warnings. 
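"1:4/4:1" refers to the tall and wide rectangular block shapes (4x16, 16x4, 8x32, 32x8, 16x64, 64x16), which the realtime-only encoder never evaluates, so their kernels become dead, prototype-less definitions. A hypothetical kernel file showing the guard idiom; in the tree, CONFIG_REALTIME_ONLY comes from the generated config/aom_config.h and is hard-coded here only to keep the sketch self-contained:

  #define CONFIG_REALTIME_ONLY 1

  #if !CONFIG_REALTIME_ONLY
  /* 4:1 shape: compiled only in full builds, where rtcd declares it. */
  unsigned int sad16x4(const unsigned char *a, const unsigned char *b) {
    unsigned int acc = 0;
    for (int i = 0; i < 16 * 4; ++i)
      acc += (a[i] > b[i]) ? (unsigned)(a[i] - b[i]) : (unsigned)(b[i] - a[i]);
    return acc;
  }
  #endif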
Bug: aomedia:3416 Change-Id: I483d0e00ac42b7f9b87e81a5f2c6d5d56c6ea554 --- aom_dsp/x86/highbd_sad_avx2.c | 57 +++++++++++++++------- aom_dsp/x86/highbd_variance_sse2.c | 36 ++++++++++++++ aom_dsp/x86/masked_sad4d_ssse3.c | 3 ++ aom_dsp/x86/masked_sad_intrin_avx2.c | 6 +++ aom_dsp/x86/masked_sad_intrin_ssse3.c | 6 +++ aom_dsp/x86/masked_variance_intrin_ssse3.c | 6 +++ aom_dsp/x86/sad4d_avx2.c | 14 ++++-- aom_dsp/x86/variance_sse2.c | 4 +- 8 files changed, 108 insertions(+), 24 deletions(-) diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c index 8fb08b30ed..68bc928ecb 100644 --- a/aom_dsp/x86/highbd_sad_avx2.c +++ b/aom_dsp/x86/highbd_sad_avx2.c @@ -267,18 +267,14 @@ static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2( 2 * ref_stride); \ } -HIGHBD_SADMXN_AVX2(16, 4) HIGHBD_SADMXN_AVX2(16, 8) HIGHBD_SADMXN_AVX2(16, 16) HIGHBD_SADMXN_AVX2(16, 32) -HIGHBD_SADMXN_AVX2(16, 64) -HIGHBD_SADMXN_AVX2(32, 8) HIGHBD_SADMXN_AVX2(32, 16) HIGHBD_SADMXN_AVX2(32, 32) HIGHBD_SADMXN_AVX2(32, 64) -HIGHBD_SADMXN_AVX2(64, 16) HIGHBD_SADMXN_AVX2(64, 32) HIGHBD_SADMXN_AVX2(64, 64) HIGHBD_SADMXN_AVX2(64, 128) @@ -286,17 +282,21 @@ HIGHBD_SADMXN_AVX2(64, 128) HIGHBD_SADMXN_AVX2(128, 64) HIGHBD_SADMXN_AVX2(128, 128) +#if !CONFIG_REALTIME_ONLY +HIGHBD_SADMXN_AVX2(16, 4) +HIGHBD_SADMXN_AVX2(16, 64) +HIGHBD_SADMXN_AVX2(32, 8) +HIGHBD_SADMXN_AVX2(64, 16) +#endif // !CONFIG_REALTIME_ONLY + HIGHBD_SAD_SKIP_MXN_AVX2(16, 8) HIGHBD_SAD_SKIP_MXN_AVX2(16, 16) HIGHBD_SAD_SKIP_MXN_AVX2(16, 32) -HIGHBD_SAD_SKIP_MXN_AVX2(16, 64) -HIGHBD_SAD_SKIP_MXN_AVX2(32, 8) HIGHBD_SAD_SKIP_MXN_AVX2(32, 16) HIGHBD_SAD_SKIP_MXN_AVX2(32, 32) HIGHBD_SAD_SKIP_MXN_AVX2(32, 64) -HIGHBD_SAD_SKIP_MXN_AVX2(64, 16) HIGHBD_SAD_SKIP_MXN_AVX2(64, 32) HIGHBD_SAD_SKIP_MXN_AVX2(64, 64) HIGHBD_SAD_SKIP_MXN_AVX2(64, 128) @@ -304,6 +304,13 @@ HIGHBD_SAD_SKIP_MXN_AVX2(64, 128) HIGHBD_SAD_SKIP_MXN_AVX2(128, 64) HIGHBD_SAD_SKIP_MXN_AVX2(128, 128) +#if !CONFIG_REALTIME_ONLY +HIGHBD_SAD_SKIP_MXN_AVX2(16, 64) +HIGHBD_SAD_SKIP_MXN_AVX2(32, 8) +HIGHBD_SAD_SKIP_MXN_AVX2(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#if !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { @@ -315,6 +322,7 @@ unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride, return get_sad_from_mm256_epi32(&sad); } +#endif // !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, @@ -362,6 +370,7 @@ unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, return sum; } +#if !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { @@ -395,6 +404,7 @@ unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride, } return get_sad_from_mm256_epi32(&sad); } +#endif // !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, @@ -444,6 +454,7 @@ unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, return sum; } +#if !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { @@ -463,6 +474,7 @@ unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride, } return get_sad_from_mm256_epi32(&sad); } 
+#endif // !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, @@ -663,18 +675,14 @@ static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2( sad_array); \ } -HIGHBD_SAD_MXNX4D_AVX2(16, 4) HIGHBD_SAD_MXNX4D_AVX2(16, 8) HIGHBD_SAD_MXNX4D_AVX2(16, 16) HIGHBD_SAD_MXNX4D_AVX2(16, 32) -HIGHBD_SAD_MXNX4D_AVX2(16, 64) -HIGHBD_SAD_MXNX4D_AVX2(32, 8) HIGHBD_SAD_MXNX4D_AVX2(32, 16) HIGHBD_SAD_MXNX4D_AVX2(32, 32) HIGHBD_SAD_MXNX4D_AVX2(32, 64) -HIGHBD_SAD_MXNX4D_AVX2(64, 16) HIGHBD_SAD_MXNX4D_AVX2(64, 32) HIGHBD_SAD_MXNX4D_AVX2(64, 64) HIGHBD_SAD_MXNX4D_AVX2(64, 128) @@ -682,17 +690,21 @@ HIGHBD_SAD_MXNX4D_AVX2(64, 128) HIGHBD_SAD_MXNX4D_AVX2(128, 64) HIGHBD_SAD_MXNX4D_AVX2(128, 128) +#if !CONFIG_REALTIME_ONLY +HIGHBD_SAD_MXNX4D_AVX2(16, 4) +HIGHBD_SAD_MXNX4D_AVX2(16, 64) +HIGHBD_SAD_MXNX4D_AVX2(32, 8) +HIGHBD_SAD_MXNX4D_AVX2(64, 16) +#endif // !CONFIG_REALTIME_ONLY + HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 8) HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 16) HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 32) -HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64) -HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 8) HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 16) HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 32) HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 64) -HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16) HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 32) HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 64) HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128) @@ -700,21 +712,30 @@ HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128) HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64) HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128) -HIGHBD_SAD_MXNX3D_AVX2(16, 4) +#if !CONFIG_REALTIME_ONLY +HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 8) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16) +#endif // !CONFIG_REALTIME_ONLY + HIGHBD_SAD_MXNX3D_AVX2(16, 8) HIGHBD_SAD_MXNX3D_AVX2(16, 16) HIGHBD_SAD_MXNX3D_AVX2(16, 32) -HIGHBD_SAD_MXNX3D_AVX2(16, 64) -HIGHBD_SAD_MXNX3D_AVX2(32, 8) HIGHBD_SAD_MXNX3D_AVX2(32, 16) HIGHBD_SAD_MXNX3D_AVX2(32, 32) HIGHBD_SAD_MXNX3D_AVX2(32, 64) -HIGHBD_SAD_MXNX3D_AVX2(64, 16) HIGHBD_SAD_MXNX3D_AVX2(64, 32) HIGHBD_SAD_MXNX3D_AVX2(64, 64) HIGHBD_SAD_MXNX3D_AVX2(64, 128) HIGHBD_SAD_MXNX3D_AVX2(128, 64) HIGHBD_SAD_MXNX3D_AVX2(128, 128) + +#if !CONFIG_REALTIME_ONLY +HIGHBD_SAD_MXNX3D_AVX2(16, 4) +HIGHBD_SAD_MXNX3D_AVX2(16, 64) +HIGHBD_SAD_MXNX3D_AVX2(32, 8) +HIGHBD_SAD_MXNX3D_AVX2(64, 16) +#endif // !CONFIG_REALTIME_ONLY diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c index 676208bfcf..ca45c324fd 100644 --- a/aom_dsp/x86/highbd_variance_sse2.c +++ b/aom_dsp/x86/highbd_variance_sse2.c @@ -152,10 +152,13 @@ VAR_FN(16, 16, 16, 8) VAR_FN(16, 8, 8, 7) VAR_FN(8, 16, 8, 7) VAR_FN(8, 8, 8, 6) + +#if !CONFIG_REALTIME_ONLY VAR_FN(8, 32, 8, 8) VAR_FN(32, 8, 8, 8) VAR_FN(16, 64, 16, 10) VAR_FN(64, 16, 16, 10) +#endif // !CONFIG_REALTIME_ONLY #undef VAR_FN @@ -382,6 +385,23 @@ DECLS(sse2) return (var >= 0) ? 
(uint32_t)var : 0; \ } +#if CONFIG_REALTIME_ONLY +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int64_t)) \ + FN(8, 16, 8, 3, 4, opt, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) +#else // !CONFIG_REALTIME_ONLY #define FNS(opt) \ FN(128, 128, 16, 7, 7, opt, (int64_t)) \ FN(128, 64, 16, 7, 6, opt, (int64_t)) \ @@ -402,6 +422,7 @@ DECLS(sse2) FN(32, 8, 16, 5, 3, opt, (int64_t)) \ FN(16, 64, 16, 4, 6, opt, (int64_t)) \ FN(64, 16, 16, 6, 4, opt, (int64_t)) +#endif // CONFIG_REALTIME_ONLY FNS(sse2) @@ -549,6 +570,20 @@ DECLS(sse2) return (var >= 0) ? (uint32_t)var : 0; \ } +#if CONFIG_REALTIME_ONLY +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int64_t)) \ + FN(8, 16, 8, 3, 4, opt, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) +#else // !CONFIG_REALTIME_ONLY #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)) \ FN(64, 32, 16, 6, 5, opt, (int64_t)) \ @@ -566,6 +601,7 @@ DECLS(sse2) FN(32, 8, 16, 5, 3, opt, (int64_t)) \ FN(16, 64, 16, 4, 6, opt, (int64_t)) \ FN(64, 16, 16, 6, 4, opt, (int64_t)) +#endif // CONFIG_REALTIME_ONLY FNS(sse2) diff --git a/aom_dsp/x86/masked_sad4d_ssse3.c b/aom_dsp/x86/masked_sad4d_ssse3.c index d2181a5a97..08847c8154 100644 --- a/aom_dsp/x86/masked_sad4d_ssse3.c +++ b/aom_dsp/x86/masked_sad4d_ssse3.c @@ -260,9 +260,12 @@ MASKSAD8XN_SSSE3(8) MASKSAD8XN_SSSE3(4) MASKSAD4XN_SSSE3(8) MASKSAD4XN_SSSE3(4) + +#if !CONFIG_REALTIME_ONLY MASKSAD4XN_SSSE3(16) MASKSADMXN_SSSE3(16, 4) MASKSAD8XN_SSSE3(32) MASKSADMXN_SSSE3(32, 8) MASKSADMXN_SSSE3(16, 64) MASKSADMXN_SSSE3(64, 16) +#endif // !CONFIG_REALTIME_ONLY diff --git a/aom_dsp/x86/masked_sad_intrin_avx2.c b/aom_dsp/x86/masked_sad_intrin_avx2.c index 8800af7a46..a8097bf4a1 100644 --- a/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -192,12 +192,15 @@ MASKSADMXN_AVX2(64, 64) MASKSADMXN_AVX2(64, 128) MASKSADMXN_AVX2(128, 64) MASKSADMXN_AVX2(128, 128) + +#if !CONFIG_REALTIME_ONLY MASKSADMXN_AVX2(4, 16) MASKSADMXN_AVX2(16, 4) MASKSADMXN_AVX2(8, 32) MASKSADMXN_AVX2(32, 8) MASKSADMXN_AVX2(16, 64) MASKSADMXN_AVX2(64, 16) +#endif // !CONFIG_REALTIME_ONLY static INLINE unsigned int highbd_masked_sad8xh_avx2( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, @@ -381,9 +384,12 @@ HIGHBD_MASKSADMXN_AVX2(64, 64) HIGHBD_MASKSADMXN_AVX2(64, 128) HIGHBD_MASKSADMXN_AVX2(128, 64) HIGHBD_MASKSADMXN_AVX2(128, 128) + +#if !CONFIG_REALTIME_ONLY HIGHBD_MASKSADMXN_AVX2(4, 16) HIGHBD_MASKSADMXN_AVX2(16, 4) HIGHBD_MASKSADMXN_AVX2(8, 32) HIGHBD_MASKSADMXN_AVX2(32, 8) HIGHBD_MASKSADMXN_AVX2(16, 64) HIGHBD_MASKSADMXN_AVX2(64, 16) +#endif // !CONFIG_REALTIME_ONLY diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c index 0c75a8be92..ee56d33272 100644 --- 
a/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -84,12 +84,15 @@ MASKSAD8XN_SSSE3(8) MASKSAD8XN_SSSE3(4) MASKSAD4XN_SSSE3(8) MASKSAD4XN_SSSE3(4) + +#if !CONFIG_REALTIME_ONLY MASKSAD4XN_SSSE3(16) MASKSADMXN_SSSE3(16, 4) MASKSAD8XN_SSSE3(32) MASKSADMXN_SSSE3(32, 8) MASKSADMXN_SSSE3(16, 64) MASKSADMXN_SSSE3(64, 16) +#endif // !CONFIG_REALTIME_ONLY static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -275,12 +278,15 @@ HIGHBD_MASKSADMXN_SSSE3(8, 8) HIGHBD_MASKSADMXN_SSSE3(8, 4) HIGHBD_MASKSAD4XN_SSSE3(8) HIGHBD_MASKSAD4XN_SSSE3(4) + +#if !CONFIG_REALTIME_ONLY HIGHBD_MASKSAD4XN_SSSE3(16) HIGHBD_MASKSADMXN_SSSE3(16, 4) HIGHBD_MASKSADMXN_SSSE3(8, 32) HIGHBD_MASKSADMXN_SSSE3(32, 8) HIGHBD_MASKSADMXN_SSSE3(16, 64) HIGHBD_MASKSADMXN_SSSE3(64, 16) +#endif // !CONFIG_REALTIME_ONLY static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c index e23faef7ad..81c40cdfc0 100644 --- a/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -126,12 +126,15 @@ MASK_SUBPIX_VAR8XH_SSSE3(8) MASK_SUBPIX_VAR8XH_SSSE3(4) MASK_SUBPIX_VAR4XH_SSSE3(8) MASK_SUBPIX_VAR4XH_SSSE3(4) + +#if !CONFIG_REALTIME_ONLY MASK_SUBPIX_VAR4XH_SSSE3(16) MASK_SUBPIX_VAR_SSSE3(16, 4) MASK_SUBPIX_VAR8XH_SSSE3(32) MASK_SUBPIX_VAR_SSSE3(32, 8) MASK_SUBPIX_VAR_SSSE3(64, 16) MASK_SUBPIX_VAR_SSSE3(16, 64) +#endif // !CONFIG_REALTIME_ONLY static INLINE __m128i filter_block(const __m128i a, const __m128i b, const __m128i filter) { @@ -704,12 +707,15 @@ HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) + +#if !CONFIG_REALTIME_ONLY HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) +#endif // !CONFIG_REALTIME_ONLY static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, const __m128i filter) { diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c index 46fc1747b4..324b142232 100644 --- a/aom_dsp/x86/sad4d_avx2.c +++ b/aom_dsp/x86/sad4d_avx2.c @@ -142,12 +142,10 @@ static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2( aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res); \ } -SADMXN_AVX2(32, 8) SADMXN_AVX2(32, 16) SADMXN_AVX2(32, 32) SADMXN_AVX2(32, 64) -SADMXN_AVX2(64, 16) SADMXN_AVX2(64, 32) SADMXN_AVX2(64, 64) SADMXN_AVX2(64, 128) @@ -155,6 +153,11 @@ SADMXN_AVX2(64, 128) SADMXN_AVX2(128, 64) SADMXN_AVX2(128, 128) +#if !CONFIG_REALTIME_ONLY +SADMXN_AVX2(32, 8) +SADMXN_AVX2(64, 16) +#endif // !CONFIG_REALTIME_ONLY + #define SAD_SKIP_MXN_AVX2(m, n) \ void aom_sad_skip_##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], \ @@ -167,12 +170,10 @@ SADMXN_AVX2(128, 128) res[3] <<= 1; \ } -SAD_SKIP_MXN_AVX2(32, 8) SAD_SKIP_MXN_AVX2(32, 16) SAD_SKIP_MXN_AVX2(32, 32) SAD_SKIP_MXN_AVX2(32, 64) -SAD_SKIP_MXN_AVX2(64, 16) SAD_SKIP_MXN_AVX2(64, 32) SAD_SKIP_MXN_AVX2(64, 64) SAD_SKIP_MXN_AVX2(64, 128) @@ -180,6 +181,11 @@ SAD_SKIP_MXN_AVX2(64, 128) SAD_SKIP_MXN_AVX2(128, 64) SAD_SKIP_MXN_AVX2(128, 128) +#if !CONFIG_REALTIME_ONLY +SAD_SKIP_MXN_AVX2(32, 8) +SAD_SKIP_MXN_AVX2(64, 16) +#endif // !CONFIG_REALTIME_ONLY + static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int 
N, const uint8_t *src, int src_stride, const uint8_t *const ref[4], diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c index 5f3f899383..25143a23d9 100644 --- a/aom_dsp/x86/variance_sse2.c +++ b/aom_dsp/x86/variance_sse2.c @@ -321,7 +321,6 @@ void aom_get_var_sse_sum_16x16_dual_sse2(const uint8_t *src_ptr, int src_stride, AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128) AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128) -AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128) AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128) AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128) @@ -331,13 +330,14 @@ AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128) AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256) AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512) -AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256) AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512) AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024) #if !CONFIG_REALTIME_ONLY +AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128) AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128) AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256) +AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256) AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024) #endif -- GitLab From 5c197365b870443799eac6bfa5ffbed1ba30fa35 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 20 Jun 2024 17:30:11 -0700 Subject: [PATCH 249/391] SVT-AV1: quiet some unaligned load warnings This is a follow up to: dc7f14f570 AVX2: Fix a couple unaligned load warnings This quiets warnings of the form: third_party/SVT-AV1/convolve_avx2.h:1104:31: runtime error: load of misaligned address 0x00000c326966 for type 'int32_t' (aka 'int'), which requires 4 byte alignment third_party/SVT-AV1/EbMemory_SSE4_1.h:24:39: runtime error: load of misaligned address 0x00000a843536 for type 'int32_t' (aka 'int'), which requires 4 byte alignment Bug: b:300649160 Change-Id: I54950b2dfa0a0aa93c03a7c99ca21ec3120b510c --- third_party/SVT-AV1/EbMemory_SSE4_1.h | 5 +++-- third_party/SVT-AV1/README.libaom | 2 ++ third_party/SVT-AV1/convolve_avx2.h | 30 +++++++++++++-------------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/third_party/SVT-AV1/EbMemory_SSE4_1.h b/third_party/SVT-AV1/EbMemory_SSE4_1.h index d821d9a307..8c51673f12 100644 --- a/third_party/SVT-AV1/EbMemory_SSE4_1.h +++ b/third_party/SVT-AV1/EbMemory_SSE4_1.h @@ -18,11 +18,12 @@ #include "config/aom_config.h" #include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" static INLINE __m128i load8bit_4x2_sse4_1(const void *const src, const ptrdiff_t strideInByte) { - const __m128i s = _mm_cvtsi32_si128(*(int32_t *)((uint8_t *)src)); - return _mm_insert_epi32(s, *(int32_t *)((uint8_t *)src + strideInByte), 1); + const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); + return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + strideInByte), 1); } static INLINE __m128i load_u8_4x2_sse4_1(const uint8_t *const src, diff --git a/third_party/SVT-AV1/README.libaom b/third_party/SVT-AV1/README.libaom index ff365057eb..1dadd49d56 100644 --- a/third_party/SVT-AV1/README.libaom +++ b/third_party/SVT-AV1/README.libaom @@ -12,3 +12,5 @@ Only ported the functions pertinent to single reference convolves. All functions are made static inline to avoid function call overheads. References to some arrays are changed to libaom version when applicable. Some extra intrinsic functions are added to support missing block sizes. +Unaligned integer loads are changed to use functions from +aom_dsp/x86/mem_sse2.h. 
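For reference, the helper these hunks switch to is the standard memcpy-based unaligned load. aom_dsp/x86/mem_sse2.h defines it along the following lines; the exact body is paraphrased here, so treat the details as an assumption:

  #include <stdint.h>
  #include <string.h>

  /* Well-defined for any alignment, and compilers lower the memcpy to a
     single mov on x86. A direct *(int32_t *)p dereference is undefined
     behaviour for misaligned p, which is what UBSan flags above. */
  static inline int32_t loadu_int32(const void *src) {
    int32_t v;
    memcpy(&v, src, sizeof(v));
    return v;
  }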
diff --git a/third_party/SVT-AV1/convolve_avx2.h b/third_party/SVT-AV1/convolve_avx2.h index 923cabee7f..da7e7c091f 100644 --- a/third_party/SVT-AV1/convolve_avx2.h +++ b/third_party/SVT-AV1/convolve_avx2.h @@ -61,7 +61,7 @@ static INLINE void prepare_half_coeffs_2tap_ssse3( __m128i *const coeffs /* [1] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); - const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3)); + const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3)); // right shift all filter co-efficients by 1 to reduce the bits required. // This extra right shift will be taken care of at the end while rounding @@ -234,7 +234,7 @@ static INLINE void prepare_coeffs_2tap_sse2( const int16_t *filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); - const __m128i coeff = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3)); + const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3)); // coeffs 3 4 3 4 3 4 3 4 coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); @@ -293,7 +293,7 @@ static INLINE void prepare_coeffs_2tap_avx2( const int16_t *filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); - const __m128i coeff_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3)); + const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3)); const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); // coeffs 3 4 3 4 3 4 3 4 @@ -1101,9 +1101,9 @@ static INLINE __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src, __m128i s_32[2]) { __m128i s_128[2]; - s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + stride)); + s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride)); s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]); - s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride)); + s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride)); s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]); const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]); return convolve_2tap_ssse3(&ss, coeffs); @@ -1575,9 +1575,9 @@ static INLINE __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src, __m128i s_32[4], __m128i ss_128[2], const __m128i coeffs[2]) { - s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src + 3 * 2)); + s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2)); const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]); - s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src + 4 * 2)); + s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2)); const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]); ss_128[1] = _mm_unpacklo_epi16(src23, src34); const __m128i r = convolve16_4tap_sse2(ss_128, coeffs); @@ -1694,9 +1694,9 @@ static INLINE __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src, __m128i s_32[6], __m128i ss_128[3], const __m128i coeffs[3]) { - s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src + 5 * 2)); + s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2)); const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]); - s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src + 6 * 2)); + s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2)); const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]); ss_128[2] = _mm_unpacklo_epi16(src45, src56); const __m128i r = convolve16_6tap_sse2(ss_128, coeffs); @@ -1820,9 +1820,9 @@ static INLINE __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src, __m128i s_32[8], __m128i ss_128[4], const __m128i coeffs[4]) { - s_32[7] = _mm_cvtsi32_si128(*(int32_t *)(src + 7 * 2)); + s_32[7] = 
_mm_cvtsi32_si128(loadu_int32(src + 7 * 2)); const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]); - s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src + 8 * 2)); + s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2)); const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]); ss_128[3] = _mm_unpacklo_epi16(src67, src78); const __m128i r = convolve16_8tap_sse2(ss_128, coeffs); @@ -2040,7 +2040,7 @@ static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2( } else if (w == 4) { __m128i s_32[2]; - s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr); + s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr)); do { const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride, @@ -2190,13 +2190,13 @@ static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2( } else if (w == 4) { __m128i s_32[2]; - s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr); + s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr)); do { - s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + src_stride)); + s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride)); const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]); xx_storel_32(dst, d0); - s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride)); + s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride)); const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]); xx_storel_32(dst + dst_stride, d1); src_ptr += 2 * src_stride; -- GitLab From 34b0b9b2bb75ef9321bb9b3f193d4e453cdf6d95 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 18 Jun 2024 18:22:53 -0700 Subject: [PATCH 250/391] add missing CONFIG_AV1_HIGHBITDEPTH checks This fixes some -Wmissing-prototypes warnings. Bug: aomedia:3416 Change-Id: I86180d51b0bec015a2bd76bfa3f301ca3a47a40f --- aom_dsp/intrapred.c | 7 +++++++ aom_dsp/variance.c | 2 ++ av1/common/convolve.c | 2 ++ av1/common/reconinter.c | 2 ++ av1/encoder/pickrst.c | 8 ++++++++ 5 files changed, 21 insertions(+) diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c index 8a9dbae083..a1c48ccb63 100644 --- a/aom_dsp/intrapred.c +++ b/aom_dsp/intrapred.c @@ -360,6 +360,8 @@ void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride, #undef DC_MULTIPLIER_1X2 #undef DC_MULTIPLIER_1X4 +#if CONFIG_AV1_HIGHBITDEPTH + static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -704,6 +706,7 @@ void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride, #undef HIGHBD_DC_MULTIPLIER_1X2 #undef HIGHBD_DC_MULTIPLIER_1X4 +#endif // CONFIG_AV1_HIGHBITDEPTH // This serves as a wrapper function, so that all the prediction functions // can be unified and accessed as a pointer array. 
Note that the boundary @@ -715,12 +718,16 @@ void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride, type##_predictor(dst, stride, width, height, above, left); \ } +#if CONFIG_AV1_HIGHBITDEPTH #define intra_pred_highbd_sized(type, width, height) \ void aom_highbd_##type##_predictor_##width##x##height##_c( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \ } +#else // !CONFIG_AV1_HIGHBITDEPTH +#define intra_pred_highbd_sized(type, width, height) +#endif // CONFIG_AV1_HIGHBITDEPTH /* clang-format off */ #define intra_pred_rectangular(type) \ diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c index 27587cd1fb..7d64b6300f 100644 --- a/aom_dsp/variance.c +++ b/aom_dsp/variance.c @@ -1216,6 +1216,7 @@ uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w, return sum; } +#if CONFIG_AV1_HIGHBITDEPTH uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { uint64_t sum = 0; @@ -1227,3 +1228,4 @@ uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src, } return sum; } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/av1/common/convolve.c b/av1/common/convolve.c index 3203e31c8a..92c2e2a0df 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c @@ -48,6 +48,7 @@ void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, } } +#if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, @@ -72,6 +73,7 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, dst += dst_stride; } } +#endif // CONFIG_AV1_HIGHBITDEPTH void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c index d4d5f9848e..418ed75ba5 100644 --- a/av1/common/reconinter.c +++ b/av1/common/reconinter.c @@ -358,6 +358,7 @@ void av1_build_compound_diffwtd_mask_c(uint8_t *mask, } } +#if CONFIG_AV1_HIGHBITDEPTH static AOM_FORCE_INLINE void diffwtd_mask_highbd( uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0, int src0_stride, const uint16_t *src1, int src1_stride, int h, int w, @@ -437,6 +438,7 @@ void av1_build_compound_diffwtd_mask_highbd_c( default: assert(0); } } +#endif // CONFIG_AV1_HIGHBITDEPTH static AOM_INLINE void init_wedge_master_masks(void) { int i, j; diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c index d35848fa59..3bb6e6ba6a 100644 --- a/av1/encoder/pickrst.c +++ b/av1/encoder/pickrst.c @@ -498,6 +498,7 @@ static AOM_INLINE void calc_proj_params_r0_r1_c( C[1] /= size; } +#if CONFIG_AV1_HIGHBITDEPTH static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, @@ -526,6 +527,7 @@ static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c( C[0] /= size; C[1] /= size; } +#endif // CONFIG_AV1_HIGHBITDEPTH static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width, int height, int src_stride, @@ -550,6 +552,7 @@ static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width, C[0] /= size; } +#if CONFIG_AV1_HIGHBITDEPTH static AOM_INLINE void calc_proj_params_r0_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, 
int dat_stride, int32_t *flt0, int flt0_stride, @@ -570,6 +573,7 @@ static AOM_INLINE void calc_proj_params_r0_high_bd_c( H[0][0] /= size; C[0] /= size; } +#endif // CONFIG_AV1_HIGHBITDEPTH static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width, int height, int src_stride, @@ -594,6 +598,7 @@ static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width, C[1] /= size; } +#if CONFIG_AV1_HIGHBITDEPTH static AOM_INLINE void calc_proj_params_r1_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, @@ -614,6 +619,7 @@ static AOM_INLINE void calc_proj_params_r1_high_bd_c( H[1][1] /= size; C[1] /= size; } +#endif // CONFIG_AV1_HIGHBITDEPTH // The function calls 3 subfunctions for the following cases : // 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements @@ -639,6 +645,7 @@ void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, } } +#if CONFIG_AV1_HIGHBITDEPTH void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, @@ -658,6 +665,7 @@ void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, dat_stride, flt1, flt1_stride, H, C); } } +#endif // CONFIG_AV1_HIGHBITDEPTH static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width, int height, int src_stride, -- GitLab From 52ba0aed29cccb1560d9dad41eefeede0ef28565 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 25 Jun 2024 14:28:21 -0700 Subject: [PATCH 251/391] Fix integer overflows caused by max target bitrate Fix signed integer overflows caused by setting the target bitrate to the maximum value (2000000). Tested: Build libaom with -DSANITIZE=undefined and then run ./aomenc husky.yuv -o AV1_husky_2000000_10000000_10000000.webm --good \ --cpu-used=2 -v -t 0 -w 352 -h 288 --fps=10000000/10000000 \ --target-bitrate=2000000 --limit=150 --test-decode=fatal --passes=2 \ --lag-in-frames=35 --min-q=0 --max-q=63 --min-gf-interval=4 \ --max-gf-interval=32 --arnr-maxframes=7 --arnr-strength=5 \ --kf-max-dist=9999 --aq-mode=0 --undershoot-pct=100 \ --overshoot-pct=100 --bias-pct=50 Fix the signed integer overflows strictly in the order they are reported by UBSan, because a later overflow with a negative operand may have been caused by an earlier overflow. Bug: 349440066 Change-Id: Ideee322cc38f251461f67676258bb4bc4ab867f1 --- av1/encoder/pass2_strategy.c | 12 ++++++++++-- av1/encoder/ratectrl.c | 8 ++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c index 619b2de81a..a048901711 100644 --- a/av1/encoder/pass2_strategy.c +++ b/av1/encoder/pass2_strategy.c @@ -946,7 +946,10 @@ static void allocate_gf_group_bits(GF_GROUP *gf_group, case ARF_UPDATE: case INTNL_ARF_UPDATE: arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]]; - gf_group->bit_allocation[idx] = base_frame_bits + arf_extra_bits; + gf_group->bit_allocation[idx] = + (base_frame_bits > INT_MAX - arf_extra_bits) + ? INT_MAX + : (base_frame_bits + arf_extra_bits); break; case INTNL_OVERLAY_UPDATE: case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break; @@ -4136,7 +4139,12 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) { } // Target vs actual bits for this arf group. 
- twopass->rolling_arf_group_target_bits += rc->base_frame_target; + if (twopass->rolling_arf_group_target_bits > + INT_MAX - rc->base_frame_target) { + twopass->rolling_arf_group_target_bits = INT_MAX; + } else { + twopass->rolling_arf_group_target_bits += rc->base_frame_target; + } twopass->rolling_arf_group_actual_bits += rc->projected_frame_size; // Calculate the pct rc error. diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index a34ce78757..cd73772b46 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -2229,8 +2229,8 @@ void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target, const int tolerance = (int)AOMMAX( 100, ((int64_t)cpi->sf.hl_sf.recode_tolerance * frame_target) / 100); *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0); - *frame_over_shoot_limit = - AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth); + *frame_over_shoot_limit = (int)AOMMIN((int64_t)frame_target + tolerance, + cpi->rc.max_frame_bandwidth); } } @@ -2355,9 +2355,9 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { cm->width, cm->height)); if (current_frame->frame_type != KEY_FRAME) { p_rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64( - p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2); + (int64_t)p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2); p_rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64( - p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2); + (int64_t)p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2); } // Actual bits spent -- GitLab From 9759ad181d0b67450c8ed25fcbca9a20da5508e5 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 25 Jun 2024 16:22:51 -0700 Subject: [PATCH 252/391] Fix unsigned int overflow in init_rate_histogram() Tested: Build libaom with -DSANITIZE=integer and then run ./aomenc husky.yuv -o AV1_husky_2000000_1000000_1000000.webm --good \ --cpu-used=2 -v -t 0 -w 352 -h 288 --fps=1000000/1000000 \ --target-bitrate=2000000 --limit=150 --test-decode=fatal --passes=2 \ --lag-in-frames=35 --min-q=0 --max-q=63 --min-gf-interval=4 \ --max-gf-interval=32 --arnr-maxframes=7 --arnr-strength=5 \ --kf-max-dist=9999 --aq-mode=0 --undershoot-pct=100 \ --overshoot-pct=100 --bias-pct=50 This unsigned integer overflow seems to be caused by g_timebase.num=1000000. Bug: 349440066 Change-Id: I0339674d6283b8875087ba6dc6564b79ffeb090b --- stats/rate_hist.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/stats/rate_hist.c b/stats/rate_hist.c index 1b4f2ee001..32872465fe 100644 --- a/stats/rate_hist.c +++ b/stats/rate_hist.c @@ -12,10 +12,11 @@ #include "stats/rate_hist.h" #include <assert.h> -#include <stdlib.h> +#include <math.h> #include <limits.h> +#include <stdint.h> #include <stdio.h> -#include <math.h> +#include <stdlib.h> #define RATE_BINS 100 #define HIST_BAR_MAX 40 @@ -48,7 +49,8 @@ struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg, // Determine the number of samples in the buffer. 
Use the file's framerate // to determine the number of frames in rc_buf_sz milliseconds, with an // adjustment (5/4) to account for alt-refs - hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000; + hist->samples = + (int)((int64_t)cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000); // prevent division by zero if (hist->samples == 0) hist->samples = 1; -- GitLab From 76c95cfe2519c1718786f62a610054094ba64d06 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 24 Jun 2024 16:47:41 -0700 Subject: [PATCH 253/391] Mark several arrays as static const Change-Id: I381c6c06a255ec4d8ea46baf1a36bc61db8f5c52 --- apps/aomenc.c | 130 ++++++++++++++++++++++++++------------------------ 1 file changed, 68 insertions(+), 62 deletions(-) diff --git a/apps/aomenc.c b/apps/aomenc.c index 555c6d8839..00461e09a1 100644 --- a/apps/aomenc.c +++ b/apps/aomenc.c @@ -244,33 +244,35 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, AV1E_SET_RATE_DISTRIBUTION_INFO, 0 }; -const arg_def_t *main_args[] = { &g_av1_codec_arg_defs.help, - &g_av1_codec_arg_defs.use_cfg, - &g_av1_codec_arg_defs.debugmode, - &g_av1_codec_arg_defs.outputfile, - &g_av1_codec_arg_defs.codecarg, - &g_av1_codec_arg_defs.passes, - &g_av1_codec_arg_defs.pass_arg, - &g_av1_codec_arg_defs.fpf_name, - &g_av1_codec_arg_defs.limit, - &g_av1_codec_arg_defs.skip, - &g_av1_codec_arg_defs.good_dl, - &g_av1_codec_arg_defs.rt_dl, - &g_av1_codec_arg_defs.ai_dl, - &g_av1_codec_arg_defs.quietarg, - &g_av1_codec_arg_defs.verbosearg, - &g_av1_codec_arg_defs.psnrarg, - &g_av1_codec_arg_defs.use_webm, - &g_av1_codec_arg_defs.use_ivf, - &g_av1_codec_arg_defs.use_obu, - &g_av1_codec_arg_defs.q_hist_n, - &g_av1_codec_arg_defs.rate_hist_n, - &g_av1_codec_arg_defs.disable_warnings, - &g_av1_codec_arg_defs.disable_warning_prompt, - &g_av1_codec_arg_defs.recontest, - NULL }; - -const arg_def_t *global_args[] = { +static const arg_def_t *const main_args[] = { + &g_av1_codec_arg_defs.help, + &g_av1_codec_arg_defs.use_cfg, + &g_av1_codec_arg_defs.debugmode, + &g_av1_codec_arg_defs.outputfile, + &g_av1_codec_arg_defs.codecarg, + &g_av1_codec_arg_defs.passes, + &g_av1_codec_arg_defs.pass_arg, + &g_av1_codec_arg_defs.fpf_name, + &g_av1_codec_arg_defs.limit, + &g_av1_codec_arg_defs.skip, + &g_av1_codec_arg_defs.good_dl, + &g_av1_codec_arg_defs.rt_dl, + &g_av1_codec_arg_defs.ai_dl, + &g_av1_codec_arg_defs.quietarg, + &g_av1_codec_arg_defs.verbosearg, + &g_av1_codec_arg_defs.psnrarg, + &g_av1_codec_arg_defs.use_webm, + &g_av1_codec_arg_defs.use_ivf, + &g_av1_codec_arg_defs.use_obu, + &g_av1_codec_arg_defs.q_hist_n, + &g_av1_codec_arg_defs.rate_hist_n, + &g_av1_codec_arg_defs.disable_warnings, + &g_av1_codec_arg_defs.disable_warning_prompt, + &g_av1_codec_arg_defs.recontest, + NULL +}; + +static const arg_def_t *const global_args[] = { &g_av1_codec_arg_defs.use_nv12, &g_av1_codec_arg_defs.use_yv12, &g_av1_codec_arg_defs.use_i420, @@ -300,40 +302,44 @@ const arg_def_t *global_args[] = { NULL }; -const arg_def_t *rc_args[] = { &g_av1_codec_arg_defs.dropframe_thresh, - &g_av1_codec_arg_defs.resize_mode, - &g_av1_codec_arg_defs.resize_denominator, - &g_av1_codec_arg_defs.resize_kf_denominator, - &g_av1_codec_arg_defs.superres_mode, - &g_av1_codec_arg_defs.superres_denominator, - &g_av1_codec_arg_defs.superres_kf_denominator, - &g_av1_codec_arg_defs.superres_qthresh, - &g_av1_codec_arg_defs.superres_kf_qthresh, - &g_av1_codec_arg_defs.end_usage, - &g_av1_codec_arg_defs.target_bitrate, - &g_av1_codec_arg_defs.min_quantizer, - 
&g_av1_codec_arg_defs.max_quantizer, - &g_av1_codec_arg_defs.undershoot_pct, - &g_av1_codec_arg_defs.overshoot_pct, - &g_av1_codec_arg_defs.buf_sz, - &g_av1_codec_arg_defs.buf_initial_sz, - &g_av1_codec_arg_defs.buf_optimal_sz, - &g_av1_codec_arg_defs.bias_pct, - &g_av1_codec_arg_defs.minsection_pct, - &g_av1_codec_arg_defs.maxsection_pct, - NULL }; - -const arg_def_t *kf_args[] = { &g_av1_codec_arg_defs.fwd_kf_enabled, - &g_av1_codec_arg_defs.kf_min_dist, - &g_av1_codec_arg_defs.kf_max_dist, - &g_av1_codec_arg_defs.kf_disabled, - &g_av1_codec_arg_defs.sframe_dist, - &g_av1_codec_arg_defs.sframe_mode, - NULL }; +static const arg_def_t *const rc_args[] = { + &g_av1_codec_arg_defs.dropframe_thresh, + &g_av1_codec_arg_defs.resize_mode, + &g_av1_codec_arg_defs.resize_denominator, + &g_av1_codec_arg_defs.resize_kf_denominator, + &g_av1_codec_arg_defs.superres_mode, + &g_av1_codec_arg_defs.superres_denominator, + &g_av1_codec_arg_defs.superres_kf_denominator, + &g_av1_codec_arg_defs.superres_qthresh, + &g_av1_codec_arg_defs.superres_kf_qthresh, + &g_av1_codec_arg_defs.end_usage, + &g_av1_codec_arg_defs.target_bitrate, + &g_av1_codec_arg_defs.min_quantizer, + &g_av1_codec_arg_defs.max_quantizer, + &g_av1_codec_arg_defs.undershoot_pct, + &g_av1_codec_arg_defs.overshoot_pct, + &g_av1_codec_arg_defs.buf_sz, + &g_av1_codec_arg_defs.buf_initial_sz, + &g_av1_codec_arg_defs.buf_optimal_sz, + &g_av1_codec_arg_defs.bias_pct, + &g_av1_codec_arg_defs.minsection_pct, + &g_av1_codec_arg_defs.maxsection_pct, + NULL +}; + +static const arg_def_t *const kf_args[] = { + &g_av1_codec_arg_defs.fwd_kf_enabled, + &g_av1_codec_arg_defs.kf_min_dist, + &g_av1_codec_arg_defs.kf_max_dist, + &g_av1_codec_arg_defs.kf_disabled, + &g_av1_codec_arg_defs.sframe_dist, + &g_av1_codec_arg_defs.sframe_mode, + NULL +}; // TODO(bohanli): Currently all options are supported by the key & value API. // Consider removing the control ID usages? -const arg_def_t *av1_ctrl_args[] = { +static const arg_def_t *const av1_ctrl_args[] = { &g_av1_codec_arg_defs.cpu_used_av1, &g_av1_codec_arg_defs.auto_altref, &g_av1_codec_arg_defs.sharpness, @@ -451,7 +457,7 @@ const arg_def_t *av1_ctrl_args[] = { NULL, }; -const arg_def_t *av1_key_val_args[] = { +static const arg_def_t *const av1_key_val_args[] = { &g_av1_codec_arg_defs.passes, &g_av1_codec_arg_defs.two_pass_output, &g_av1_codec_arg_defs.second_pass_log, @@ -463,7 +469,7 @@ const arg_def_t *av1_key_val_args[] = { NULL, }; -static const arg_def_t *no_args[] = { NULL }; +static const arg_def_t *const no_args[] = { NULL }; static void show_help(FILE *fout, int shorthelp) { fprintf(fout, "Usage: %s <options> -o dst_filename src_filename\n", @@ -935,8 +941,8 @@ static int parse_stream_params(struct AvxEncoderConfig *global, struct stream_state *stream, char **argv) { char **argi, **argj; struct arg arg; - static const arg_def_t **ctrl_args = no_args; - static const arg_def_t **key_val_args = no_args; + static const arg_def_t *const *ctrl_args = no_args; + static const arg_def_t *const *key_val_args = no_args; static const int *ctrl_args_map = NULL; struct stream_config *config = &stream->config; int eos_mark_found = 0; -- GitLab From 1e9c7a2416003d79c92e90aacf19a087c2ef4a38 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Thu, 13 Jun 2024 11:53:21 -0700 Subject: [PATCH 254/391] Add encoder control for setting column/row tiling If the control is enabled (set to 1): the number of column and row tiles is set, based on number of input threads and resolution. 
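For example (working through the set_auto_tiles() logic below), with 4 threads at 1280x720 the tile count comes out to min(ceil(width * height / 128^2), 32, threads) = 4, i.e. tiles_log2 = 2, which is split as 2 tile columns x 2 tile rows since width >= height.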
Add SVC unittest with 4 threads with new control. Bug:b/349189136 Change-Id: I8e39fe1938d2e7470a1327254c3db02998ce75b7 --- aom/aomcx.h | 11 +++++ apps/aomenc.c | 1 + av1/arg_defs.c | 2 + av1/arg_defs.h | 1 + av1/av1_cx_iface.c | 90 ++++++++++++++++++++++++++++++++++--- av1/encoder/encoder.h | 1 + examples/svc_encoder_rtc.cc | 5 +-- test/datarate_test.h | 10 ++++- test/svc_datarate_test.cc | 43 +++++++++++++++++- 9 files changed, 150 insertions(+), 14 deletions(-) diff --git a/aom/aomcx.h b/aom/aomcx.h index 002b5d37d0..835eab84de 100644 --- a/aom/aomcx.h +++ b/aom/aomcx.h @@ -1539,6 +1539,14 @@ enum aome_enc_control_id { */ AV1E_SET_SVC_FRAME_DROP_MODE = 165, + /*!\brief Codec control to set auto tiling, unsigned int parameter. + * Value of 1 means encoder will set number of tile_columns and tile_rows, + * based on the number of threads and resolution. This will override any + * settings set via SET_TILE_COLUMNS/ROWS. If the value is 0 no change is + * done, the previous setting (if any) for tile_columns/rows is preserved. + */ + AV1E_SET_AUTO_TILES = 166, + // Any new encoder control IDs should be added above. // Maximum allowed encoder control ID is 229. // No encoder control ID should be added below. @@ -2196,6 +2204,9 @@ AOM_CTRL_USE_TYPE(AV1E_SET_SVC_FRAME_DROP_MODE, unsigned int) AOM_CTRL_USE_TYPE(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, int) #define AOM_CTRL_AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR +AOM_CTRL_USE_TYPE(AV1E_SET_AUTO_TILES, unsigned int) +#define AOM_CTRL_AV1E_SET_AUTO_TILES + /*!\endcond */ /*! @} - end defgroup aom_encoder */ #ifdef __cplusplus diff --git a/apps/aomenc.c b/apps/aomenc.c index 00461e09a1..0bfce0ae1a 100644 --- a/apps/aomenc.c +++ b/apps/aomenc.c @@ -466,6 +466,7 @@ static const arg_def_t *const av1_key_val_args[] = { &g_av1_codec_arg_defs.sb_qp_sweep, &g_av1_codec_arg_defs.dist_metric, &g_av1_codec_arg_defs.kf_max_pyr_height, + &g_av1_codec_arg_defs.auto_tiles, NULL, }; diff --git a/av1/arg_defs.c b/av1/arg_defs.c index d0f6814e5e..7aef17c86e 100644 --- a/av1/arg_defs.c +++ b/av1/arg_defs.c @@ -315,6 +315,8 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"), .tile_rows = ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2"), + .auto_tiles = ARG_DEF(NULL, "auto-tiles", 1, + "Enable auto tiles (0: false (default), 1: true)"), .enable_tpl_model = ARG_DEF(NULL, "enable-tpl-model", 1, "RDO based on frame temporal dependency " "(0: off, 1: backward source based); " diff --git a/av1/arg_defs.h b/av1/arg_defs.h index b7e8440f15..a206036f46 100644 --- a/av1/arg_defs.h +++ b/av1/arg_defs.h @@ -124,6 +124,7 @@ typedef struct av1_codec_arg_definitions { arg_def_t fpmtarg; arg_def_t tile_cols; arg_def_t tile_rows; + arg_def_t auto_tiles; arg_def_t enable_tpl_model; arg_def_t enable_keyframe_filtering; arg_def_t tile_width; diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index f6ef0ca4e3..a03fed9fc2 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <limits.h> +#include <math.h> #include <stdbool.h> #include <stdint.h> #include <stdlib.h> @@ -57,6 +58,7 @@ struct av1_extracfg { unsigned int fp_mt; unsigned int tile_columns; // log2 number of tile columns unsigned int tile_rows; // log2 number of tile rows + unsigned int auto_tiles; unsigned int enable_tpl_model; unsigned int enable_keyframe_filtering; unsigned int arnr_max_frames; @@ -237,6 +239,7 @@ static const struct av1_extracfg default_extra_cfg = { 0, // fp_mt 0, // tile_columns 0, // tile_rows + 0, // auto_tiles 0, // enable_tpl_model 1, // enable_keyframe_filtering 7, // arnr_max_frames @@ -388,6 +391,7 @@ static const struct av1_extracfg default_extra_cfg = { 0, // fp_mt 0, // tile_columns 0, // tile_rows + 0, // auto_tiles 1, // enable_tpl_model 1, // enable_keyframe_filtering 7, // arnr_max_frames @@ -751,6 +755,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, tile_columns, 6); RANGE_CHECK_HI(extra_cfg, tile_rows, 6); + RANGE_CHECK_HI(extra_cfg, auto_tiles, 1); RANGE_CHECK_HI(cfg, monochrome, 1); @@ -986,6 +991,40 @@ static void disable_superres(SuperResCfg *const superres_cfg) { superres_cfg->superres_kf_qthresh = 255; } +static void set_auto_tiles(TileConfig *const tile_cfg, unsigned int width, + unsigned int height, unsigned int threads) { + int tile_cols_log2 = 0; + int tile_rows_log2 = 0; + if (threads < 2) return; + // Avoid small tiles because they are particularly bad for coding. + // Use no more tiles than the number of threads. Aim for one tile per + // thread. Using more than one thread inside one tile could be less + // efficient. Using more tiles than the number of threads would result + // in a compression penalty without much benefit. + const uint32_t kMinTileArea = 128 * 128; + const uint32_t kMaxTiles = 32; + uint32_t frame_area = width * height; + uint32_t tiles = (frame_area + kMinTileArea - 1) / kMinTileArea; + if (tiles > kMaxTiles) { + tiles = kMaxTiles; + } + if (tiles > threads) { + tiles = threads; + } + int tiles_log2 = (int)log2(tiles); + // If the frame width is equal or greater than the height, use more tile + // columns than tile rows. + if (width >= height) { + tile_cols_log2 = (tiles_log2 + 1) / 2; + tile_rows_log2 = tiles_log2 - tile_cols_log2; + } else { + tile_rows_log2 = (tiles_log2 + 1) / 2; + tile_cols_log2 = tiles_log2 - tile_rows_log2; + } + tile_cfg->tile_columns = tile_cols_log2; + tile_cfg->tile_rows = tile_rows_log2; +} + static void update_default_encoder_config(const cfg_options_t *cfg, struct av1_extracfg *extra_cfg) { extra_cfg->enable_cdef = (cfg->disable_cdef == 0) ? 1 : 0; @@ -1326,8 +1365,14 @@ static void set_encoder_config(AV1EncoderConfig *oxcf, tile_cfg->enable_large_scale_tile = cfg->large_scale_tile; tile_cfg->enable_single_tile_decoding = (tile_cfg->enable_large_scale_tile) ? 
extra_cfg->single_tile_decoding : 0; - tile_cfg->tile_columns = extra_cfg->tile_columns; - tile_cfg->tile_rows = extra_cfg->tile_rows; + if (extra_cfg->auto_tiles) { + set_auto_tiles(tile_cfg, cfg->g_w, cfg->g_h, cfg->g_threads); + extra_cfg->tile_columns = tile_cfg->tile_columns; + extra_cfg->tile_rows = tile_cfg->tile_rows; + } else { + tile_cfg->tile_columns = extra_cfg->tile_columns; + tile_cfg->tile_rows = extra_cfg->tile_rows; + } tile_cfg->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS); tile_cfg->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS); for (int i = 0; i < tile_cfg->tile_width_count; i++) { @@ -1688,18 +1733,28 @@ static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx, va_list args) { - unsigned int tile_columns = CAST(AV1E_SET_TILE_COLUMNS, args); - if (tile_columns == ctx->extra_cfg.tile_columns) return AOM_CODEC_OK; + // If the control AUTO_TILES is used (set to 1) then don't override + // the tile_columns set via the AUTO_TILES control. + if (ctx->extra_cfg.auto_tiles) { + ERROR("AUTO_TILES is set so AV1E_SET_TILE_COLUMNS should not be called."); + } struct av1_extracfg extra_cfg = ctx->extra_cfg; + unsigned int tile_columns = CAST(AV1E_SET_TILE_COLUMNS, args); + if (tile_columns == extra_cfg.tile_columns) return AOM_CODEC_OK; extra_cfg.tile_columns = tile_columns; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx, va_list args) { - unsigned int tile_rows = CAST(AV1E_SET_TILE_ROWS, args); - if (tile_rows == ctx->extra_cfg.tile_rows) return AOM_CODEC_OK; + // If the control AUTO_TILES is used (set to 1) then don't override + // the tile_rows set via the AUTO_TILES control. 
+ if (ctx->extra_cfg.auto_tiles) { + ERROR("AUTO_TILES is set so AV1E_SET_TILE_ROWS should not be called."); + } struct av1_extracfg extra_cfg = ctx->extra_cfg; + unsigned int tile_rows = CAST(AV1E_SET_TILE_ROWS, args); + if (tile_rows == extra_cfg.tile_rows) return AOM_CODEC_OK; extra_cfg.tile_rows = tile_rows; return update_extra_cfg(ctx, &extra_cfg); } @@ -2637,6 +2692,15 @@ static aom_codec_err_t ctrl_set_svc_frame_drop_mode(aom_codec_alg_priv_t *ctx, return AOM_CODEC_OK; } +static aom_codec_err_t ctrl_set_auto_tiles(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int auto_tiles = CAST(AV1E_SET_AUTO_TILES, args); + if (auto_tiles == ctx->extra_cfg.auto_tiles) return AOM_CODEC_OK; + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.auto_tiles = auto_tiles; + return update_extra_cfg(ctx, &extra_cfg); +} + #if !CONFIG_REALTIME_ONLY static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, STATS_BUFFER_CTX *stats_buf_context, @@ -3916,9 +3980,22 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_cols, argv, err_string)) { extra_cfg.tile_columns = arg_parse_uint_helper(&arg, err_string); + if (extra_cfg.auto_tiles) { + snprintf(err_string, ARG_ERR_MSG_MAX_LEN, + "Cannot set tile-cols because auto-tiles is already set."); + err = AOM_CODEC_INVALID_PARAM; + } } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_rows, argv, err_string)) { extra_cfg.tile_rows = arg_parse_uint_helper(&arg, err_string); + if (extra_cfg.auto_tiles) { + snprintf(err_string, ARG_ERR_MSG_MAX_LEN, + "Cannot set tile-rows because auto-tiles is already set."); + err = AOM_CODEC_INVALID_PARAM; + } + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_tiles, argv, + err_string)) { + extra_cfg.auto_tiles = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_tpl_model, argv, err_string)) { extra_cfg.enable_tpl_model = arg_parse_uint_helper(&arg, err_string); @@ -4504,6 +4581,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_BITRATE_ONE_PASS_CBR, ctrl_set_bitrate_one_pass_cbr }, { AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, ctrl_set_max_consec_frame_drop_cbr }, { AV1E_SET_SVC_FRAME_DROP_MODE, ctrl_set_svc_frame_drop_mode }, + { AV1E_SET_AUTO_TILES, ctrl_set_auto_tiles }, // Getters { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h index 5966da7381..38fe8a6fa3 100644 --- a/av1/encoder/encoder.h +++ b/av1/encoder/encoder.h @@ -3790,6 +3790,7 @@ AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi, int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data, AV1_PRIMARY *const ppi, int *ref_buffers_used_map); + /*!\endcond */ /*!\brief Obtain the raw frame data diff --git a/examples/svc_encoder_rtc.cc b/examples/svc_encoder_rtc.cc index 58f103f119..c9ff20cdf3 100644 --- a/examples/svc_encoder_rtc.cc +++ b/examples/svc_encoder_rtc.cc @@ -1689,10 +1689,7 @@ int main(int argc, const char **argv) { aom_codec_control(&codec, AV1E_SET_ENABLE_FILTER_INTRA, 0); aom_codec_control(&codec, AV1E_SET_INTRA_DEFAULT_TX_ONLY, 1); - if (cfg.g_threads > 1) { - aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, - (unsigned int)log2(cfg.g_threads)); - } + aom_codec_control(&codec, AV1E_SET_AUTO_TILES, 1); aom_codec_control(&codec, AV1E_SET_TUNE_CONTENT, app_input.tune_content); if (app_input.tune_content == AOM_CONTENT_SCREEN) { diff --git a/test/datarate_test.h 
b/test/datarate_test.h index 24ce832a6b..af35dff8dc 100644 --- a/test/datarate_test.h +++ b/test/datarate_test.h @@ -44,6 +44,7 @@ class DatarateTest : public ::libaom_test::EncoderTest { denoiser_offon_period_ = -1; tile_columns_ = 0; tile_rows_ = 0; + auto_tiles_ = false; screen_mode_ = false; max_perc_spike_ = 1.0; max_perc_spike_high_ = 1.0; @@ -63,8 +64,12 @@ class DatarateTest : public ::libaom_test::EncoderTest { if (video->frame() == 0) { encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); - encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); - encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); + if (auto_tiles_) { + encoder->Control(AV1E_SET_AUTO_TILES, 1); + } else { + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); + encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); + } encoder->Control(AV1E_SET_ROW_MT, 1); if (cfg_.g_usage == AOM_USAGE_REALTIME) { encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); @@ -207,6 +212,7 @@ class DatarateTest : public ::libaom_test::EncoderTest { bool speed_change_test_; int tile_columns_; int tile_rows_; + bool auto_tiles_; bool screen_mode_; double max_perc_spike_; double max_perc_spike_high_; diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 2284ce524c..2a540e1e2b 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -118,8 +118,12 @@ class DatarateTestSVC encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0); encoder->Control(AV1E_SET_DELTAQ_MODE, 0); if (cfg_.g_threads > 1) { - encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); - encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); + if (auto_tiles_) { + encoder->Control(AV1E_SET_AUTO_TILES, 1); + } else { + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); + encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); + } encoder->Control(AV1E_SET_ROW_MT, 1); } if (screen_mode_) { @@ -1672,6 +1676,36 @@ class DatarateTestSVC } } + virtual void BasicRateTargetingSVC2TL1SLHDMultiThread4AutoTilesTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + cfg_.g_threads = 4; + + ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + auto_tiles_ = 1; + number_temporal_layers_ = 2; + number_spatial_layers_ = 1; + target_layer_bitrate_[0] = 60 * cfg_.rc_target_bitrate / 100; + target_layer_bitrate_[1] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45) + << " The datarate for the file is greater than target by too much!"; + } + } + virtual void BasicRateTargetingSVC3TL3SLHDMultiThread4Test() { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; @@ -2541,6 +2575,11 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLHDMultiThread4) { BasicRateTargetingSVC2TL1SLHDMultiThread4Test(); } +// Check basic rate targeting for CBR, for 1 spatial, 2 temporal layers, +// for 4 threads, row-mt enabled, and auto_tiling enabled. 
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLHDMultiThread4AutoTiles) { + BasicRateTargetingSVC2TL1SLHDMultiThread4AutoTilesTest(); +} // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers, // for 4 threads, 2 tile_columns, 2 tiles_rows, row-mt enabled. TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiThread4) { -- GitLab From 5a46d2961fb233c8f099a7bc18a7a54c8883813b Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 25 Jun 2024 13:55:56 -0700 Subject: [PATCH 255/391] disable av1_{highbd_,}dr_prediction_z2_neon w/armv7 These two functions cause a SIGBUS. Bug: aomedia:349428506, b:345667979, b:347825582 Change-Id: I9a25c224cf61fbf1053a05e0e4a0c830751b1fd4 --- aom_dsp/arm/highbd_intrapred_neon.c | 3 ++ aom_dsp/arm/intrapred_neon.c | 3 ++ av1/common/av1_rtcd_defs.pl | 14 +++++++-- build/cmake/rtcd.pl | 6 ++-- test/dr_prediction_test.cc | 44 +++++++++++++++++++++++++++++ 5 files changed, 66 insertions(+), 4 deletions(-) diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c index 3eda2ca462..5e6118dc6f 100644 --- a/aom_dsp/arm/highbd_intrapred_neon.c +++ b/aom_dsp/arm/highbd_intrapred_neon.c @@ -1604,6 +1604,8 @@ static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16( } #endif // AOM_ARCH_AARCH64 +// TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed. +#if AOM_ARCH_AARCH64 static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4( const uint16_t *left, const int16x4_t indices, int n) { assert(n > 0); @@ -2473,6 +2475,7 @@ void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw, assert(f != NULL); f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd); } +#endif // AOM_ARCH_AARCH64 // ----------------------------------------------------------------------------- // Z3 diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c index 3c12ca3c19..561a9f76a1 100644 --- a/aom_dsp/arm/intrapred_neon.c +++ b/aom_dsp/arm/intrapred_neon.c @@ -1488,6 +1488,8 @@ void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, /* ---------------------P R E D I C T I O N Z 2--------------------------- */ +// TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed. +#if AOM_ARCH_AARCH64 #if !AOM_ARCH_AARCH64 static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = { { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, @@ -2038,6 +2040,7 @@ void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, break; } } +#endif // AOM_ARCH_AARCH64 /* ---------------------P R E D I C T I O N Z 3--------------------------- */ diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index d08d2194d3..86f83a69e5 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -115,7 +115,12 @@ specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/; add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy"; specialize qw/av1_dr_prediction_z1 sse4_1 avx2 neon/; add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy"; -specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/; +# TODO(aomedia:349428506): enable NEON for armv7 after SIGBUS is fixed. 
+if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") { + specialize qw/av1_dr_prediction_z2 sse4_1 avx2/; +} else { + specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/; +} add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy"; specialize qw/av1_dr_prediction_z3 sse4_1 avx2 neon/; @@ -220,7 +225,12 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd"; specialize qw/av1_highbd_dr_prediction_z1 avx2 neon/; add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd"; - specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/; + # TODO(aomedia:349428506): enable NEON for armv7 after SIGBUS is fixed. + if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") { + specialize qw/av1_highbd_dr_prediction_z2 avx2/; + } else { + specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/; + } add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd"; specialize qw/av1_highbd_dr_prediction_z3 avx2 neon/; } diff --git a/build/cmake/rtcd.pl b/build/cmake/rtcd.pl index dc827c2deb..5d889cb748 100755 --- a/build/cmake/rtcd.pl +++ b/build/cmake/rtcd.pl @@ -58,13 +58,15 @@ open CONFIG_FILE, $opts{config} or my %config = (); while (<CONFIG_FILE>) { - next if !/^#define\s+(?:CONFIG_|HAVE_)/; + # TODO(aomedia:349428506,349436249,349450845,349455146): remove AOM_ARCH_ + # after armv7 SIGBUS issues are fixed. + next if !/^#define\s+(?:AOM_ARCH_|CONFIG_|HAVE_)/; chomp; my @line_components = split /\s/; scalar @line_components > 2 or die "Invalid input passed to rtcd.pl via $opts{config}."; # $line_components[0] = #define - # $line_components[1] = flag name (CONFIG_SOMETHING or HAVE_SOMETHING) + # $line_components[1] = flag name ({AOM_ARCH,CONFIG,HAVE}_SOMETHING) # $line_components[2] = flag value (0 or 1) $config{$line_components[1]} = "$line_components[2]" eq "1" ? "yes" : ""; } diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc index 0938a3db11..20cf600320 100644 --- a/test/dr_prediction_test.cc +++ b/test/dr_prediction_test.cc @@ -484,6 +484,7 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON +#if AOM_ARCH_AARCH64 INSTANTIATE_TEST_SUITE_P( NEON, LowbdDrPredTest, ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>, @@ -495,8 +496,21 @@ INSTANTIATE_TEST_SUITE_P( DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>, &z3_wrapper<av1_dr_prediction_z3_neon>, AOM_BITS_8, kZ3Start))); +#else +// TODO(aomedia:349428506): enable av1_highbd_dr_prediction_z2_neon for armv7 +// after SIGBUS is fixed. 
+INSTANTIATE_TEST_SUITE_P( + NEON, LowbdDrPredTest, + ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>, + &z1_wrapper<av1_dr_prediction_z1_neon>, + AOM_BITS_8, kZ1Start), + DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>, + &z3_wrapper<av1_dr_prediction_z3_neon>, + AOM_BITS_8, kZ3Start))); +#endif #if CONFIG_AV1_HIGHBITDEPTH +#if AOM_ARCH_AARCH64 INSTANTIATE_TEST_SUITE_P( NEON, HighbdDrPredTest, ::testing::Values(DrPredFunc<DrPred_Hbd>( @@ -535,6 +549,36 @@ INSTANTIATE_TEST_SUITE_P( &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, AOM_BITS_12, kZ3Start))); +#else // !AOM_ARCH_AARCH64 +// TODO(aomedia:349428506): enable av1_highbd_dr_prediction_z2_neon for armv7 +// after SIGBUS is fixed. +INSTANTIATE_TEST_SUITE_P( + NEON, HighbdDrPredTest, + ::testing::Values(DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>, + AOM_BITS_8, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>, + AOM_BITS_10, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>, + AOM_BITS_12, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, + AOM_BITS_8, kZ3Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, + AOM_BITS_10, kZ3Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, + AOM_BITS_12, kZ3Start))); +#endif // AOM_ARCH_AARCH64 #endif // CONFIG_AV1_HIGHBITDEPTH #endif // HAVE_NEON -- GitLab From 110978ffa5ef0520406f7f922dbea300488aa468 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 25 Jun 2024 14:05:41 -0700 Subject: [PATCH 256/391] disable av1_filter_intra_predictor_neon w/armv7 This function causes a SIGBUS. Bug: aomedia:349436249 Change-Id: I1578921ff352df2a3be97af1384444f7042da843 --- av1/common/arm/reconintra_neon.c | 4 ++++ av1/common/av1_rtcd_defs.pl | 7 ++++++- test/filterintra_test.cc | 5 +++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/av1/common/arm/reconintra_neon.c b/av1/common/arm/reconintra_neon.c index a9b52d7788..81eb224538 100644 --- a/av1/common/arm/reconintra_neon.c +++ b/av1/common/arm/reconintra_neon.c @@ -21,6 +21,9 @@ #define MAX_UPSAMPLE_SZ 16 +// TODO(aomedia:349436249): enable for armv7 after SIGBUS is fixed. +#if AOM_ARCH_AARCH64 + // These kernels are a transposed version of those defined in reconintra.c, // with the absolute value of the negatives taken in the top row. 
DECLARE_ALIGNED(16, const uint8_t, @@ -209,6 +212,7 @@ void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride, } while (r < height + 1); } } +#endif // AOM_ARCH_AARCH64 void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) { if (!strength) return; diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 86f83a69e5..e9f6cbc614 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -126,7 +126,12 @@ specialize qw/av1_dr_prediction_z3 sse4_1 avx2 neon/; # FILTER_INTRA predictor functions add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode"; -specialize qw/av1_filter_intra_predictor sse4_1 neon/; +# TODO(aomedia:349436249): enable NEON for armv7 after SIGBUS is fixed. +if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") { + specialize qw/av1_filter_intra_predictor sse4_1/; +} else { + specialize qw/av1_filter_intra_predictor sse4_1 neon/; +} # High bitdepth functions diff --git a/test/filterintra_test.cc b/test/filterintra_test.cc index 5d9b839c1d..1d6c7f7989 100644 --- a/test/filterintra_test.cc +++ b/test/filterintra_test.cc @@ -171,6 +171,8 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSE4_1 #if HAVE_NEON +// TODO(aomedia:349436249): enable for armv7 after SIGBUS is fixed. +#if AOM_ARCH_AARCH64 const PredFuncMode kPredFuncMdArrayNEON[] = { make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon, FILTER_DC_PRED), @@ -192,6 +194,9 @@ INSTANTIATE_TEST_SUITE_P( NEON, AV1FilterIntraPredTest, ::testing::Combine(::testing::ValuesIn(kPredFuncMdArrayNEON), ::testing::ValuesIn(kTxSizeNEON))); +#else // !AOM_ARCH_AARCH64 +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FilterIntraPredTest); +#endif // AOM_ARCH_AARCH64 #endif // HAVE_NEON } // namespace -- GitLab From def2ea6cf01a557b7dcbbc804f854d7b14813ae3 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 25 Jun 2024 15:20:26 -0700 Subject: [PATCH 257/391] disable av1_estimate_noise_from_single_plane_neon w/armv7 This function causes a SIGBUS. Bug: aomedia:349450845 Change-Id: Ib1f7b8c41dc711f82cf5f573ab45aadea7b5f2af --- av1/common/av1_rtcd_defs.pl | 7 ++++++- av1/encoder/arm/temporal_filter_neon.c | 3 +++ test/temporal_filter_test.cc | 3 +++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index e9f6cbc614..c19c483c11 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -386,7 +386,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_apply_temporal_filter sse2 avx2 neon neon_dotprod/; add_proto qw/double av1_estimate_noise_from_single_plane/, "const uint8_t *src, int height, int width, int stride, int edge_thresh"; - specialize qw/av1_estimate_noise_from_single_plane avx2 neon/; + # TODO(aomedia:349450845): enable NEON for armv7 after SIGBUS is fixed. 
+ if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") { + specialize qw/av1_estimate_noise_from_single_plane avx2/; + } else { + specialize qw/av1_estimate_noise_from_single_plane avx2 neon/; + } if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count"; specialize qw/av1_highbd_apply_temporal_filter sse2 avx2 neon/; diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c index 103324fbe5..dffa36898a 100644 --- a/av1/encoder/arm/temporal_filter_neon.c +++ b/av1/encoder/arm/temporal_filter_neon.c @@ -283,6 +283,8 @@ void av1_apply_temporal_filter_neon( } } +// TODO(aomedia:349450845): enable for armv7 after SIGBUS is fixed. +#if AOM_ARCH_AARCH64 double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, int width, int stride, int edge_thresh) { @@ -546,3 +548,4 @@ double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, ? -1.0 : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2; } +#endif // AOM_ARCH_AARCH64 diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc index 52e2366aa8..be11e2c63e 100644 --- a/test/temporal_filter_test.cc +++ b/test/temporal_filter_test.cc @@ -413,12 +413,15 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON +// TODO(aomedia:349450845): enable for armv7 after SIGBUS is fixed. +#if AOM_ARCH_AARCH64 INSTANTIATE_TEST_SUITE_P( NEON, EstimateNoiseTest, ::testing::Combine( ::testing::Values(av1_estimate_noise_from_single_plane_c), ::testing::Values(av1_estimate_noise_from_single_plane_neon), ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights))); +#endif // AOM_ARCH_AARCH64 #endif // HAVE_NEON #if CONFIG_AV1_HIGHBITDEPTH -- GitLab From 8f8e1e1bdd169997e2e808cad9cdc5bfa0e0b4fe Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 25 Jun 2024 15:41:01 -0700 Subject: [PATCH 258/391] disable av1_highbd_warp_affine_neon w/armv7 This function causes a SIGBUS. Bug: aomedia:349455146 Change-Id: If6aed35237c1f78043b7d3207337e6efecc0a847 --- av1/av1.cmake | 6 +++++- av1/common/av1_rtcd_defs.pl | 7 ++++++- test/warp_filter_test.cc | 5 +++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/av1/av1.cmake b/av1/av1.cmake index bed6ab9220..6713f14626 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -487,8 +487,12 @@ if(CONFIG_AV1_HIGHBITDEPTH) "${AOM_ROOT}/av1/common/arm/highbd_convolve_scale_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_reconinter_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_reconintra_neon.c" - "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c") + # TODO(aomedia:349455146): enable this for armv7 after SIGBUS is fixed. 
+ if(AOM_ARCH_AARCH64) + list(APPEND AOM_AV1_COMMON_INTRIN_NEON + "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c") + endif() list(APPEND AOM_AV1_COMMON_INTRIN_SVE2 "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_sve2.c" diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index c19c483c11..0dde8b941b 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -534,7 +534,12 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) { # WARPED_MOTION / GLOBAL_MOTION functions if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; - specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; + # TODO(aomedia:349450845): enable NEON for armv7 after SIGBUS is fixed. + if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") { + specialize qw/av1_highbd_warp_affine sse4_1 avx2 sve/; + } else { + specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; + } } add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc index bade6799b9..56e1022dd6 100644 --- a/test/warp_filter_test.cc +++ b/test/warp_filter_test.cc @@ -72,9 +72,14 @@ INSTANTIATE_TEST_SUITE_P( libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon)); #if CONFIG_AV1_HIGHBITDEPTH +#if AOM_ARCH_AARCH64 +// TODO(aomedia:349455146): enable for armv7 after SIGBUS is fixed. INSTANTIATE_TEST_SUITE_P( NEON, AV1HighbdWarpFilterTest, libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_neon)); +#else // !AOM_ARCH_AARCH64 +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdWarpFilterTest); +#endif // AOM_ARCH_AARCH64 #endif // CONFIG_AV1_HIGHBITDEPTH #endif // HAVE_NEON -- GitLab From 4470f1a9ade0e806f66fba52334eb6b8aabd3eef Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 26 Jun 2024 15:52:57 -0700 Subject: [PATCH 259/391] ethread_test: add AVxFirstPassEncoderThreadTestLarge and move speeds 0 and 2 from AVxFirstPassEncoderThreadTest to it. This test does multiple encodes at 720P, which can be quite slow under some sanitizers. Change-Id: Ifd374af65c2ed1c38e9470630e42dc691b3f30e5 --- test/ethread_test.cc | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/test/ethread_test.cc b/test/ethread_test.cc index fd5f3dd0ce..da3be3fb2d 100644 --- a/test/ethread_test.cc +++ b/test/ethread_test.cc @@ -102,6 +102,8 @@ class AVxFirstPassEncoderThreadTest firstpass_stats_.sz += pkt_size; } + void DoTest(); + bool encoder_initialized_; ::libaom_test::TestMode encoding_mode_; int set_cpu_used_; @@ -129,7 +131,7 @@ static void compare_fp_stats_md5(aom_fixed_buf_t *fp_stats) { << "MD5 checksums don't match"; } -TEST_P(AVxFirstPassEncoderThreadTest, FirstPassStatsTest) { +void AVxFirstPassEncoderThreadTest::DoTest() { ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); aom_fixed_buf_t firstpass_stats; size_t single_run_sz; @@ -201,6 +203,13 @@ TEST_P(AVxFirstPassEncoderThreadTest, FirstPassStatsTest) { // Comparison 4 (between threads=4 and threads=8). 
compare_fp_stats_md5(&firstpass_stats); } + +TEST_P(AVxFirstPassEncoderThreadTest, FirstPassStatsTest) { DoTest(); } + +using AVxFirstPassEncoderThreadTestLarge = AVxFirstPassEncoderThreadTest; + +TEST_P(AVxFirstPassEncoderThreadTestLarge, FirstPassStatsTest) { DoTest(); } + #endif // !CONFIG_REALTIME_ONLY class AVxEncoderThreadTest @@ -504,10 +513,14 @@ TEST_P(AVxEncoderThreadAllIntraTestLarge, EncoderResultTest) { // first pass stats test AV1_INSTANTIATE_TEST_SUITE(AVxFirstPassEncoderThreadTest, ::testing::Values(::libaom_test::kTwoPassGood), - ::testing::Range(0, 6, 2), ::testing::Range(0, 2), + ::testing::Values(4), ::testing::Range(0, 2), + ::testing::Range(1, 3)); + +AV1_INSTANTIATE_TEST_SUITE(AVxFirstPassEncoderThreadTestLarge, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Values(0, 2), ::testing::Range(0, 2), ::testing::Range(1, 3)); -// For AV1, test speed 0, 1, 2, 3, 5. // Only test cpu_used 2 here. AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTest, ::testing::Values(::libaom_test::kTwoPassGood), -- GitLab From e592429eef9acaaeca99978d69836a87fb6ebaf8 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 20 Jun 2024 12:46:23 -0700 Subject: [PATCH 260/391] avx2: use yy_loadu2_128 from synonyms_avx2.h This removes identical or close to identical implementations. It's unclear if the use of _mm_lddqu_si128() in masked_sad_intrin_avx2.c was intentional, but for now we'll normalize on _mm_loadu_si128(); any benefits should be minor. Bug: b:300649160 Change-Id: I535b4b74855579edb647f5469a815968e00e1b28 --- aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 35 +++++++++-------------- aom_dsp/x86/avg_intrin_avx2.c | 23 ++++++--------- aom_dsp/x86/masked_sad_intrin_avx2.c | 22 ++++++-------- 3 files changed, 30 insertions(+), 50 deletions(-) diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index 1f382d110b..0c4c537a50 100644 --- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -15,6 +15,7 @@ #include "aom_dsp/x86/convolve.h" #include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms_avx2.h" #include "aom_ports/mem.h" #if defined(__clang__) @@ -61,12 +62,6 @@ static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, _mm256_extractf128_si256(*a, 1)); } -static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { - __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); - a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); - return a; -} - static INLINE void xx_store2_mi128(const uint8_t *output_ptr, const ptrdiff_t stride, const __m256i *a) { _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); @@ -100,7 +95,7 @@ static void aom_filter_block1d4_h4_avx2( dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); @@ -188,7 +183,7 @@ static void aom_filter_block1d4_h8_avx2( dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); @@ -295,7 
+290,7 @@ static void aom_filter_block1d8_h4_avx2( dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); @@ -397,7 +392,7 @@ static void aom_filter_block1d8_h8_avx2( dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); @@ -521,7 +516,7 @@ static void aom_filter_block1d16_h4_avx2( dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); @@ -535,8 +530,7 @@ static void aom_filter_block1d16_h4_avx2( // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = - xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); // filter the source buffer srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); @@ -644,7 +638,7 @@ static void aom_filter_block1d16_h8_avx2( dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); @@ -670,8 +664,7 @@ static void aom_filter_block1d16_h8_avx2( // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = - xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); @@ -1068,7 +1061,7 @@ static void aom_filter_block1d16_v4_avx2( src_stride = src_pitch << 1; dst_stride = out_pitch << 1; - srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg23 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); srcReg4x = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); @@ -1172,11 +1165,9 @@ static void aom_filter_block1d16_v8_avx2( dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr); - srcReg32b3 = - xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); - srcReg32b5 = - xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b1 = yy_loadu2_128(src_ptr + src_pitch, src_ptr); + srcReg32b3 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = yy_loadu2_128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c index 6e943b84b3..ba6de96d24 
100644 --- a/aom_dsp/x86/avg_intrin_avx2.c +++ b/aom_dsp/x86/avg_intrin_avx2.c @@ -14,6 +14,7 @@ #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/bitdepth_conversion_avx2.h" +#include "aom_dsp/x86/synonyms_avx2.h" #include "aom_ports/mem.h" static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, @@ -542,28 +543,22 @@ int aom_satd_lp_avx2(const int16_t *coeff, int length) { } } -static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { - __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); - a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); - return a; -} - void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx, int *avg) { const uint8_t *s_y0 = s + y16_idx * p + x16_idx; const uint8_t *s_y1 = s_y0 + 8 * p; __m256i sum0, sum1, s0, s1, s2, s3, u0; u0 = _mm256_setzero_si256(); - s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1, s_y0), u0); - s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + p, s_y0 + p), u0); - s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 2 * p, s_y0 + 2 * p), u0); - s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 3 * p, s_y0 + 3 * p), u0); + s0 = _mm256_sad_epu8(yy_loadu2_128(s_y1, s_y0), u0); + s1 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + p, s_y0 + p), u0); + s2 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 2 * p, s_y0 + 2 * p), u0); + s3 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 3 * p, s_y0 + 3 * p), u0); sum0 = _mm256_add_epi16(s0, s1); sum1 = _mm256_add_epi16(s2, s3); - s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 4 * p, s_y0 + 4 * p), u0); - s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 5 * p, s_y0 + 5 * p), u0); - s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 6 * p, s_y0 + 6 * p), u0); - s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 7 * p, s_y0 + 7 * p), u0); + s0 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 4 * p, s_y0 + 4 * p), u0); + s1 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 5 * p, s_y0 + 5 * p), u0); + s2 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 6 * p, s_y0 + 6 * p), u0); + s3 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 7 * p, s_y0 + 7 * p), u0); sum0 = _mm256_add_epi16(sum0, _mm256_add_epi16(s0, s1)); sum1 = _mm256_add_epi16(sum1, _mm256_add_epi16(s2, s3)); sum0 = _mm256_add_epi16(sum0, sum1); diff --git a/aom_dsp/x86/masked_sad_intrin_avx2.c b/aom_dsp/x86/masked_sad_intrin_avx2.c index a8097bf4a1..d157d7d625 100644 --- a/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -17,6 +17,7 @@ #include "aom_dsp/blend.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" #include "aom_dsp/x86/masked_sad_intrin_ssse3.h" static INLINE unsigned int masked_sad32xh_avx2( @@ -67,13 +68,6 @@ static INLINE unsigned int masked_sad32xh_avx2( return sad; } -static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { - __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo)); - __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi)); - __m256i a = _mm256_castsi128_si256(a0); - return _mm256_inserti128_si256(a, a1, 1); -} - static INLINE unsigned int masked_sad16xh_avx2( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, @@ -84,10 +78,10 @@ static INLINE unsigned int masked_sad16xh_avx2( const __m256i round_scale = _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y += 2) { - const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); - const __m256i a = xx_loadu2_m128i(a_ptr + 
a_stride, a_ptr); - const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); - const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr); + const __m256i src = yy_loadu2_128(src_ptr + src_stride, src_ptr); + const __m256i a = yy_loadu2_128(a_ptr + a_stride, a_ptr); + const __m256i b = yy_loadu2_128(b_ptr + b_stride, b_ptr); + const __m256i m = yy_loadu2_128(m_ptr + m_stride, m_ptr); const __m256i m_inv = _mm256_sub_epi8(mask_max, m); // Calculate 16 predicted pixels. @@ -217,9 +211,9 @@ static INLINE unsigned int highbd_masked_sad8xh_avx2( const __m256i one = _mm256_set1_epi16(1); for (y = 0; y < height; y += 2) { - const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); - const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); - const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + const __m256i src = yy_loadu2_128(src_ptr + src_stride, src_ptr); + const __m256i a = yy_loadu2_128(a_ptr + a_stride, a_ptr); + const __m256i b = yy_loadu2_128(b_ptr + b_stride, b_ptr); // Zero-extend mask to 16 bits const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)(m_ptr)), -- GitLab From 38736a8a99432d9f8dbc2580b857d4f43ecc1774 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 20 Jun 2024 16:55:14 -0700 Subject: [PATCH 261/391] synonyms_avx2.h: add yy_loadu_4x64() Use this instead of _mm256_set_epi64x() to load unaligned 64-bit values. This quiets undefined sanitizer warnings of the form: aom_dsp/x86/blend_a64_mask_avx2.c:913:40: runtime error: load of misaligned address 0x000009e7542c for type 'int64_t' (aka 'long'), which requires 8 byte alignment Bug: b:300649160 Change-Id: I9be724c3461665f1800c599768a7609c0e57af69 --- aom_dsp/x86/blend_a64_mask_avx2.c | 14 ++++++-------- aom_dsp/x86/synonyms_avx2.h | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c index 638c378b42..2b7fe838d6 100644 --- a/aom_dsp/x86/blend_a64_mask_avx2.c +++ b/aom_dsp/x86/blend_a64_mask_avx2.c @@ -910,14 +910,12 @@ static INLINE void highbd_blend_a64_d16_mask_w4_avx2( const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { // Load 4x u16 pixels from each of 4 rows from each source - const __m256i s0 = _mm256_set_epi64x(*(int64_t *)(src0 + 3 * src0_stride), - *(int64_t *)(src0 + 2 * src0_stride), - *(int64_t *)(src0 + 1 * src0_stride), - *(int64_t *)(src0 + 0 * src0_stride)); - const __m256i s1 = _mm256_set_epi64x(*(int64_t *)(src1 + 3 * src1_stride), - *(int64_t *)(src1 + 2 * src1_stride), - *(int64_t *)(src1 + 1 * src1_stride), - *(int64_t *)(src1 + 0 * src1_stride)); + const __m256i s0 = + yy_loadu_4x64(src0 + 3 * src0_stride, src0 + 2 * src0_stride, + src0 + 1 * src0_stride, src0 + 0 * src0_stride); + const __m256i s1 = + yy_loadu_4x64(src1 + 3 * src1_stride, src1 + 2 * src1_stride, + src1 + 1 * src1_stride, src1 + 0 * src1_stride); // Generate the inverse mask const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0); diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h index 2a130ef7f6..53f5028acc 100644 --- a/aom_dsp/x86/synonyms_avx2.h +++ b/aom_dsp/x86/synonyms_avx2.h @@ -60,6 +60,22 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } +// This behaves similarly to _mm256_set_epi64x(), but avoids undefined +// sanitizer warnings when loading values from unaligned buffers using 
+// `*(int64_t *)val`. +static INLINE __m256i yy_loadu_4x64(const void *e3, const void *e2, + const void *e1, const void *e0) { + __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0)); + __m128d v01 = _mm_loadh_pd(v0, (const double *)e1); + __m128d v2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e2)); + __m128d v23 = _mm_loadh_pd(v2, (const double *)e3); + // Note this can be replaced with + // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains + // _mm256_set_m128d() with all supported compilers. This version is used to + // match the behavior with yy_set_m128i(). + return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01)); +} + static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); -- GitLab From ac00060bfb5c3fdaa4581db0a204a559966197d2 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 25 Jun 2024 18:32:48 -0700 Subject: [PATCH 262/391] Remove static from vars in parse_stream_params() Those variables in parse_stream_params() don't need to be function-scope static variables. Change-Id: I0bd9dc3ef09fa6ce78d75096acd6e52e936261b4 --- apps/aomenc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/aomenc.c b/apps/aomenc.c index 0bfce0ae1a..ff60ef0112 100644 --- a/apps/aomenc.c +++ b/apps/aomenc.c @@ -942,9 +942,9 @@ static int parse_stream_params(struct AvxEncoderConfig *global, struct stream_state *stream, char **argv) { char **argi, **argj; struct arg arg; - static const arg_def_t *const *ctrl_args = no_args; - static const arg_def_t *const *key_val_args = no_args; - static const int *ctrl_args_map = NULL; + const arg_def_t *const *ctrl_args = no_args; + const arg_def_t *const *key_val_args = no_args; + const int *ctrl_args_map = NULL; struct stream_config *config = &stream->config; int eos_mark_found = 0; int webm_forced = 0; -- GitLab From 64472491b6bf3c4a44b1d5f8a7714ad967d82d8a Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 27 Jun 2024 13:47:53 -0700 Subject: [PATCH 263/391] Remove MSVC version check in x86 workaround The MSVC version check is not future-proof unless the bug has been fixed. Since we have not even submitted a bug report, this bug is unlikely to be fixed soon. So remove the MSVC version check for the workaround. Update the comment to note that this bug can still be reproduced with Visual Studio 2022 version 17.10.3. Bug: 349646650 Change-Id: If2935164d20a52b0c7daf7f88ea8ae588e2f6efa --- av1/common/cdef_block_simd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h index 58cfd3183e..5b04909a35 100644 --- a/av1/common/cdef_block_simd.h +++ b/av1/common/cdef_block_simd.h @@ -198,8 +198,8 @@ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, } // Work around compiler out of memory issues with Win32 builds. This issue has -// been observed with Visual Studio 2017, 2019, and 2022 (version 17.4). -#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1940 +// been observed with Visual Studio 2017, 2019, and 2022 (version 17.10.3). 
+#if defined(_MSC_VER) && defined(_M_IX86) #define CDEF_INLINE static INLINE #else #define CDEF_INLINE SIMD_INLINE -- GitLab From 34d29d40dd03eaaec297eacf5d2da2a8c864f35e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 27 Jun 2024 10:28:24 -0700 Subject: [PATCH 264/391] Handle w=2 case in aom_highbd_convolve_copy_neon() Fix the incorrect assumption that if w < 8, then w == 4. w may be equal to 2. Tested: cmake ../aom -G Ninja -DFORCE_HIGHBITDEPTH_DECODING=1 ninja ./test_libaom --gtest_filter=*TestVectorTest* cmake ../aom -G Ninja -DSANITIZE=address ninja ./test_libaom --gtest_filter=*ConvolveCopy* Bug: 349832592 Change-Id: I2600d00b097a94a079c4827cdc894d02cf03c42e --- aom_dsp/arm/aom_convolve_copy_neon.c | 13 +++++++++- test/av1_convolve_test.cc | 39 +++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/aom_dsp/arm/aom_convolve_copy_neon.c b/aom_dsp/arm/aom_convolve_copy_neon.c index b90b1bd0e1..447ae37e56 100644 --- a/aom_dsp/arm/aom_convolve_copy_neon.c +++ b/aom_dsp/arm/aom_convolve_copy_neon.c @@ -57,7 +57,18 @@ void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h) { - if (w < 8) { // copy4 + if (w < 4) { // copy2 + do { + memcpy(dst, src, 4); + src += src_stride; + dst += dst_stride; + + memcpy(dst, src, 4); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 4) { // copy4 uint16x4_t s0, s1; do { s0 = vld1_u16(src); diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index aac8006e50..79bb942d0b 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -9,6 +9,10 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <cstddef> +#include <cstdint> +#include <memory> +#include <new> #include <ostream> #include <set> #include <vector> @@ -218,12 +222,12 @@ class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> { // Check that two 8-bit output buffers are identical. void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width, - int height) { + int height, ptrdiff_t stride = kOutputStride) { ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations"; for (int j = 0; j < height; ++j) { if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { - p1 += kOutputStride; - p2 += kOutputStride; + p1 += stride; + p2 += stride; continue; } for (int i = 0; i < width; ++i) { @@ -236,12 +240,12 @@ class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> { // Check that two 16-bit output buffers are identical. void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width, - int height) { + int height, ptrdiff_t stride = kOutputStride) { ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations"; for (int j = 0; j < height; ++j) { if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { - p1 += kOutputStride; - p2 += kOutputStride; + p1 += stride; + p2 += stride; continue; } for (int i = 0; i < width; ++i) { @@ -1122,6 +1126,17 @@ class AV1ConvolveCopyTest : public AV1ConvolveTest<convolve_copy_func> { DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]); GetParam().TestFunction()(input, width, test, kOutputStride, width, height); AssertOutputBufferEq(reference, test, width, height); + + // Test again with dst_stride=width. 
+ std::unique_ptr<uint8_t[]> reference2(new (std::nothrow) + uint8_t[width * height]); + ASSERT_NE(reference2, nullptr); + aom_convolve_copy_c(input, width, reference2.get(), width, width, height); + std::unique_ptr<uint8_t[]> test2(new (std::nothrow) + uint8_t[width * height]); + ASSERT_NE(test2, nullptr); + GetParam().TestFunction()(input, width, test2.get(), width, width, height); + AssertOutputBufferEq(reference2.get(), test2.get(), width, height, width); } }; @@ -1169,6 +1184,18 @@ class AV1ConvolveCopyHighbdTest DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]); GetParam().TestFunction()(input, width, test, kOutputStride, width, height); AssertOutputBufferEq(reference, test, width, height); + + // Test again with dst_stride=width. + std::unique_ptr<uint16_t[]> reference2(new (std::nothrow) + uint16_t[width * height]); + ASSERT_NE(reference2, nullptr); + aom_highbd_convolve_copy_c(input, width, reference2.get(), width, width, + height); + std::unique_ptr<uint16_t[]> test2(new (std::nothrow) + uint16_t[width * height]); + ASSERT_NE(test2, nullptr); + GetParam().TestFunction()(input, width, test2.get(), width, width, height); + AssertOutputBufferEq(reference2.get(), test2.get(), width, height, width); } }; -- GitLab From e96d878eb369515211598051b5bb2eb553f78eca Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Thu, 27 Jun 2024 15:09:59 -0700 Subject: [PATCH 265/391] rtc: Set mv search_method to fast_diamond under high motion For speed >= 11, under high motion screen content flag, set the mv_search_method to fast_diamond for all frames, otherwise for temporal layers the TL0 frame was being set to diamond (which is slower); the TL1/TL2 frames were already being set to fast_diamond. This only affects temporal layers. Measured ~5% speedup with small psnr loss (< 0.05dB), for high motion screen content clip at speed 11 with 2TL. Change-Id: I070470b276e489a89dd814fda0d050fde2963c2c --- av1/encoder/speed_features.c | 1 + 1 file changed, 1 insertion(+) diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index 4b6ea16d7f..e0709b15ad 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -1616,6 +1616,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->rt_sf.use_fast_fixed_part = 1; sf->rt_sf.increase_source_sad_thresh = 1; sf->rt_sf.selective_cdf_update = 1; + sf->mv_sf.search_method = FAST_DIAMOND; } else if (cpi->rc.max_block_source_sad > 20000 && cpi->rc.frame_source_sad > 100 && speed >= 6 && (cpi->rc.percent_blocks_with_motion > 1 || -- GitLab From dd1e6c7806d041948fdc355ee6d78e33c829b150 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Fri, 28 Jun 2024 13:36:26 -0700 Subject: [PATCH 266/391] Tweak fix for aom_highbd_convolve_copy_neon() bug Use memmove() instead of memcpy(), as in aom_highbd_convolve_copy_c() and aom_highbd_convolve_copy_avx2(). I don't know if the src and dst buffers may actually overlap, but it's good to be consistent. Revert the new tests that use unaligned dst buffers, because aom_highbd_convolve_copy_sse2() calls _mm_store_si128(), which requires 16-byte alignment.
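For illustration only, a minimal standalone sketch (hypothetical buffer, not part of this change) of why memmove() is the safer primitive when the two ranges may overlap:

  #include <stdint.h>
  #include <string.h>

  int main(void) {
    uint16_t buf[4] = { 1, 2, 3, 4 };
    // Overlapping copy: memmove() is well defined and leaves
    // buf = { 1, 1, 2, 4 }.
    memmove(buf + 1, buf, 2 * sizeof(*buf));
    // memcpy(buf + 1, buf, 2 * sizeof(*buf)) would be undefined
    // behavior here because source and destination overlap.
    return 0;
  }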
Bug: 349832592 Change-Id: Ieddb00d39385fb690a4997ad0f9dbd2bfac946d7 --- aom_dsp/arm/aom_convolve_copy_neon.c | 4 +-- test/av1_convolve_test.cc | 37 +++++----------------------- 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/aom_dsp/arm/aom_convolve_copy_neon.c b/aom_dsp/arm/aom_convolve_copy_neon.c index 447ae37e56..a60f37f9e2 100644 --- a/aom_dsp/arm/aom_convolve_copy_neon.c +++ b/aom_dsp/arm/aom_convolve_copy_neon.c @@ -59,11 +59,11 @@ void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, int h) { if (w < 4) { // copy2 do { - memcpy(dst, src, 4); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 4); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 79bb942d0b..8a5c166134 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -11,8 +11,6 @@ #include <cstddef> #include <cstdint> -#include <memory> -#include <new> #include <ostream> #include <set> #include <vector> @@ -222,12 +220,12 @@ class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> { // Check that two 8-bit output buffers are identical. void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width, - int height, ptrdiff_t stride = kOutputStride) { + int height) { ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations"; for (int j = 0; j < height; ++j) { if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { - p1 += stride; - p2 += stride; + p1 += kOutputStride; + p2 += kOutputStride; continue; } for (int i = 0; i < width; ++i) { @@ -240,12 +238,12 @@ class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> { // Check that two 16-bit output buffers are identical. void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width, - int height, ptrdiff_t stride = kOutputStride) { + int height) { ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations"; for (int j = 0; j < height; ++j) { if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { - p1 += stride; - p2 += stride; + p1 += kOutputStride; + p2 += kOutputStride; continue; } for (int i = 0; i < width; ++i) { @@ -1126,17 +1124,6 @@ class AV1ConvolveCopyTest : public AV1ConvolveTest<convolve_copy_func> { DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]); GetParam().TestFunction()(input, width, test, kOutputStride, width, height); AssertOutputBufferEq(reference, test, width, height); - - // Test again with dst_stride=width. - std::unique_ptr<uint8_t[]> reference2(new (std::nothrow) - uint8_t[width * height]); - ASSERT_NE(reference2, nullptr); - aom_convolve_copy_c(input, width, reference2.get(), width, width, height); - std::unique_ptr<uint8_t[]> test2(new (std::nothrow) - uint8_t[width * height]); - ASSERT_NE(test2, nullptr); - GetParam().TestFunction()(input, width, test2.get(), width, width, height); - AssertOutputBufferEq(reference2.get(), test2.get(), width, height, width); } }; @@ -1184,18 +1171,6 @@ class AV1ConvolveCopyHighbdTest DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]); GetParam().TestFunction()(input, width, test, kOutputStride, width, height); AssertOutputBufferEq(reference, test, width, height); - - // Test again with dst_stride=width. 
- std::unique_ptr<uint16_t[]> reference2(new (std::nothrow) - uint16_t[width * height]); - ASSERT_NE(reference2, nullptr); - aom_highbd_convolve_copy_c(input, width, reference2.get(), width, width, - height); - std::unique_ptr<uint16_t[]> test2(new (std::nothrow) - uint16_t[width * height]); - ASSERT_NE(test2, nullptr); - GetParam().TestFunction()(input, width, test2.get(), width, width, height); - AssertOutputBufferEq(reference2.get(), test2.get(), width, height, width); } }; -- GitLab From 2c308fd916c54ce21e8a8b9f46c17393f273fe44 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Fri, 28 Jun 2024 16:26:32 -0700 Subject: [PATCH 267/391] Fix alignment assertions Change-Id: I5bb81929abcb8107b0f2934fe33e6c2dd7ea2318 --- aom_dsp/x86/aom_convolve_copy_avx2.c | 10 +++++++--- aom_dsp/x86/aom_convolve_copy_sse2.c | 8 ++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/aom_dsp/x86/aom_convolve_copy_avx2.c b/aom_dsp/x86/aom_convolve_copy_avx2.c index bdbb4c16e9..5b90b104a8 100644 --- a/aom_dsp/x86/aom_convolve_copy_avx2.c +++ b/aom_dsp/x86/aom_convolve_copy_avx2.c @@ -27,7 +27,9 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { - if (w >= 16) { + // The w == 16 case uses _mm_store_si128(), which requires its output address + // be aligned on a 16-byte boundary. + if (w == 16) { assert(!((intptr_t)dst % 16)); assert(!(dst_stride % 16)); } @@ -159,9 +161,11 @@ static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) { void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h) { - if (w >= 16) { + // The w == 8 case uses _mm_store_si128(), which requires its output address + // be aligned on a 16-byte boundary. + if (w == 8) { assert(!((intptr_t)dst % 16)); - assert(!(dst_stride % 16)); + assert(!(dst_stride % 8)); } if (w == 2) { diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.c b/aom_dsp/x86/aom_convolve_copy_sse2.c index 887adde962..674a37fa49 100644 --- a/aom_dsp/x86/aom_convolve_copy_sse2.c +++ b/aom_dsp/x86/aom_convolve_copy_sse2.c @@ -35,6 +35,8 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { + // The w >= 16 cases use _mm_store_si128(), which requires its output address + // be aligned on a 16-byte boundary. if (w >= 16) { assert(!((intptr_t)dst % 16)); assert(!(dst_stride % 16)); @@ -200,9 +202,11 @@ static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) { void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h) { - if (w >= 16) { + // The w >= 8 cases use _mm_store_si128(), which requires its output address + // be aligned on a 16-byte boundary. + if (w >= 8) { assert(!((intptr_t)dst % 16)); - assert(!(dst_stride % 16)); + assert(!(dst_stride % 8)); } if (w == 2) { -- GitLab From fd4b24a1a854f845f409dd165749e2bb285f2c75 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 1 Jul 2024 14:06:53 -0700 Subject: [PATCH 268/391] Avoid an int cast of double in vbr_rate_correction In the calculation of frame_window, avoid the int cast of cpi->ppi->twopass.stats_buf_ctx->total_stats->count (a double) because in theory it could be greater than INT_MAX. 
Use the int cast on the result of AOMMIN(16, ...), which is <= 16 This is equivalent to the first change in the libvpx CL https://chromium-review.googlesource.com/c/webm/libvpx/+/5670633. Change-Id: Iced2e32aceb9e3be1505a9ef89494bc499afedaf --- av1/encoder/ratectrl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index cd73772b46..744f5f5cdc 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -2598,12 +2598,12 @@ static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { #endif int64_t frame_target = *this_frame_target; - const int stats_count = + const double stats_count = cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL - ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count - : 0; - const int frame_window = AOMMIN( - 16, (int)(stats_count - (int)cpi->common.current_frame.frame_number)); + ? cpi->ppi->twopass.stats_buf_ctx->total_stats->count + : 0.0; + const int frame_window = + (int)AOMMIN(16, stats_count - cpi->common.current_frame.frame_number); assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100); if (frame_window > 0) { const int64_t max_delta = -- GitLab From 75eeb52984907e6585a13c579c363e211922687b Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 2 Jul 2024 10:25:20 -0700 Subject: [PATCH 269/391] Adjust psnr threshold on sharpness test Small adjustment, needed to make the nightly tests pass. The tests only fail in the -DCONFIG_FPMT_TEST=1 configuration. Bug: 349890675 Change-Id: I14bed59496a7dd6d1731fdf0646e630d2e3ac272 --- test/sharpness_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sharpness_test.cc b/test/sharpness_test.cc index 5002a4469d..afd64450ea 100644 --- a/test/sharpness_test.cc +++ b/test/sharpness_test.cc @@ -30,7 +30,7 @@ const std::unordered_map< kPsnrThreshold = { { static_cast<int>(::libaom_test::kTwoPassGood), { { 2, { { 2, 37.6 }, { 5, 37.6 } } }, { 4, { { 2, 37.5 }, { 5, 37.5 } } }, - { 6, { { 2, 37.4 }, { 5, 37.4 } } } } }, + { 6, { { 2, 37.3 }, { 5, 37.3 } } } } }, { static_cast<int>(::libaom_test::kAllIntra), { { 3, { { 2, 42.2 }, { 5, 42.2 } } }, { 6, { { 2, 41.8 }, { 4, 41.9 }, { 5, 41.9 } } }, -- GitLab From 49e5deed0c0634df5b27feca64efc0fe5e01c22f Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 2 Jul 2024 16:02:26 -0700 Subject: [PATCH 270/391] List standard C headers in alphabetical order Change-Id: I8a3f329ddafd61c0fbde88143e00931c6f1399e2 --- stats/rate_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stats/rate_hist.c b/stats/rate_hist.c index 32872465fe..ff320bf905 100644 --- a/stats/rate_hist.c +++ b/stats/rate_hist.c @@ -12,8 +12,8 @@ #include "stats/rate_hist.h" #include <assert.h> -#include <math.h> #include <limits.h> +#include <math.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> -- GitLab From 6fb5bfdf30282eebf47db5ca6f0dfecedcd0853c Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Fri, 28 Jun 2024 09:56:36 -0700 Subject: [PATCH 271/391] rtc: Add get control for high_motion_content_screen Change-Id: Ibf160f7774e748f8b9cfc292ccb1c2d255989054 --- aom/aomcx.h | 10 ++++++++++ av1/av1_cx_iface.c | 11 +++++++++++ examples/svc_encoder_rtc.cc | 7 +++++++ 3 files changed, 28 insertions(+) diff --git a/aom/aomcx.h b/aom/aomcx.h index 835eab84de..594d463a34 100644 --- a/aom/aomcx.h +++ b/aom/aomcx.h @@ -1547,6 +1547,13 @@ enum aome_enc_control_id { */ AV1E_SET_AUTO_TILES = 166, + /*!\brief Codec control to 
get the high motion screen content flag. + * int * parameter. + * Returns an integer. + * 1 means high motion screen content, 0 means not. + */ + AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC = 167, + // Any new encoder control IDs should be added above. // Maximum allowed encoder control ID is 229. // No encoder control ID should be added below. @@ -2207,6 +2214,9 @@ AOM_CTRL_USE_TYPE(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, int) AOM_CTRL_USE_TYPE(AV1E_SET_AUTO_TILES, unsigned int) #define AOM_CTRL_AV1E_SET_AUTO_TILES +AOM_CTRL_USE_TYPE(AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC, int *) +#define AOM_CTRL_AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC + /*!\endcond */ /*! @} - end defgroup aom_encoder */ #ifdef __cplusplus diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index a03fed9fc2..403cb83bd7 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -4435,6 +4435,15 @@ static aom_codec_err_t ctrl_get_luma_cdef_strength(aom_codec_alg_priv_t *ctx, return AOM_CODEC_OK; } +static aom_codec_err_t ctrl_get_high_motion_content_screen_rtc( + aom_codec_alg_priv_t *ctx, va_list args) { + int *arg = va_arg(args, int *); + AV1_COMP *const cpi = ctx->ppi->cpi; + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = cpi->rc.high_motion_screen_content; + return AOM_CODEC_OK; +} + static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1_COPY_REFERENCE, ctrl_copy_reference }, { AOME_USE_REFERENCE, ctrl_use_reference }, @@ -4598,6 +4607,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_GET_TARGET_SEQ_LEVEL_IDX, ctrl_get_target_seq_level_idx }, { AV1E_GET_NUM_OPERATING_POINTS, ctrl_get_num_operating_points }, { AV1E_GET_LUMA_CDEF_STRENGTH, ctrl_get_luma_cdef_strength }, + { AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC, + ctrl_get_high_motion_content_screen_rtc }, CTRL_MAP_END, }; diff --git a/examples/svc_encoder_rtc.cc b/examples/svc_encoder_rtc.cc index c9ff20cdf3..0e9eba9c53 100644 --- a/examples/svc_encoder_rtc.cc +++ b/examples/svc_encoder_rtc.cc @@ -1915,6 +1915,13 @@ int main(int argc, const char **argv) { cx_time_layer[layer] += aom_usec_timer_elapsed(&timer); frame_cnt_layer[layer] += 1; + // Get the high motion content flag. + int content_flag = 0; + if (aom_codec_control(&codec, AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC, + &content_flag)) { + die_codec(&codec, "Failed to GET_HIGH_MOTION_CONTENT_SCREEN_RTC"); + } + got_data = 0; // For simulcast (mode 11): write out each spatial layer to the file. int ss_layers_write = (app_input.layering_mode == 11) -- GitLab From 9a2f746aef90b2899f09a87b3dc12adcecda8f86 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Mon, 1 Jul 2024 14:53:13 -0700 Subject: [PATCH 272/391] rtc: Allow QP to react faster for larger overshoot Increase the max_delta_up when overshoot is detected. This reduces overshoot in static clips with scene change fading in/out. Neutral/negligible stats change on rtc_derf/rtc/rtc_screen. Change-Id: I86bf7c67e6438e60bd828775c60248abd571579f --- av1/encoder/ratectrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 744f5f5cdc..9793a3f847 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -551,7 +551,7 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality, p_rc->buffer_level < (p_rc->optimal_buffer_level >> 1) && rc->frames_since_key > 4; int max_delta_down; - int max_delta_up = overshoot_buffer_low ? 60 : 20; + int max_delta_up = overshoot_buffer_low ? 
120 : 20; const int change_avg_frame_bandwidth = abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) > 0.1 * (rc->avg_frame_bandwidth); @@ -571,7 +571,7 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality, // Link max_delta_up to max_delta_down and buffer status. if (p_rc->buffer_level > p_rc->optimal_buffer_level) { max_delta_up = AOMMAX(4, max_delta_down); - } else { + } else if (!overshoot_buffer_low) { max_delta_up = AOMMAX(8, max_delta_down); } } -- GitLab From 0cd6d3ae65403c32a614a4707fec00e90784b3c8 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 9 Jul 2024 16:47:02 -0700 Subject: [PATCH 273/391] Do not include config/aom_version.h Replace the VERSION_STRING_NOSP macro by the public API function aom_codec_version_str(). Treat config/aom_version.h as an absolutely internal header of the libaom library. Change-Id: Ic26635c11a84c88debac69ca564c9a7cd4294f53 --- test/decode_perf_test.cc | 7 +++---- test/encode_perf_test.cc | 8 +++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index ab80dbc264..14b0f9e0c0 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -12,8 +12,7 @@ #include <string> #include <tuple> -#include "config/aom_version.h" - +#include "aom/aom_codec.h" #include "aom_ports/aom_timer.h" #include "common/ivfenc.h" #include "test/codec_factory.h" @@ -82,7 +81,7 @@ TEST_P(DecodePerfTest, PerfTest) { printf("{\n"); printf("\t\"type\" : \"decode_perf_test\",\n"); - printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"version\" : \"%s\",\n", aom_codec_version_str()); printf("\t\"videoName\" : \"%s\",\n", video_name); printf("\t\"threadCount\" : %u,\n", threads); printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); @@ -232,7 +231,7 @@ TEST_P(AV1NewEncodeDecodePerfTest, PerfTest) { printf("{\n"); printf("\t\"type\" : \"decode_perf_test\",\n"); - printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"version\" : \"%s\",\n", aom_codec_version_str()); printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile); printf("\t\"threadCount\" : %u,\n", threads); printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 9ee7ab05d5..ba78a98c74 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -12,15 +12,13 @@ #include <string> #include "third_party/googletest/src/googletest/include/gtest/gtest.h" -#include "config/aom_config.h" -#include "config/aom_version.h" - +#include "aom/aom_codec.h" +#include "aom_ports/aom_timer.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" #include "test/y4m_video_source.h" -#include "aom_ports/aom_timer.h" namespace { @@ -164,7 +162,7 @@ TEST_P(AV1EncodePerfTest, PerfTest) { printf("{\n"); printf("\t\"type\" : \"encode_perf_test\",\n"); - printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"version\" : \"%s\",\n", aom_codec_version_str()); printf("\t\"videoName\" : \"%s\",\n", display_name.c_str()); printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs); printf("\t\"totalFrames\" : %u,\n", frames); -- GitLab From 538cf8baf062a30e3554ebda3b18c6fa79af96f3 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 9 Jul 2024 17:02:41 -0700 Subject: [PATCH 274/391] av1_rtcd_defs.pl: fix bug reference in TODO Bug: aomedia:349455146 Change-Id: Icba1e4716298e84da8162d2f550f27b15c4bb294 
--- av1/common/av1_rtcd_defs.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 0dde8b941b..551543038b 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -534,7 +534,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) { # WARPED_MOTION / GLOBAL_MOTION functions if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; - # TODO(aomedia:349450845): enable NEON for armv7 after SIGBUS is fixed. + # TODO(aomedia:349455146): enable NEON for armv7 after SIGBUS is fixed. if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") { specialize qw/av1_highbd_warp_affine sse4_1 avx2 sve/; } else { -- GitLab From 174ce75a92730182bfaa8ecf23f7969ad123e3ab Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 5 Jul 2024 23:28:42 +0100 Subject: [PATCH 275/391] Fix SIGBUS in av1_estimate_noise_from_single_plane_neon Use the dedicated helper for unaligned loads and re-enable the function for 32-bit Arm. Bug: aomedia:349450845 Change-Id: If23b136629007a7f4a2065d713d8c5348a389d35 --- av1/common/av1_rtcd_defs.pl | 7 +------ av1/encoder/arm/temporal_filter_neon.c | 21 +++++++++------------ test/temporal_filter_test.cc | 3 --- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 551543038b..284f0efe9f 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -386,12 +386,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_apply_temporal_filter sse2 avx2 neon neon_dotprod/; add_proto qw/double av1_estimate_noise_from_single_plane/, "const uint8_t *src, int height, int width, int stride, int edge_thresh"; - # TODO(aomedia:349450845): enable NEON for armv7 after SIGBUS is fixed. - if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") { - specialize qw/av1_estimate_noise_from_single_plane avx2/; - } else { - specialize qw/av1_estimate_noise_from_single_plane avx2 neon/; - } + specialize qw/av1_estimate_noise_from_single_plane avx2 neon/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count"; specialize qw/av1_highbd_apply_temporal_filter sse2 avx2 neon/; diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c index dffa36898a..08746b5a9b 100644 --- a/av1/encoder/arm/temporal_filter_neon.c +++ b/av1/encoder/arm/temporal_filter_neon.c @@ -283,8 +283,6 @@ void av1_apply_temporal_filter_neon( } } -// TODO(aomedia:349450845): enable for armv7 after SIGBUS is fixed. 
-#if AOM_ARCH_AARCH64 double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, int width, int stride, int edge_thresh) { @@ -458,15 +456,15 @@ double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, if (w <= (width - 1) - 4) { uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0)); uint8x8_t mat[3][3]; - mat[0][0] = load_u8_4x1(src_ptr - stride - 1); - mat[0][1] = load_u8_4x1(src_ptr - stride); - mat[0][2] = load_u8_4x1(src_ptr - stride + 1); - mat[1][0] = load_u8_4x1(src_ptr - 1); - mat[1][1] = load_u8_4x1(src_ptr); - mat[1][2] = load_u8_4x1(src_ptr + 1); - mat[2][0] = load_u8_4x1(src_ptr + stride - 1); - mat[2][1] = load_u8_4x1(src_ptr + stride); - mat[2][2] = load_u8_4x1(src_ptr + stride + 1); + mat[0][0] = load_unaligned_u8_4x1(src_ptr - stride - 1); + mat[0][1] = load_unaligned_u8_4x1(src_ptr - stride); + mat[0][2] = load_unaligned_u8_4x1(src_ptr - stride + 1); + mat[1][0] = load_unaligned_u8_4x1(src_ptr - 1); + mat[1][1] = load_unaligned_u8_4x1(src_ptr); + mat[1][2] = load_unaligned_u8_4x1(src_ptr + 1); + mat[2][0] = load_unaligned_u8_4x1(src_ptr + stride - 1); + mat[2][1] = load_unaligned_u8_4x1(src_ptr + stride); + mat[2][2] = load_unaligned_u8_4x1(src_ptr + stride + 1); // Compute Sobel gradients. uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]); @@ -548,4 +546,3 @@ double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, ? -1.0 : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2; } -#endif // AOM_ARCH_AARCH64 diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc index be11e2c63e..52e2366aa8 100644 --- a/test/temporal_filter_test.cc +++ b/test/temporal_filter_test.cc @@ -413,15 +413,12 @@ INSTANTIATE_TEST_SUITE_P( #endif  // HAVE_AVX2 #if HAVE_NEON -// TODO(aomedia:349450845): enable for armv7 after SIGBUS is fixed. -#if AOM_ARCH_AARCH64 INSTANTIATE_TEST_SUITE_P( NEON, EstimateNoiseTest, ::testing::Combine( ::testing::Values(av1_estimate_noise_from_single_plane_c), ::testing::Values(av1_estimate_noise_from_single_plane_neon), ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights))); -#endif  // AOM_ARCH_AARCH64 #endif  // HAVE_NEON #if CONFIG_AV1_HIGHBITDEPTH -- GitLab From 6642f59795e2cf8e468b3a41c05439d2498ab41a Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 9 Jul 2024 11:56:42 -0700 Subject: [PATCH 276/391] Fix to QP for temporal enhancement after keyframe For temporal enhancement layers in RTC mode: after a key frame use the avg_qindex_key for the ambient_qp for the active_worst_quality setting. And use last_q[KEY/INTER_FRAME] depending on keyframe distance in adjust_q_cbr(). This reduces some sawtooth-like behavior in psnr/qp observed for temporal layers with periodic keyframes. This also exposed a bug in the external RC where frames_since_key was not updated in the correct place. Change-Id: Ia63819a98823b13aefdde79b934fb30b78953986 --- av1/encoder/ratectrl.c | 22 ++++++++++++++++------ av1/ratectrl_rtc.cc | 8 ++++---- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 9793a3f847..784733d47b 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -653,9 +653,13 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality, LAYER_CONTEXT *lc = &svc->layer_context[layer]; // lc->rc.avg_frame_bandwidth and lc->p_rc.last_q correspond to the // last TL0 frame. + const int last_qindex_tl0 = + rc->frames_since_key < svc->number_temporal_layers + ?
lc->p_rc.last_q[KEY_FRAME] + : lc->p_rc.last_q[INTER_FRAME]; if (rc->avg_frame_bandwidth < lc->rc.avg_frame_bandwidth && - q < lc->p_rc.last_q[INTER_FRAME] - 4) - q = lc->p_rc.last_q[INTER_FRAME] - 4; + q < last_qindex_tl0 - 4) + q = last_qindex_tl0 - 4; } else if (cpi->svc.temporal_layer_id == 0 && p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) && rc->frame_source_sad < 100000) { @@ -1123,7 +1127,7 @@ static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) { int adjustment = 0; int active_worst_quality; int ambient_qp; - if (cm->current_frame.frame_type == KEY_FRAME) return rc->worst_quality; + if (frame_is_intra_only(cm)) return rc->worst_quality; // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] // for the first few frames following key frame. These are both initialized // to worst_quality and updated with (3/4, 1/4) average in postencode_update. @@ -1138,9 +1142,15 @@ static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) { avg_qindex_key = AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]); } - ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key) - ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key) - : p_rc->avg_frame_qindex[INTER_FRAME]; + if (svc->temporal_layer_id > 0 && + rc->frames_since_key < 2 * svc->number_temporal_layers) { + ambient_qp = avg_qindex_key; + } else { + ambient_qp = + (cm->current_frame.frame_number < num_frames_weight_key) + ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key) + : p_rc->avg_frame_qindex[INTER_FRAME]; + } ambient_qp = AOMMIN(rc->worst_quality, ambient_qp); if (p_rc->buffer_level > p_rc->optimal_buffer_level) { diff --git a/av1/ratectrl_rtc.cc b/av1/ratectrl_rtc.cc index f8c13c68c0..eab3d1de37 100644 --- a/av1/ratectrl_rtc.cc +++ b/av1/ratectrl_rtc.cc @@ -283,8 +283,6 @@ FrameDropDecision AV1RateControlRTC::ComputeQP( cpi_->svc.layer_context[layer].is_key_frame = 0; } } - if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) - cpi_->rc.frames_since_key++; if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) { av1_update_temporal_layer_framerate(cpi_); @@ -364,9 +362,11 @@ bool AV1RateControlRTC::GetSegmentationData( void AV1RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { cpi_->common.current_frame.frame_number++; - if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) - cpi_->svc.prev_number_spatial_layers = cpi_->svc.number_spatial_layers; av1_rc_postencode_update(cpi_, encoded_frame_size); + if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) { + cpi_->svc.prev_number_spatial_layers = cpi_->svc.number_spatial_layers; + cpi_->rc.frames_since_key++; + } if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) av1_save_layer_context(cpi_); -- GitLab From 3a5d8b88cb18b17ca336361842e083d88aa39c32 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Wed, 10 Jul 2024 11:46:04 -0700 Subject: [PATCH 277/391] rtc: Add comment and rename flag for high_motion_content Change-Id: I19a85277def10f457816501aba894beca9f6cf70 --- aom/aomcx.h | 6 +++--- av1/av1_cx_iface.c | 2 +- av1/encoder/aq_cyclicrefresh.c | 2 +- av1/encoder/ratectrl.c | 11 ++++++----- av1/encoder/ratectrl.h | 2 +- av1/encoder/speed_features.c | 2 +- av1/encoder/var_based_part.c | 2 +- 7 files changed, 14 insertions(+), 13 deletions(-) diff --git a/aom/aomcx.h b/aom/aomcx.h index 594d463a34..e88e8494d9 100644 --- a/aom/aomcx.h +++ 
b/aom/aomcx.h @@ -1547,10 +1547,10 @@ enum aome_enc_control_id { */ AV1E_SET_AUTO_TILES = 166, - /*!\brief Codec control to get the high motion screen content flag. - * int * parameter. + /*!\brief Codec control to get the high motion content flag, used for + * screen content realtime (RTC) encoding, int * parameter. * Returns an integer. - * 1 means high motion screen content, 0 means not. + * 1 means high motion content flag is set to 1, 0 means set to 0. */ AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC = 167, diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 403cb83bd7..68ffbd18a9 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -4440,7 +4440,7 @@ static aom_codec_err_t ctrl_get_high_motion_content_screen_rtc( int *arg = va_arg(args, int *); AV1_COMP *const cpi = ctx->ppi->cpi; if (arg == NULL) return AOM_CODEC_INVALID_PARAM; - *arg = cpi->rc.high_motion_screen_content; + *arg = cpi->rc.high_motion_content_screen_rtc; return AOM_CODEC_OK; } diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c index 4d8be31120..e4d942a0fd 100644 --- a/av1/encoder/aq_cyclicrefresh.c +++ b/av1/encoder/aq_cyclicrefresh.c @@ -439,7 +439,7 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { // should we enable cyclic refresh on this frame. cr->apply_cyclic_refresh = 1; if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) || - cpi->rc.high_motion_screen_content || scene_change_detected || + cpi->rc.high_motion_content_screen_rtc || scene_change_detected || svc->temporal_layer_id > 0 || svc->prev_number_spatial_layers != svc->number_spatial_layers || p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 784733d47b..c5b67ba3b7 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -3269,11 +3269,11 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, if (num_samples > 0) rc->percent_blocks_with_motion = ((num_samples - num_zero_temp_sad) * 100) / num_samples; - // Update the high_motion_screen_content flag on TL0. Avoid the update + // Update the high_motion_content_screen_rtc flag on TL0. Avoid the update // if too many consecutive frame drops occurred. const uint64_t thresh_high_motion = 9 * 64 * 64; if (cpi->svc.temporal_layer_id == 0 && rc->drop_count_consec < 3) { - cpi->rc.high_motion_screen_content = 0; + cpi->rc.high_motion_content_screen_rtc = 0; if (cpi->oxcf.speed >= 11 && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && rc->percent_blocks_with_motion > 40 && @@ -3281,7 +3281,7 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, rc->avg_source_sad > thresh_high_motion && rc->avg_frame_low_motion < 60 && unscaled_src->y_width >= 1280 && unscaled_src->y_height >= 720) { - cpi->rc.high_motion_screen_content = 1; + cpi->rc.high_motion_content_screen_rtc = 1; // Compute fast coarse/global motion for 128x128 superblock centered // at middle of frames, to determine if motion is scroll. int pos_col = (unscaled_src->y_width >> 1) - 64; @@ -3295,7 +3295,7 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, cpi, src_y, last_src_y, src_ystride, last_src_ystride, BLOCK_128X128, pos_col, pos_row, &best_intmv_col, &best_intmv_row); if (y_sad < 100 && (abs(best_intmv_col) > 16 || abs(best_intmv_row) > 16)) - cpi->rc.high_motion_screen_content = 0; + cpi->rc.high_motion_content_screen_rtc = 0; } // Pass the flag value to all layer frames. 
if (cpi->svc.number_spatial_layers > 1 || @@ -3307,7 +3307,8 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; - lrc->high_motion_screen_content = rc->high_motion_screen_content; + lrc->high_motion_content_screen_rtc = + rc->high_motion_content_screen_rtc; } } } diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h index 0a5cfbc178..49a0c64924 100644 --- a/av1/encoder/ratectrl.h +++ b/av1/encoder/ratectrl.h @@ -190,7 +190,7 @@ typedef struct { int sframe_due; int high_source_sad; - int high_motion_screen_content; + int high_motion_content_screen_rtc; uint64_t avg_source_sad; uint64_t prev_avg_source_sad; uint64_t frame_source_sad; diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index e0709b15ad..f27a1c3787 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -1608,7 +1608,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; } } - if (speed >= 11 && cpi->rc.high_motion_screen_content) { + if (speed >= 11 && cpi->rc.high_motion_content_screen_rtc) { sf->rt_sf.higher_thresh_scene_detection = 1; sf->rt_sf.force_only_last_ref = 1; sf->rt_sf.use_nonrd_filter_search = 0; diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c index 0b449e89d5..8f7924a162 100644 --- a/av1/encoder/var_based_part.c +++ b/av1/encoder/var_based_part.c @@ -649,7 +649,7 @@ static AOM_INLINE int64_t tune_base_thresh_content(AV1_COMP *cpi, updated_thresh_base, cpi->oxcf.speed, cm->width, cm->height, cpi->ppi->rtc_ref.non_reference_frame); if (cpi->oxcf.speed >= 11 && source_sad_nonrd > kLowSad && - cpi->rc.high_motion_screen_content) + cpi->rc.high_motion_content_screen_rtc) updated_thresh_base = updated_thresh_base << 5; return updated_thresh_base; } -- GitLab From 365784fdf03dc60d7ce2b9b9520931b6052ce89a Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 11 Jul 2024 13:55:29 -0700 Subject: [PATCH 278/391] Fix -fsanitize=integer unsigned left shift errors Fix -fsanitize=integer runtime errors like: left shift of 4294961503 by 16 places cannot be represented in type 'uint32_t' (aka 'unsigned int') For b of type int16_t, the expression (uint32_t)(uint16_t)(b) guarantees that the upper 16 bits of the uint32_t value are all zeros. Then the result of << 16 can be represented in uint32_t. 
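For illustration, working through the value from the sanitizer report above (a worked example, not part of the change): b = -5793 sign-extends to the uint32_t 4294961503 (0xFFFFE95F) under the old cast, and shifting that left by 16 is what -fsanitize=integer flags. Casting through uint16_t first yields 0xE95F = 59743, and 0xE95F << 16 = 0xE95F0000, which is representable in uint32_t.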
Change-Id: I3b6a7e2c713c42457c4b5e2d193e2533201d9491 --- aom_dsp/x86/txfm_common_avx2.h | 2 +- aom_dsp/x86/txfm_common_sse2.h | 2 +- av1/encoder/x86/pickrst_avx2.c | 2 +- av1/encoder/x86/pickrst_sse4.c | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h index 15403b9612..d4c1bc54ec 100644 --- a/aom_dsp/x86/txfm_common_avx2.h +++ b/aom_dsp/x86/txfm_common_avx2.h @@ -22,7 +22,7 @@ extern "C" { static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { return _mm256_set1_epi32( - (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); + (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))); } static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1, diff --git a/aom_dsp/x86/txfm_common_sse2.h b/aom_dsp/x86/txfm_common_sse2.h index 75f55aa1fb..96276f470b 100644 --- a/aom_dsp/x86/txfm_common_sse2.h +++ b/aom_dsp/x86/txfm_common_sse2.h @@ -17,7 +17,7 @@ #include "aom_dsp/x86/synonyms.h" #define pair_set_epi16(a, b) \ - _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) + _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))) // Reverse the 8 16 bit words in __m128i static INLINE __m128i mm_reverse_epi16(const __m128i x) { diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c index 57f0463ba5..1aefc2bac6 100644 --- a/av1/encoder/x86/pickrst_avx2.c +++ b/av1/encoder/x86/pickrst_avx2.c @@ -1542,7 +1542,7 @@ void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd, static INLINE __m256i pair_set_epi16(int a, int b) { return _mm256_set1_epi32( - (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); + (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))); } int64_t av1_lowbd_pixel_proj_error_avx2( diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c index d354b3d1b5..2ec8d12ced 100644 --- a/av1/encoder/x86/pickrst_sse4.c +++ b/av1/encoder/x86/pickrst_sse4.c @@ -731,7 +731,8 @@ void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd, } static INLINE __m128i pair_set_epi16(int a, int b) { - return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); + return _mm_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))); } int64_t av1_lowbd_pixel_proj_error_sse4_1( -- GitLab From 25f0d3d9f1289a04171d47bba48e6e2a72fc4183 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Thu, 11 Jul 2024 14:18:22 -0700 Subject: [PATCH 279/391] rtc: Avoid the qp adjustment on TL0 for key frames The adjustment was meant for delta frames. 
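For illustration (a hypothetical sequence, not part of the change): with periodic key frames and temporal layers, a key frame is also a TL0 frame, so the stable-buffer branch below could previously push the key frame's Q down even though key-frame Q is chosen separately; gating the branch on !frame_is_intra_only() restricts it to delta frames.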
Change-Id: I3be09592535943e93cbb36db17d92fe94299ee7d
---
 av1/encoder/ratectrl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index c5b67ba3b7..4d9af471dc 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -660,7 +660,7 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality,
       if (rc->avg_frame_bandwidth < lc->rc.avg_frame_bandwidth &&
           q < last_qindex_tl0 - 4)
         q = last_qindex_tl0 - 4;
-    } else if (cpi->svc.temporal_layer_id == 0 &&
+    } else if (cpi->svc.temporal_layer_id == 0 && !frame_is_intra_only(cm) &&
               p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
               rc->frame_source_sad < 100000) {
       // Push base TL0 Q down if buffer is stable and frame_source_sad
-- 
GitLab

From e1ef9bdd54995ec4170b55533d48a556eea084d0 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Thu, 11 Jul 2024 17:44:46 -0700
Subject: [PATCH 280/391] Improve error reporting in encoder_init()

Propagate the error code returned by create_stats_buffer().

Set the error detail message if the av1_create_context_and_bufferpool()
call fails.

Change-Id: I9ec91ac8c874273b8a7839489bf0fe1b5ab008f6
---
 av1/av1_cx_iface.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 68ffbd18a9..af5ec42aab 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -2863,7 +2863,7 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
 #if !CONFIG_REALTIME_ONLY
     res = create_stats_buffer(&priv->frame_stats_buffer,
                               &priv->stats_buf_context, *num_lap_buffers);
-    if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR;
+    if (res != AOM_CODEC_OK) return res;

     assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS);
     int size = get_stats_buf_size(*num_lap_buffers, MAX_LAG_BUFFERS);
@@ -2878,6 +2878,7 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
         priv->ppi, &priv->ppi->parallel_cpi[0], &priv->buffer_pool, &priv->oxcf,
         ENCODE_STAGE, -1);
     if (res != AOM_CODEC_OK) {
+      priv->base.err_detail = "av1_create_context_and_bufferpool() failed";
       return res;
     }
 #if !CONFIG_REALTIME_ONLY
-- 
GitLab

From 21f2430f34de433b5aab55815706f4b28bafcce5 Mon Sep 17 00:00:00 2001
From: Jonathan Wright <jonathan.wright@arm.com>
Date: Fri, 12 Jul 2024 14:57:18 +0100
Subject: [PATCH 281/391] Move max value clamp into Neon convolution kernels

Move the maximum value clamp in high bitdepth Neon convolutions into
the kernel itself. This is already the case for SVE and some other
Neon high bitdepth convolutions.

Change-Id: I9607435ef4f29f4e6c6344d1f313035209a74f7e
---
 av1/common/arm/highbd_convolve_neon.c | 390 ++++++++++++--------------
 1 file changed, 174 insertions(+), 216 deletions(-)

diff --git a/av1/common/arm/highbd_convolve_neon.c b/av1/common/arm/highbd_convolve_neon.c
index 2c392e381c..f4e770ae87 100644
--- a/av1/common/arm/highbd_convolve_neon.c
+++ b/av1/common/arm/highbd_convolve_neon.c
@@ -24,7 +24,7 @@ static INLINE uint16x4_t
 highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
                      const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-                     const int16x8_t y_filter) {
+                     const int16x8_t y_filter, const uint16x4_t max) {
   // Values at indices 0 and 7 of y_filter are zero.
const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -36,13 +36,14 @@ highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); - return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); + uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); + return vmin_u16(res, max); } static INLINE uint16x8_t highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t y_filter) { + const int16x8_t y_filter, const uint16x8_t max) { // Values at indices 0 and 7 of y_filter are zero. const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -61,17 +62,18 @@ highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); - return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), - vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); + return vminq_u16(res, max); } static INLINE void highbd_convolve_y_sr_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int bd) { - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)(src_ptr + src_stride); uint16_t *d = dst_ptr; @@ -84,18 +86,13 @@ static INLINE void highbd_convolve_y_sr_6tap_neon( load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x4_t d0 = - highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7); + highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max); uint16x4_t d1 = - highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7); + highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max); uint16x4_t d2 = - highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7); + highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max); uint16x4_t d3 = - highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7); - - d0 = vmin_u16(d0, vget_low_u16(max)); - d1 = vmin_u16(d1, vget_low_u16(max)); - d2 = vmin_u16(d2, vget_low_u16(max)); - d3 = vmin_u16(d3, vget_low_u16(max)); + highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); @@ -109,6 +106,7 @@ static INLINE void highbd_convolve_y_sr_6tap_neon( h -= 4; } while (h != 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); // Width is a multiple of 8 and height is a multiple of 4. 
do { int height = h; @@ -124,18 +122,13 @@ static INLINE void highbd_convolve_y_sr_6tap_neon( load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x8_t d0 = - highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7); + highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max); uint16x8_t d1 = - highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7); + highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max); uint16x8_t d2 = - highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7); + highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max); uint16x8_t d3 = - highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -156,10 +149,11 @@ static INLINE void highbd_convolve_y_sr_6tap_neon( } } -static INLINE uint16x4_t highbd_convolve8_4_y( - const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { +static INLINE uint16x4_t +highbd_convolve8_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t y_filter, const uint16x4_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -172,13 +166,15 @@ static INLINE uint16x4_t highbd_convolve8_4_y( sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); - return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); + uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); + return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve8_8_y( - const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) { +static INLINE uint16x8_t +highbd_convolve8_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t y_filter, const uint16x8_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -200,17 +196,18 @@ static INLINE uint16x8_t highbd_convolve8_8_y( sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); - return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), - vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); + return vminq_u16(res, max); } static INLINE void highbd_convolve_y_sr_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, int bd) { - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; @@ -223,18 +220,13 @@ static INLINE void highbd_convolve_y_sr_8tap_neon( load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = - highbd_convolve8_4_y(s0, 
s1, s2, s3, s4, s5, s6, s7, y_filter); + highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); uint16x4_t d1 = - highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); uint16x4_t d2 = - highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); uint16x4_t d3 = - highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); - - d0 = vmin_u16(d0, vget_low_u16(max)); - d1 = vmin_u16(d1, vget_low_u16(max)); - d2 = vmin_u16(d2, vget_low_u16(max)); - d3 = vmin_u16(d3, vget_low_u16(max)); + highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); @@ -250,6 +242,8 @@ static INLINE void highbd_convolve_y_sr_8tap_neon( h -= 4; } while (h != 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { int height = h; const int16_t *s = (const int16_t *)src_ptr; @@ -264,18 +258,13 @@ static INLINE void highbd_convolve_y_sr_8tap_neon( load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = - highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); uint16x8_t d1 = - highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); uint16x8_t d2 = - highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); - uint16x8_t d3 = - highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -302,7 +291,8 @@ static INLINE uint16x4_t highbd_convolve12_4_y( const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, - const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, + const uint16x4_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); @@ -319,7 +309,8 @@ static INLINE uint16x4_t highbd_convolve12_4_y( sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); - return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); + uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); + return vmin_u16(res, max); } static INLINE uint16x8_t highbd_convolve12_8_y( @@ -327,7 +318,8 @@ static INLINE uint16x8_t highbd_convolve12_8_y( const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, - const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, + const uint16x8_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); @@ -357,18 +349,19 @@ static INLINE uint16x8_t highbd_convolve12_8_y( sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3); - return 
vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), - vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); + return vminq_u16(res, max); } static INLINE void highbd_convolve_y_sr_12tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, int bd) { - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; @@ -383,21 +376,16 @@ static INLINE void highbd_convolve_y_sr_12tap_neon( uint16x4_t d0 = highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, - s11, y_filter_0_7, y_filter_8_11); + s11, y_filter_0_7, y_filter_8_11, max); uint16x4_t d1 = highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, - s12, y_filter_0_7, y_filter_8_11); + s12, y_filter_0_7, y_filter_8_11, max); uint16x4_t d2 = highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, - s13, y_filter_0_7, y_filter_8_11); + s13, y_filter_0_7, y_filter_8_11, max); uint16x4_t d3 = highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, - s14, y_filter_0_7, y_filter_8_11); - - d0 = vmin_u16(d0, vget_low_u16(max)); - d1 = vmin_u16(d1, vget_low_u16(max)); - d2 = vmin_u16(d2, vget_low_u16(max)); - d3 = vmin_u16(d3, vget_low_u16(max)); + s14, y_filter_0_7, y_filter_8_11, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); @@ -417,6 +405,8 @@ static INLINE void highbd_convolve_y_sr_12tap_neon( h -= 4; } while (h != 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { int height = h; const int16_t *s = (const int16_t *)src_ptr; @@ -433,21 +423,16 @@ static INLINE void highbd_convolve_y_sr_12tap_neon( uint16x8_t d0 = highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, - s11, y_filter_0_7, y_filter_8_11); + s11, y_filter_0_7, y_filter_8_11, max); uint16x8_t d1 = highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, - s12, y_filter_0_7, y_filter_8_11); + s12, y_filter_0_7, y_filter_8_11, max); uint16x8_t d2 = highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, - s13, y_filter_0_7, y_filter_8_11); + s13, y_filter_0_7, y_filter_8_11, max); uint16x8_t d3 = highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, - s13, s14, y_filter_0_7, y_filter_8_11); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + s13, s14, y_filter_0_7, y_filter_8_11, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -507,7 +492,8 @@ void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride, static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6], const int16x8_t x_filter, - const int32x4_t offset) { + const int32x4_t offset, + const uint16x8_t max) { // Values at indices 0 and 7 of y_filter are zero. 
const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); @@ -528,8 +514,9 @@ static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6], sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2); - return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), - vqrshrun_n_s32(sum1, FILTER_BITS)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + return vminq_u16(res, max); } static INLINE void highbd_convolve_x_sr_6tap_neon( @@ -559,15 +546,10 @@ static INLINE void highbd_convolve_x_sr_6tap_neon( load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5]); - uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset); - uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset); - uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset); - uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset, max); + uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset, max); + uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset, max); + uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -584,19 +566,22 @@ static INLINE void highbd_convolve_x_sr_6tap_neon( static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4], const int16x4_t x_filter, - const int32x4_t offset) { + const int32x4_t offset, + const uint16x4_t max) { int32x4_t sum = offset; sum = vmlal_lane_s16(sum, s[0], x_filter, 0); sum = vmlal_lane_s16(sum, s[1], x_filter, 1); sum = vmlal_lane_s16(sum, s[2], x_filter, 2); sum = vmlal_lane_s16(sum, s[3], x_filter, 3); - return vqrshrun_n_s32(sum, FILTER_BITS); + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + return vmin_u16(res, max); } static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8], const int16x8_t x_filter, - const int32x4_t offset) { + const int32x4_t offset, + const uint16x8_t max) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); @@ -620,8 +605,9 @@ static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8], sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); - return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), - vqrshrun_n_s32(sum1, FILTER_BITS)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + return vminq_u16(res, max); } static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, @@ -630,11 +616,11 @@ static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, const int16_t *x_filter_ptr, ConvolveParams *conv_params, int bd) { - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); // This shim allows to do only one rounding shift instead of two. const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); // 4-tap filters are used for blocks having width == 4. 
const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16_t *s = (const int16_t *)(src_ptr + 2); @@ -647,15 +633,10 @@ static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); - uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset); - uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset); - uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset); - uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset); - - d0 = vmin_u16(d0, vget_low_u16(max)); - d1 = vmin_u16(d1, vget_low_u16(max)); - d2 = vmin_u16(d2, vget_low_u16(max)); - d3 = vmin_u16(d3, vget_low_u16(max)); + uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset, max); + uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset, max); + uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset, max); + uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); @@ -664,6 +645,7 @@ static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, h -= 4; } while (h != 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; @@ -683,15 +665,10 @@ static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); - uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset); - uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset); - uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset); - uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset, max); + uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset, max); + uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset, max); + uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -709,7 +686,8 @@ static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12], const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, - const int32x4_t offset) { + const int32x4_t offset, + const uint16x4_t max) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); @@ -727,13 +705,15 @@ static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12], sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2); sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3); - return vqrshrun_n_s32(sum, FILTER_BITS); + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + return vmin_u16(res, max); } static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12], const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, - const int32x4_t offset) { + const int32x4_t offset, + const uint16x8_t max) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); @@ -765,21 +745,22 @@ static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12], sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3); - return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), - vqrshrun_n_s32(sum1, 
FILTER_BITS)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + return vminq_u16(res, max); } static INLINE void highbd_convolve_x_sr_12tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, int bd) { - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); // This shim allows to do only one rounding shift instead of two. const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; @@ -799,18 +780,13 @@ static INLINE void highbd_convolve_x_sr_12tap_neon( &s3[11]); uint16x4_t d0 = - highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset); + highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset, max); uint16x4_t d1 = - highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset); + highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset, max); uint16x4_t d2 = - highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset); + highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset, max); uint16x4_t d3 = - highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset); - - d0 = vmin_u16(d0, vget_low_u16(max)); - d1 = vmin_u16(d1, vget_low_u16(max)); - d2 = vmin_u16(d2, vget_low_u16(max)); - d3 = vmin_u16(d3, vget_low_u16(max)); + highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); @@ -819,6 +795,7 @@ static INLINE void highbd_convolve_x_sr_12tap_neon( h -= 4; } while (h != 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); int height = h; do { @@ -842,18 +819,13 @@ static INLINE void highbd_convolve_x_sr_12tap_neon( &s3[11]); uint16x8_t d0 = - highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset); + highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset, max); uint16x8_t d1 = - highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset); + highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset, max); uint16x8_t d2 = - highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset); + highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset, max); uint16x8_t d3 = - highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -904,7 +876,7 @@ static INLINE uint16x4_t highbd_convolve6_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter, const int32x4_t round_shift, - const int32x4_t offset) { + const int32x4_t offset, const uint16x4_t max) { // Values at indices 0 and 7 of y_filter are zero. 
const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -917,14 +889,15 @@ static INLINE uint16x4_t highbd_convolve6_4_2d_v( sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); sum = vshlq_s32(sum, round_shift); - return vqmovun_s32(sum); + uint16x4_t res = vqmovun_s32(sum); + return vmin_u16(res, max); } static INLINE uint16x8_t highbd_convolve6_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filter, const int32x4_t round_shift, - const int32x4_t offset) { + const int32x4_t offset, const uint16x8_t max) { // Values at indices 0 and 7 of y_filter are zero. const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -946,20 +919,21 @@ static INLINE uint16x8_t highbd_convolve6_8_2d_v( sum0 = vshlq_s32(sum0, round_shift); sum1 = vshlq_s32(sum1, round_shift); - return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); + return vminq_u16(res, max); } static INLINE void highbd_convolve_2d_sr_vert_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd, const int offset) { - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_s32 = vdupq_n_s32(offset); const int round1_shift = conv_params->round_1; const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4; @@ -970,19 +944,14 @@ static INLINE void highbd_convolve_2d_sr_vert_6tap_neon( int16x4_t s5, s6, s7, s8; load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); - uint16x4_t d0 = highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, - round1_shift_s32, offset_s32); - uint16x4_t d1 = highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, - round1_shift_s32, offset_s32); - uint16x4_t d2 = highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, - round1_shift_s32, offset_s32); - uint16x4_t d3 = highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, - round1_shift_s32, offset_s32); - - d0 = vmin_u16(d0, vget_low_u16(max)); - d1 = vmin_u16(d1, vget_low_u16(max)); - d2 = vmin_u16(d2, vget_low_u16(max)); - d3 = vmin_u16(d3, vget_low_u16(max)); + uint16x4_t d0 = highbd_convolve6_4_2d_v( + s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32, max); + uint16x4_t d1 = highbd_convolve6_4_2d_v( + s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32, max); + uint16x4_t d2 = highbd_convolve6_4_2d_v( + s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, max); + uint16x4_t d3 = highbd_convolve6_4_2d_v( + s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); @@ -996,6 +965,8 @@ static INLINE void highbd_convolve_2d_sr_vert_6tap_neon( h -= 4; } while (h != 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { int height = h; const int16_t *s = (const int16_t *)src_ptr; @@ -1008,19 +979,18 @@ static INLINE void highbd_convolve_2d_sr_vert_6tap_neon( int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); - uint16x8_t d0 = highbd_convolve6_8_2d_v( - s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32); 
- uint16x8_t d1 = highbd_convolve6_8_2d_v( - s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32); - uint16x8_t d2 = highbd_convolve6_8_2d_v( - s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32); - uint16x8_t d3 = highbd_convolve6_8_2d_v( - s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + uint16x8_t d0 = + highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, + round1_shift_s32, offset_s32, max); + uint16x8_t d1 = + highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, + round1_shift_s32, offset_s32, max); + uint16x8_t d2 = + highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, + round1_shift_s32, offset_s32, max); + uint16x8_t d3 = + highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, + round1_shift_s32, offset_s32, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -1044,7 +1014,7 @@ static INLINE uint16x4_t highbd_convolve8_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, - const int32x4_t round_shift, const int32x4_t offset) { + const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); @@ -1058,14 +1028,15 @@ static INLINE uint16x4_t highbd_convolve8_4_2d_v( sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); sum = vshlq_s32(sum, round_shift); - return vqmovun_s32(sum); + uint16x4_t res = vqmovun_s32(sum); + return vmin_u16(res, max); } static INLINE uint16x8_t highbd_convolve8_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, - const int32x4_t round_shift, const int32x4_t offset) { + const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); @@ -1090,20 +1061,21 @@ static INLINE uint16x8_t highbd_convolve8_8_2d_v( sum0 = vshlq_s32(sum0, round_shift); sum1 = vshlq_s32(sum1, round_shift); - return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); + return vminq_u16(res, max); } static INLINE void highbd_convolve_2d_sr_vert_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd, const int offset) { - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_s32 = vdupq_n_s32(offset); const int round1_shift = conv_params->round_1; const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; @@ -1117,21 +1089,16 @@ static INLINE void highbd_convolve_2d_sr_vert_8tap_neon( uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, - round1_shift_s32, offset_s32); + round1_shift_s32, offset_s32, max); uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, - round1_shift_s32, offset_s32); + round1_shift_s32, offset_s32, max); uint16x4_t d2 = 
highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, - round1_shift_s32, offset_s32); + round1_shift_s32, offset_s32, max); uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, - round1_shift_s32, offset_s32); - - d0 = vmin_u16(d0, vget_low_u16(max)); - d1 = vmin_u16(d1, vget_low_u16(max)); - d2 = vmin_u16(d2, vget_low_u16(max)); - d3 = vmin_u16(d3, vget_low_u16(max)); + round1_shift_s32, offset_s32, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); @@ -1147,6 +1114,8 @@ static INLINE void highbd_convolve_2d_sr_vert_8tap_neon( h -= 4; } while (h != 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { int height = h; const int16_t *s = (const int16_t *)src_ptr; @@ -1162,21 +1131,16 @@ static INLINE void highbd_convolve_2d_sr_vert_8tap_neon( uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, - round1_shift_s32, offset_s32); + round1_shift_s32, offset_s32, max); uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, - round1_shift_s32, offset_s32); + round1_shift_s32, offset_s32, max); uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, - round1_shift_s32, offset_s32); + round1_shift_s32, offset_s32, max); uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, - round1_shift_s32, offset_s32); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + round1_shift_s32, offset_s32, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -1204,7 +1168,7 @@ static INLINE uint16x4_t highbd_convolve12_4_2d_v( const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, - const int32x4_t round_shift, const int32x4_t offset) { + const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); @@ -1222,7 +1186,8 @@ static INLINE uint16x4_t highbd_convolve12_4_2d_v( sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); sum = vshlq_s32(sum, round_shift); - return vqmovun_s32(sum); + uint16x4_t res = vqmovun_s32(sum); + return vmin_u16(res, max); } static INLINE uint16x8_t highbd_convolve12_8_2d_v( @@ -1231,7 +1196,7 @@ static INLINE uint16x8_t highbd_convolve12_8_2d_v( const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, - const int32x4_t round_shift, const int32x4_t offset) { + const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); @@ -1264,14 +1229,14 @@ static INLINE uint16x8_t highbd_convolve12_8_2d_v( sum0 = vshlq_s32(sum0, round_shift); sum1 = vshlq_s32(sum1, round_shift); - return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); + return vminq_u16(res, max); } static INLINE void highbd_convolve_2d_sr_vert_12tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, const int bd, const int offset) { - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t y_filter_0_7 = 
vld1q_s16(y_filter_ptr); const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); const int32x4_t offset_s32 = vdupq_n_s32(offset); @@ -1279,6 +1244,7 @@ static INLINE void highbd_convolve_2d_sr_vert_12tap_neon( const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; @@ -1293,21 +1259,16 @@ static INLINE void highbd_convolve_2d_sr_vert_12tap_neon( uint16x4_t d0 = highbd_convolve12_4_2d_v( s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, - y_filter_8_11, round1_shift_s32, offset_s32); + y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x4_t d1 = highbd_convolve12_4_2d_v( s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, - y_filter_8_11, round1_shift_s32, offset_s32); + y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x4_t d2 = highbd_convolve12_4_2d_v( s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, - y_filter_8_11, round1_shift_s32, offset_s32); + y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x4_t d3 = highbd_convolve12_4_2d_v( s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, - y_filter_8_11, round1_shift_s32, offset_s32); - - d0 = vmin_u16(d0, vget_low_u16(max)); - d1 = vmin_u16(d1, vget_low_u16(max)); - d2 = vmin_u16(d2, vget_low_u16(max)); - d3 = vmin_u16(d3, vget_low_u16(max)); + y_filter_8_11, round1_shift_s32, offset_s32, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); @@ -1327,6 +1288,8 @@ static INLINE void highbd_convolve_2d_sr_vert_12tap_neon( h -= 4; } while (h != 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { int height = h; const int16_t *s = (const int16_t *)src_ptr; @@ -1343,21 +1306,16 @@ static INLINE void highbd_convolve_2d_sr_vert_12tap_neon( uint16x8_t d0 = highbd_convolve12_8_2d_v( s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, - y_filter_8_11, round1_shift_s32, offset_s32); + y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x8_t d1 = highbd_convolve12_8_2d_v( s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, - y_filter_8_11, round1_shift_s32, offset_s32); + y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x8_t d2 = highbd_convolve12_8_2d_v( s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, - y_filter_8_11, round1_shift_s32, offset_s32); + y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x8_t d3 = highbd_convolve12_8_2d_v( s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, - y_filter_8_11, round1_shift_s32, offset_s32); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + y_filter_8_11, round1_shift_s32, offset_s32, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); -- GitLab From ff710a53216b2100c2480eff4cb9d7cbc9656a4a Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 11 Jul 2024 16:06:33 -0700 Subject: [PATCH 282/391] Fix integer overflows when encoding a huge frame Fix signed and unsigned integer overflows when encoding a frame of size 38464 x 28832. For more information, see https://github.com/AOMediaCodec/libavif/issues/1111#issuecomment-2218346811. 
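A short sketch of the failure mode (illustrative only; the dimensions come
from the report above, and 12 bits per pixel assumes an 8-bit 4:2:0 input):

  #include <inttypes.h>
  #include <stdio.h>

  int main(void) {
    const uint32_t w = 38464, h = 28832, bps = 12;
    // Both dimensions are already multiples of 32, so 32-pixel alignment
    // leaves them unchanged. w * h = 1,108,994,048 still fits in 32 bits,
    // but the next multiply does not: 1,108,994,048 * 12 > 2^32.
    // Widening to 64 bits before multiplying, as the fix does, avoids it.
    const uint64_t sz = (uint64_t)w * h * bps / 8;
    // On targets where size_t is 32 bits, the result must additionally be
    // range-checked against SIZE_MAX before use as an allocation size.
    printf("%" PRIu64 " bytes\n", sz);  // prints 1663491072
    return 0;
  }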
Change-Id: I78131498a198c5bd157bbb9fcdf5042ae71affc9 --- av1/av1_cx_iface.c | 7 +++++-- av1/encoder/encodeframe.c | 3 ++- av1/encoder/encodetxb.c | 23 +++++++++++++++++++++-- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index af5ec42aab..53e7139bb0 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -3041,8 +3041,9 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, res = validate_img(ctx, img); if (res == AOM_CODEC_OK) { const size_t uncompressed_frame_sz = - ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) * - ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * get_image_bps(img) / 8; + (size_t)((uint64_t)ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) * + ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * + get_image_bps(img) / 8); // Due to the presence of no-show frames, the ctx->cx_data buffer holds // compressed data corresponding to multiple frames. As no-show frames are @@ -3057,6 +3058,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, if (ppi->cpi->oxcf.kf_cfg.key_freq_max == 0 && !ppi->cpi->oxcf.kf_cfg.fwd_kf_enabled) multiplier = 2; + if (uncompressed_frame_sz > SIZE_MAX / multiplier) + return AOM_CODEC_MEM_ERROR; size_t data_sz = uncompressed_frame_sz * multiplier; if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize; if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) { diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index cac8d81516..de09b70887 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -2112,7 +2112,8 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { for (j = TX_TYPES - 1; j >= 0; j--) { int update_txtype_frameprobs = 1; const int new_prob = - sum ? MAX_TX_TYPE_PROB * cpi->td.rd_counts.tx_type_used[i][j] / sum + sum ? (int)((int64_t)MAX_TX_TYPE_PROB * + cpi->td.rd_counts.tx_type_used[i][j] / sum) : (j ? 0 : MAX_TX_TYPE_PROB); #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c index a6452a9bec..32d67463fb 100644 --- a/av1/encoder/encodetxb.c +++ b/av1/encoder/encodetxb.c @@ -11,6 +11,8 @@ #include "av1/encoder/encodetxb.h" +#include <stdint.h> + #include "aom_ports/mem.h" #include "av1/common/blockd.h" #include "av1/common/idct.h" @@ -38,20 +40,37 @@ void av1_alloc_txb_buf(AV1_COMP *cpi) { 1 << num_pels_log2_lookup[cm->seq_params->sb_size]; const int chroma_max_sb_square = luma_max_sb_square >> (subsampling_x + subsampling_y); - const int num_tcoeffs = - size * (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square); + const int total_max_sb_square = + (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square); + if ((size_t)size > SIZE_MAX / (size_t)total_max_sb_square) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "A multiplication would overflow size_t"); + } + const size_t num_tcoeffs = (size_t)size * (size_t)total_max_sb_square; const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN; av1_free_txb_buf(cpi); // TODO(jingning): This should be further reduced. 
CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base, aom_malloc(sizeof(*cpi->coeff_buffer_base) * size)); + if (sizeof(*coeff_buf_pool->tcoeff) > SIZE_MAX / num_tcoeffs) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "A multiplication would overflow size_t"); + } CHECK_MEM_ERROR( cm, coeff_buf_pool->tcoeff, aom_memalign(32, sizeof(*coeff_buf_pool->tcoeff) * num_tcoeffs)); + if (sizeof(*coeff_buf_pool->eobs) > SIZE_MAX / num_tcoeffs) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "A multiplication would overflow size_t"); + } CHECK_MEM_ERROR( cm, coeff_buf_pool->eobs, aom_malloc(sizeof(*coeff_buf_pool->eobs) * num_tcoeffs / txb_unit_size)); + if (sizeof(*coeff_buf_pool->entropy_ctx) > SIZE_MAX / num_tcoeffs) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "A multiplication would overflow size_t"); + } CHECK_MEM_ERROR(cm, coeff_buf_pool->entropy_ctx, aom_malloc(sizeof(*coeff_buf_pool->entropy_ctx) * num_tcoeffs / txb_unit_size)); -- GitLab From 55740da9a581c3cf110df16697295b69240a28d0 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Sun, 7 Jul 2024 21:46:30 -0700 Subject: [PATCH 283/391] rtc: Add postencode frame drop feature Allows for dropping a frame after encoding, based on buffer_level and encoded frame size. If decision is to drop: a flag is set (rc->force_max_q) to force next incoming frame to be encoded at max_q. Only used for RTC and when the frame dropper is on (drop_frames_water_mark is above 0). Disable usage for the external RC. Feature is useful to avoid big overshoot, for example, when a scene/slide change is not detected. Feature is off by default, added control to enable it. Modified an existing test to make the postencode frame drop trigger more often. Change-Id: If55df612cf8fec630f753dd11b796a7c9673ddd8 --- aom/aomcx.h | 10 +++++++ av1/av1_cx_iface.c | 12 ++++++++ av1/encoder/encoder.c | 8 +++++ av1/encoder/encoder.h | 9 ++++++ av1/encoder/encoder_utils.c | 5 ++++ av1/encoder/ratectrl.c | 53 ++++++++++++++++++++++++++++++++++ av1/encoder/ratectrl.h | 15 ++++++++++ av1/encoder/svc_layercontext.c | 2 ++ examples/svc_encoder_rtc.cc | 2 ++ test/datarate_test.h | 1 + test/svc_datarate_test.cc | 13 +++++---- 11 files changed, 125 insertions(+), 5 deletions(-) diff --git a/aom/aomcx.h b/aom/aomcx.h index e88e8494d9..9466757ee1 100644 --- a/aom/aomcx.h +++ b/aom/aomcx.h @@ -1554,6 +1554,13 @@ enum aome_enc_control_id { */ AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC = 167, + /*!\brief Codec control to enable postencode frame drop for RTC encoding, + * int parameter. Value of 1 means encoder will enable postencode + * drop, Default is 0 (not enabled). Postencode drop is only allowed + * when frame dropping is enabled (rc_dropframe_thresh > 0). + */ + AV1E_SET_POSTENCODE_DROP_RTC = 168, + // Any new encoder control IDs should be added above. // Maximum allowed encoder control ID is 229. // No encoder control ID should be added below. @@ -2217,6 +2224,9 @@ AOM_CTRL_USE_TYPE(AV1E_SET_AUTO_TILES, unsigned int) AOM_CTRL_USE_TYPE(AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC, int *) #define AOM_CTRL_AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC +AOM_CTRL_USE_TYPE(AV1E_SET_POSTENCODE_DROP_RTC, int) +#define AOM_CTRL_AV1E_SET_POSTENCODE_DROP_RTC + /*!\endcond */ /*! 
@} - end defgroup aom_encoder */ #ifdef __cplusplus diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 53e7139bb0..9a8e3ee0c2 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -2701,6 +2701,17 @@ static aom_codec_err_t ctrl_set_auto_tiles(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_postencode_drop_rtc(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; + int enable_postencode_drop = CAST(AV1E_SET_POSTENCODE_DROP_RTC, args); + if (enable_postencode_drop > 1 || enable_postencode_drop < 0) + return AOM_CODEC_INVALID_PARAM; + cpi->rc.postencode_drop = enable_postencode_drop; + return AOM_CODEC_OK; +} + #if !CONFIG_REALTIME_ONLY static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, STATS_BUFFER_CTX *stats_buf_context, @@ -4595,6 +4606,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, ctrl_set_max_consec_frame_drop_cbr }, { AV1E_SET_SVC_FRAME_DROP_MODE, ctrl_set_svc_frame_drop_mode }, { AV1E_SET_AUTO_TILES, ctrl_set_auto_tiles }, + { AV1E_SET_POSTENCODE_DROP_RTC, ctrl_set_postencode_drop_rtc }, // Getters { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index f8fca381d9..047c4395a7 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -2502,6 +2502,9 @@ static int encode_without_recode(AV1_COMP *cpi) { ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0; + if (cpi->rc.postencode_drop && allow_postencode_drop_rtc(cpi)) + av1_save_all_coding_context(cpi); + set_size_independent_vars(cpi); av1_setup_frame_size(cpi); cm->prev_frame = get_primary_ref_frame_buf(cm); @@ -3285,6 +3288,11 @@ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size, end_timing(cpi, av1_pack_bitstream_final_time); #endif + if (cpi->rc.postencode_drop && allow_postencode_drop_rtc(cpi) && + av1_postencode_drop_cbr(cpi, size)) { + return AOM_CODEC_OK; + } + // Compute sse and rate. if (sse != NULL) { #if CONFIG_AV1_HIGHBITDEPTH diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h index 38fe8a6fa3..8b3ddb5776 100644 --- a/av1/encoder/encoder.h +++ b/av1/encoder/encoder.h @@ -4107,6 +4107,15 @@ static INLINE int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) { !cpi->ppi->rtc_ref.set_ref_frame_config; } +// Check if postencode drop is allowed. 
+static INLINE int allow_postencode_drop_rtc(const AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  return is_one_pass_rt_params(cpi) && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+         cpi->oxcf.rc_cfg.drop_frames_water_mark > 0 &&
+         !cpi->rc.rtc_external_ratectrl && !frame_is_intra_only(cm) &&
+         cpi->svc.spatial_layer_id == 0;
+}
+
 // Function return size of frame stats buffer
 static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
   /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index 3419c09ca2..64bafe1260 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -561,6 +561,11 @@ void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
   *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height,
                                 cpi->gf_frame_index, bottom_index, top_index);

+  if (cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->rc.force_max_q) {
+    *q = cpi->rc.worst_quality;
+    cpi->rc.force_max_q = 0;
+  }
+
 #if !CONFIG_REALTIME_ONLY
   if (cpi->oxcf.rc_cfg.mode == AOM_Q &&
       cpi->ppi->tpl_data.tpl_frame[cpi->gf_frame_index].is_valid &&
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index 4d9af471dc..13473a36b6 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -442,6 +442,8 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) {
   rc->frame_level_fast_extra_bits = 0;
   rc->use_external_qp_one_pass = 0;
   rc->percent_blocks_inactive = 0;
+  rc->force_max_q = 0;
+  rc->postencode_drop = 0;
 }

 static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level,
@@ -3780,3 +3782,54 @@ int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) {
   }
   return 1;
 }
+
+int av1_postencode_drop_cbr(AV1_COMP *cpi, size_t *size) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  size_t frame_size = *size << 3;
+  const int64_t new_buffer_level =
+      p_rc->buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size;
+  // For now we drop if new buffer level (given the encoded frame size) goes
+  // below 0 and encoded frame size is much larger than per-frame-bandwidth.
+  // If the frame is already labelled as scene change (high_source_sad = 1)
+  // or the QP is close to max, then no need to drop.
+  const int qp_thresh = 3 * (cpi->rc.worst_quality >> 2);
+  if (!cpi->rc.high_source_sad && new_buffer_level < 0 &&
+      frame_size > 8 * (unsigned int)cpi->rc.avg_frame_bandwidth &&
+      cpi->common.quant_params.base_qindex < qp_thresh) {
+    *size = 0;
+    cpi->is_dropped_frame = true;
+    restore_all_coding_context(cpi);
+    av1_rc_postencode_update_drop_frame(cpi);
+    // Force max_q on next frame. Reset some RC parameters.
+    cpi->rc.force_max_q = 1;
+    p_rc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality;
+    p_rc->buffer_level = p_rc->optimal_buffer_level;
+    p_rc->bits_off_target = p_rc->optimal_buffer_level;
+    cpi->rc.rc_1_frame = 0;
+    cpi->rc.rc_2_frame = 0;
+    if (cpi->svc.number_spatial_layers > 1 ||
+        cpi->svc.number_temporal_layers > 1) {
+      SVC *svc = &cpi->svc;
+      // Postencode drop is only checked on base spatial layer,
+      // for now if max-q is set on base we force it on all layers.
+      for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+        for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+          const int layer =
+              LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+          LAYER_CONTEXT *lc = &svc->layer_context[layer];
+          RATE_CONTROL *lrc = &lc->rc;
+          PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+          // Force max_q on next frame.
Reset some RC parameters. + lrc->force_max_q = 1; + lp_rc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; + lp_rc->buffer_level = lp_rc->optimal_buffer_level; + lp_rc->bits_off_target = lp_rc->optimal_buffer_level; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + } + } + } + return 1; + } + return 0; +} diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h index 49a0c64924..64f1915c94 100644 --- a/av1/encoder/ratectrl.h +++ b/av1/encoder/ratectrl.h @@ -208,6 +208,8 @@ typedef struct { int prev_frame_is_dropped; int drop_count_consec; int max_consec_drop; + int force_max_q; + int postencode_drop; /*! * Frame number for encoded frames (non-dropped). @@ -823,6 +825,19 @@ void av1_get_one_pass_rt_params(struct AV1_COMP *cpi, */ int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q); +/*!\brief Check if frame should be dropped, for RTC mode. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in,out] size Size of encoded frame + * + * \return 1 if frame is to be dropped, 0 otherwise (no drop). + * Set cpi->rc.force_max_q if frame is to be dropped, and updates are + * made to rate control parameters. *size is set to 0 when this + * function returns 1 (frame is dropped). + */ +int av1_postencode_drop_cbr(struct AV1_COMP *cpi, size_t *size); + /*!\brief Compute the q_indices for a single frame. * * Intended to be used with AOM_Q mode. diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c index b0df3f447e..1b709c84a3 100644 --- a/av1/encoder/svc_layercontext.c +++ b/av1/encoder/svc_layercontext.c @@ -226,6 +226,7 @@ void av1_restore_layer_context(AV1_COMP *const cpi) { const int old_frame_since_key = cpi->rc.frames_since_key; const int old_frame_to_key = cpi->rc.frames_to_key; const int max_consec_drop = cpi->rc.max_consec_drop; + const int postencode_drop = cpi->rc.postencode_drop; // Restore layer rate control. cpi->rc = lc->rc; cpi->ppi->p_rc = lc->p_rc; @@ -240,6 +241,7 @@ void av1_restore_layer_context(AV1_COMP *const cpi) { cpi->rc.frames_to_key = old_frame_to_key; // Reset to value before the layer restore. cpi->rc.max_consec_drop = max_consec_drop; + cpi->rc.postencode_drop = postencode_drop; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. 
   if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
diff --git a/examples/svc_encoder_rtc.cc b/examples/svc_encoder_rtc.cc
index 0e9eba9c53..d034b1806b 100644
--- a/examples/svc_encoder_rtc.cc
+++ b/examples/svc_encoder_rtc.cc
@@ -1708,6 +1708,8 @@ int main(int argc, const char **argv) {
   aom_codec_control(&codec, AV1E_SET_SVC_FRAME_DROP_MODE,
                     AOM_FULL_SUPERFRAME_DROP);

+  aom_codec_control(&codec, AV1E_SET_POSTENCODE_DROP_RTC, 1);
+
   svc_params.number_spatial_layers = ss_number_layers;
   svc_params.number_temporal_layers = ts_number_layers;
   for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
diff --git a/test/datarate_test.h b/test/datarate_test.h
index af35dff8dc..5b5c45e1a5 100644
--- a/test/datarate_test.h
+++ b/test/datarate_test.h
@@ -83,6 +83,7 @@ class DatarateTest : public ::libaom_test::EncoderTest {
       encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
       encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
       encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_POSTENCODE_DROP_RTC, 1);
     }
     if (screen_mode_) {
       encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
     }
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 2a540e1e2b..633c279f6d 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -129,6 +129,7 @@ class DatarateTestSVC
       if (screen_mode_) {
         encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
       }
+      encoder->Control(AV1E_SET_POSTENCODE_DROP_RTC, 1);
     }
     if (number_spatial_layers_ == 2) {
       spatial_layer_id = (layer_frame_cnt_ % 2 == 0) ? 0 : 1;
@@ -1009,9 +1010,9 @@ class DatarateTestSVC
   }

   virtual void BasicRateTargetingSVC2TL1SLScreenDropFrameTest() {
-    cfg_.rc_buf_initial_sz = 500;
-    cfg_.rc_buf_optimal_sz = 500;
-    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 50;
+    cfg_.rc_buf_optimal_sz = 50;
+    cfg_.rc_buf_sz = 100;
     cfg_.rc_dropframe_thresh = 30;
     cfg_.rc_min_quantizer = 0;
     cfg_.rc_max_quantizer = 52;
@@ -1034,7 +1035,7 @@ class DatarateTestSVC
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.75)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.5)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.8)
           << " The datarate for the file is greater than target by too much!";
     }
     // Top temporal layers are non_reference, so exclude them from
@@ -2473,10 +2474,12 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLScreen) {
 }

 // Check basic rate targeting for CBR, for 2 temporal layers, 1 spatial
-// for screen mode, with frame dropper on at low bitrates
+// for screen mode, with frame dropper on at low bitrates. Use small
+// values of rc_buf_initial/optimal/sz to trigger postencode frame drop.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLScreenDropFrame) {
   BasicRateTargetingSVC2TL1SLScreenDropFrameTest();
 }
+
 // Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal
 // for screen mode.
TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLScreen) {
-- 
GitLab


From e756a62b37a32abe4159b66a2380f546b6b95ea6 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Mon, 15 Jul 2024 11:38:36 -0700
Subject: [PATCH 284/391] Fix integer overflow in
 _calc_iframe_target_size_one_pass_cbr

Fix same as in libvpx:
vp9: https://chromium-review.googlesource.com/c/webm/libvpx/+/5545799
vp8: https://chromium-review.googlesource.com/c/webm/libvpx/+/4793692

Bug: b/352414650
Change-Id: I496418f23439c0d980ee71ab35dd5f6ac3c2a6e6
---
 av1/encoder/ratectrl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index 13473a36b6..0acbef6a9b 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -2786,7 +2786,7 @@ int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
     if (rc->frames_since_key < framerate / 2) {
       kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
     }
-    target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+    target = ((int64_t)(16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
   }
   return av1_rc_clamp_iframe_target_size(cpi, target);
 }
-- 
GitLab


From 570e8c426637a169e0542a412db6275fdc9ec730 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Mon, 15 Jul 2024 10:03:05 -0700
Subject: [PATCH 285/391] Validate uncompressed_frame_sz can hold the result

When calculating uncompressed_frame_sz, we use
ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) and
ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) instead of ctx->cfg.g_w
and ctx->cfg.g_h, so the result may be greater than the sum of the
plane buffer sizes in the input `img`. Therefore we can't use the
successful allocation of the plane buffers in `img` to justify that
uncompressed_frame_sz can be represented in size_t.

Change-Id: I6e434efa1136067c811f5a1876c6d63472f6abc1
---
 av1/av1_cx_iface.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 9a8e3ee0c2..069091179e 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -3051,10 +3051,13 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
   if (img != NULL) {
     res = validate_img(ctx, img);
     if (res == AOM_CODEC_OK) {
-      const size_t uncompressed_frame_sz =
-          (size_t)((uint64_t)ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) *
-                   ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) *
-                   get_image_bps(img) / 8);
+      const uint64_t uncompressed_frame_sz64 =
+          (uint64_t)ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) *
+          ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * get_image_bps(img) / 8;
+#if UINT64_MAX > SIZE_MAX
+      if (uncompressed_frame_sz64 > SIZE_MAX) return AOM_CODEC_MEM_ERROR;
+#endif
+      const size_t uncompressed_frame_sz = (size_t)uncompressed_frame_sz64;

       // Due to the presence of no-show frames, the ctx->cx_data buffer holds
       // compressed data corresponding to multiple frames. As no-show frames are
-- 
GitLab


From b90f90119aa059b1edfbd1270dd82e5ef3955c75 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Tue, 16 Jul 2024 16:36:24 -0700
Subject: [PATCH 286/391] Fix overflow in av1_rc_clamp_pframe_target_size()

Port the changes in
https://aomedia-review.googlesource.com/c/aom/+/154784 to
av1_rc_clamp_pframe_target_size().

Port some of the changes in
https://aomedia-review.googlesource.com/c/aom/+/107521 to
av1_calc_pframe_target_size_one_pass_cbr().
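As a standalone sketch of the 32-bit overflow these changes avoid (the
values below are hypothetical, chosen only so the intermediate product
exceeds INT_MAX; they are not taken from the patches):

#include <inttypes.h>
#include <stdio.h>

int main(void) {
  int kf_boost = 4000;                /* large boost, e.g. a static clip */
  int avg_frame_bandwidth = 2000000;  /* ~2 Mbit per frame */
  /* In 32-bit arithmetic, (16 + 4000) * 2000000 = 8,032,000,000, which
   * overflows int (undefined behavior). Widening one operand first
   * keeps the whole intermediate in 64 bits: */
  int64_t target = ((int64_t)(16 + kf_boost) * avg_frame_bandwidth) >> 4;
  printf("target = %" PRId64 " bits\n", target); /* prints 502000000 */
  return 0;
}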
The goal is to make sure that all four combinations of av1_calc_{iframe,pframe}_target_size_one_pass_{vbr,cbr}() are fixed in the same way. Change-Id: I355906a07f31b6010c68de44d184632bdfb6c3fe --- av1/encoder/ratectrl.c | 29 +++++++++++++++-------------- av1/encoder/ratectrl.h | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 0acbef6a9b..c4048b6ce6 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -218,10 +218,10 @@ int av1_estimate_bits_at_q(const AV1_COMP *cpi, int q, (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); } -int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target, +int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int64_t target, FRAME_UPDATE_TYPE frame_update_type) { const RATE_CONTROL *rc = &cpi->rc; - const AV1EncoderConfig *oxcf = &cpi->oxcf; + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; const int min_frame_target = AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); // Clip the frame target to the minimum setup value. @@ -238,13 +238,13 @@ int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target, // Clip the frame target to the maximum allowed value. if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; - if (oxcf->rc_cfg.max_inter_bitrate_pct) { - const int max_rate = - rc->avg_frame_bandwidth * oxcf->rc_cfg.max_inter_bitrate_pct / 100; + if (rc_cfg->max_inter_bitrate_pct) { + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100; target = AOMMIN(target, max_rate); } - return target; + return (int)target; } int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int64_t target) { @@ -2698,11 +2698,10 @@ int av1_calc_pframe_target_size_one_pass_vbr( target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) / (p_rc->baseline_gf_interval + af_ratio - 1); } - if (target > INT_MAX) target = INT_MAX; #else target = rc->avg_frame_bandwidth; #endif - return av1_rc_clamp_pframe_target_size(cpi, (int)target, frame_update_type); + return av1_rc_clamp_pframe_target_size(cpi, target, frame_update_type); } int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) { @@ -2722,16 +2721,17 @@ int av1_calc_pframe_target_size_one_pass_cbr( const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100; int min_frame_target = AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); - int target; + int64_t target; if (rc_cfg->gf_cbr_boost_pct) { const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100; if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) { - target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * af_ratio_pct) / (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); } else { - target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) / + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * + 100) / (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); } } else { @@ -2760,11 +2760,12 @@ int av1_calc_pframe_target_size_one_pass_cbr( target += (target * pct_high) / 200; } if (rc_cfg->max_inter_bitrate_pct) { - const int max_rate = - rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100; + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100; target = AOMMIN(target, max_rate); } - return AOMMAX(min_frame_target, target); + 
if (target > INT_MAX) target = INT_MAX; + return AOMMAX(min_frame_target, (int)target); } int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h index 64f1915c94..c38d884607 100644 --- a/av1/encoder/ratectrl.h +++ b/av1/encoder/ratectrl.h @@ -694,7 +694,7 @@ int av1_rc_bits_per_mb(const struct AV1_COMP *cpi, FRAME_TYPE frame_type, int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi, int64_t target); int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi, - int target, uint8_t frame_update_type); + int64_t target, uint8_t frame_update_type); // Find q_index corresponding to desired_q, within [best_qindex, worst_qindex]. // To be precise, 'q_index' is the smallest integer, for which the corresponding -- GitLab From 8bc0414a1c6eb52ae274e61f95ee92b0f358c1a0 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Mon, 15 Jul 2024 15:52:24 -0700 Subject: [PATCH 287/391] Fix keyframe counter update for dropped frames. The counters are always updated in update_rc_counters(), for encoded or dropped frames. So the additional update in postencode_update_drop_frame() is not needed. Remove the update in postencode_update_drop_frame(). And this then requires an update in RateControlRTC::ComputeQP(), to keep external and internal RC the same. Change-Id: I2f1f67f8aa6586603801d5d65889445e2000d720 --- av1/encoder/ratectrl.c | 4 ---- av1/ratectrl_rtc.cc | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index c4048b6ce6..9750ce911f 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -2431,10 +2431,6 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { // Update buffer level with zero size, update frame counters, and return. update_buffer_level(cpi, 0); - if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { - cpi->rc.frames_since_key++; - cpi->rc.frames_to_key--; - } cpi->rc.rc_2_frame = 0; cpi->rc.rc_1_frame = 0; cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; diff --git a/av1/ratectrl_rtc.cc b/av1/ratectrl_rtc.cc index eab3d1de37..7f1640a77c 100644 --- a/av1/ratectrl_rtc.cc +++ b/av1/ratectrl_rtc.cc @@ -309,6 +309,8 @@ FrameDropDecision AV1RateControlRTC::ComputeQP( av1_rc_drop_frame(cpi_)) { cpi_->is_dropped_frame = true; av1_rc_postencode_update_drop_frame(cpi_); + if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) + cpi_->rc.frames_since_key++; cpi_->frame_index_set.show_frame_count++; cpi_->common.current_frame.frame_number++; return FrameDropDecision::kDrop; -- GitLab From 2dd1d96aa2e4cfa1a4ed90fba258ebe9733b8a8e Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 18 Jul 2024 15:11:46 -0700 Subject: [PATCH 288/391] README.md: update issue tracker link https://bugs.chromium.org/p/aomedia/issues/list -> https://aomedia.issues.chromium.org/ Change-Id: Ib05fb8dc1c127d47ce093b87c21a9d07cd5f6545 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e6e822b8fb..814ee898a5 100644 --- a/README.md +++ b/README.md @@ -700,4 +700,4 @@ please email aomediacodec@jointdevelopment.kavi.com for help. ## Bug reports {#bug-reports} Bug reports can be filed in the Alliance for Open Media -[issue tracker](https://bugs.chromium.org/p/aomedia/issues/list). +[issue tracker](https://aomedia.issues.chromium.org/). 
-- GitLab From 93b8eee4f428675195c5c76e8da719ff50c2a01c Mon Sep 17 00:00:00 2001 From: George Steed <george.steed@arm.com> Date: Fri, 28 Jun 2024 10:39:49 +0100 Subject: [PATCH 289/391] {,highbd_}intrapred_neon.c: Fix unaligned accesses in z2 preds The z2 predictors load 32-bit chunks of data at variable offsets, however on AArch32 hardware with alignment checks enabled this can cause alignment faults since the data is not guaranteed to be 32-bit aligned. To work around this we can make use of the load_unaligned_u8_4x1 in some cases and in others simply memcpy to avoid making any alignment guarantees for the data being loaded. This also reverts commit 5a46d2961fb233c8f099a7bc18a7a54c8883813b, re-enabling the predictors for 32-bit Arm platforms. Bug: b/349428506 Change-Id: Ib15a7993e50b4bb6dd2eb4de95fba1dc33d3cd3f --- aom_dsp/arm/highbd_intrapred_neon.c | 32 +++++++++------------ aom_dsp/arm/intrapred_neon.c | 7 ++--- aom_dsp/arm/mem_neon.h | 26 +++++++++++++++++ av1/common/av1_rtcd_defs.pl | 14 ++------- test/dr_prediction_test.cc | 44 ----------------------------- 5 files changed, 43 insertions(+), 80 deletions(-) diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c index 5e6118dc6f..71d133e814 100644 --- a/aom_dsp/arm/highbd_intrapred_neon.c +++ b/aom_dsp/arm/highbd_intrapred_neon.c @@ -16,6 +16,7 @@ #include "config/av1_rtcd.h" #include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_dsp/intrapred_common.h" @@ -1604,8 +1605,6 @@ static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16( } #endif // AOM_ARCH_AARCH64 -// TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed. -#if AOM_ARCH_AARCH64 static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4( const uint16_t *left, const int16x4_t indices, int n) { assert(n > 0); @@ -1625,13 +1624,13 @@ static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4( // At time of writing both Clang and GCC produced better code with these // nested if-statements compared to a switch statement with fallthrough. - ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0); + load_unaligned_u32_2x1_lane(ret0_u32, left + idx0, 0); if (n > 1) { - ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1); + load_unaligned_u32_2x1_lane(ret0_u32, left + idx1, 1); if (n > 2) { - ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx2), ret1_u32, 0); + load_unaligned_u32_2x1_lane(ret1_u32, left + idx2, 0); if (n > 3) { - ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx3), ret1_u32, 1); + load_unaligned_u32_2x1_lane(ret1_u32, left + idx3, 1); } } } @@ -1665,25 +1664,21 @@ static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8( // At time of writing both Clang and GCC produced better code with these // nested if-statements compared to a switch statement with fallthrough. 
- ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0); + load_unaligned_u32_4x1_lane(ret0_u32, left + idx0, 0); if (n > 1) { - ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1); + load_unaligned_u32_4x1_lane(ret0_u32, left + idx1, 1); if (n > 2) { - ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx2), ret0_u32, 2); + load_unaligned_u32_4x1_lane(ret0_u32, left + idx2, 2); if (n > 3) { - ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx3), ret0_u32, 3); + load_unaligned_u32_4x1_lane(ret0_u32, left + idx3, 3); if (n > 4) { - ret1_u32 = - vld1q_lane_u32((const uint32_t *)(left + idx4), ret1_u32, 0); + load_unaligned_u32_4x1_lane(ret1_u32, left + idx4, 0); if (n > 5) { - ret1_u32 = - vld1q_lane_u32((const uint32_t *)(left + idx5), ret1_u32, 1); + load_unaligned_u32_4x1_lane(ret1_u32, left + idx5, 1); if (n > 6) { - ret1_u32 = - vld1q_lane_u32((const uint32_t *)(left + idx6), ret1_u32, 2); + load_unaligned_u32_4x1_lane(ret1_u32, left + idx6, 2); if (n > 7) { - ret1_u32 = vld1q_lane_u32((const uint32_t *)(left + idx7), - ret1_u32, 3); + load_unaligned_u32_4x1_lane(ret1_u32, left + idx7, 3); } } } @@ -2475,7 +2470,6 @@ void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw, assert(f != NULL); f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd); } -#endif // AOM_ARCH_AARCH64 // ----------------------------------------------------------------------------- // Z3 diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c index 561a9f76a1..7fd82a1b82 100644 --- a/aom_dsp/arm/intrapred_neon.c +++ b/aom_dsp/arm/intrapred_neon.c @@ -1488,8 +1488,6 @@ void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, /* ---------------------P R E D I C T I O N Z 2--------------------------- */ -// TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed. 
-#if AOM_ARCH_AARCH64 #if !AOM_ARCH_AARCH64 static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = { { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, @@ -1514,8 +1512,8 @@ static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon( *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1]; *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f)); } else { - *a0_x = load_u8_4x1(above + base_x); - *a1_x = load_u8_4x1(above + base_x + 1); + *a0_x = load_unaligned_u8_4x1(above + base_x); + *a1_x = load_unaligned_u8_4x1(above + base_x + 1); *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f)); } } @@ -2040,7 +2038,6 @@ void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, break; } } -#endif // AOM_ARCH_AARCH64 /* ---------------------P R E D I C T I O N Z 3--------------------------- */ diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index 41efd03632..9734f8bd52 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h @@ -949,6 +949,32 @@ static INLINE void load_s16_8x3(const int16_t *s, ptrdiff_t p, *s2 = vld1q_s16(s); } +#if AOM_ARCH_AARCH64 +#define load_unaligned_u32_2x1_lane(v, p, lane) \ + do { \ + (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \ + } while (0) + +#define load_unaligned_u32_4x1_lane(v, p, lane) \ + do { \ + (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \ + } while (0) +#else +#define load_unaligned_u32_2x1_lane(v, p, lane) \ + do { \ + uint32_t tmp; \ + memcpy(&tmp, (p), 4); \ + (v) = vset_lane_u32(tmp, (v), (lane)); \ + } while (0) + +#define load_unaligned_u32_4x1_lane(v, p, lane) \ + do { \ + uint32_t tmp; \ + memcpy(&tmp, (p), 4); \ + (v) = vsetq_lane_u32(tmp, (v), (lane)); \ + } while (0) +#endif + // Load 2 sets of 4 bytes when alignment is not guaranteed. static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { uint32_t a; diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 284f0efe9f..5233325624 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -115,12 +115,7 @@ specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/; add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy"; specialize qw/av1_dr_prediction_z1 sse4_1 avx2 neon/; add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy"; -# TODO(aomedia:349428506): enable NEON for armv7 after SIGBUS is fixed. 
-if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") { - specialize qw/av1_dr_prediction_z2 sse4_1 avx2/; -} else { - specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/; -} +specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/; add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy"; specialize qw/av1_dr_prediction_z3 sse4_1 avx2 neon/; @@ -230,12 +225,7 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd"; specialize qw/av1_highbd_dr_prediction_z1 avx2 neon/; add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd"; - # TODO(aomedia:349428506): enable NEON for armv7 after SIGBUS is fixed. - if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") { - specialize qw/av1_highbd_dr_prediction_z2 avx2/; - } else { - specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/; - } + specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/; add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd"; specialize qw/av1_highbd_dr_prediction_z3 avx2 neon/; } diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc index 20cf600320..0938a3db11 100644 --- a/test/dr_prediction_test.cc +++ b/test/dr_prediction_test.cc @@ -484,7 +484,6 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON -#if AOM_ARCH_AARCH64 INSTANTIATE_TEST_SUITE_P( NEON, LowbdDrPredTest, ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>, @@ -496,21 +495,8 @@ INSTANTIATE_TEST_SUITE_P( DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>, &z3_wrapper<av1_dr_prediction_z3_neon>, AOM_BITS_8, kZ3Start))); -#else -// TODO(aomedia:349428506): enable av1_highbd_dr_prediction_z2_neon for armv7 -// after SIGBUS is fixed. -INSTANTIATE_TEST_SUITE_P( - NEON, LowbdDrPredTest, - ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>, - &z1_wrapper<av1_dr_prediction_z1_neon>, - AOM_BITS_8, kZ1Start), - DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>, - &z3_wrapper<av1_dr_prediction_z3_neon>, - AOM_BITS_8, kZ3Start))); -#endif #if CONFIG_AV1_HIGHBITDEPTH -#if AOM_ARCH_AARCH64 INSTANTIATE_TEST_SUITE_P( NEON, HighbdDrPredTest, ::testing::Values(DrPredFunc<DrPred_Hbd>( @@ -549,36 +535,6 @@ INSTANTIATE_TEST_SUITE_P( &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, AOM_BITS_12, kZ3Start))); -#else // !AOM_ARCH_AARCH64 -// TODO(aomedia:349428506): enable av1_highbd_dr_prediction_z2_neon for armv7 -// after SIGBUS is fixed. 
-INSTANTIATE_TEST_SUITE_P( - NEON, HighbdDrPredTest, - ::testing::Values(DrPredFunc<DrPred_Hbd>( - &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, - &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>, - AOM_BITS_8, kZ1Start), - DrPredFunc<DrPred_Hbd>( - &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, - &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>, - AOM_BITS_10, kZ1Start), - DrPredFunc<DrPred_Hbd>( - &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, - &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>, - AOM_BITS_12, kZ1Start), - DrPredFunc<DrPred_Hbd>( - &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, - &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, - AOM_BITS_8, kZ3Start), - DrPredFunc<DrPred_Hbd>( - &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, - &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, - AOM_BITS_10, kZ3Start), - DrPredFunc<DrPred_Hbd>( - &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, - &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, - AOM_BITS_12, kZ3Start))); -#endif // AOM_ARCH_AARCH64 #endif // CONFIG_AV1_HIGHBITDEPTH #endif // HAVE_NEON -- GitLab From 7a5354da1fdf4fcac04da476fd3b2bc1fa25a5e2 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Thu, 18 Jul 2024 15:22:02 -0700 Subject: [PATCH 290/391] rtc: Adjust buffer threshold for postencode drop Make it more aggressive by setting new_buffer_level threshold to be optimal_buffer_level/4, instead of 0. Change-Id: I5be92a5b6e07e9168c2f1861fe57ac40936da6b7 --- av1/encoder/ratectrl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 9750ce911f..10a35ec79d 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -3785,12 +3785,13 @@ int av1_postencode_drop_cbr(AV1_COMP *cpi, size_t *size) { size_t frame_size = *size << 3; const int64_t new_buffer_level = p_rc->buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size; - // For now we drop if new buffer level (given the encoded frame size) goes - // below 0 and encoded frame size is much larger than per-frame-bandwidth. + // Drop if new buffer level (given the encoded frame size) goes below a + // threshold and encoded frame size is much larger than per-frame-bandwidth. // If the frame is already labelled as scene change (high_source_sad = 1) // or the QP is close to max, then no need to drop. const int qp_thresh = 3 * (cpi->rc.worst_quality >> 2); - if (!cpi->rc.high_source_sad && new_buffer_level < 0 && + const int64_t buffer_thresh = p_rc->optimal_buffer_level >> 2; + if (!cpi->rc.high_source_sad && new_buffer_level < buffer_thresh && frame_size > 8 * (unsigned int)cpi->rc.avg_frame_bandwidth && cpi->common.quant_params.base_qindex < qp_thresh) { *size = 0; -- GitLab From c4bfd437d52d618ffc5983bd9b02e78e97255835 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Mon, 15 Jul 2024 15:48:03 -0700 Subject: [PATCH 291/391] rtc: Adjust RC for keyframes based on source content For keyframes in rtc: compute spatial metrics and use it, along with the last keyframe encoded/target_size to adjust the rc_bits_per_mb. This feature is used to avoid overshoot when max_intra_bitrate_pct is enabled. Feature is off for now. 
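As a rough illustration of the mechanism (the numbers are made up, and
the real av1_rc_bits_per_mb() also folds in a per-frame-type correction
factor and derives q from the quantizer index):

/* Sketch of the rate model behind av1_rc_bits_per_mb():
 * predicted bits per macroblock ~= enumerator * correction / q. */
static int est_bits_per_mb(int enumerator, double correction, int q) {
  return (int)(enumerator * correction / q);
}

/* With correction = 1.0 and q = 64:
 *   est_bits_per_mb(20000, 1.0, 64)  ->  312 bits/mb
 *   est_bits_per_mb(160000, 1.0, 64) -> 2500 bits/mb (enumerator << 3)
 * Scaling the enumerator up for a high-variance keyframe makes the model
 * predict more bits at a given q, so the rate control selects a higher q
 * up front instead of overshooting and correcting afterwards. */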
Change-Id: I0316d9303cb0ada4ed75948bbfc8196218870514 --- av1/encoder/encoder.c | 1 + av1/encoder/ratectrl.c | 114 ++++++++++++++++++++++++++++++++- av1/encoder/ratectrl.h | 5 ++ av1/encoder/speed_features.c | 1 + av1/encoder/speed_features.h | 3 + av1/encoder/svc_layercontext.c | 11 +++- 6 files changed, 130 insertions(+), 5 deletions(-) diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 047c4395a7..352bb64c9a 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -4464,6 +4464,7 @@ static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) { cpi->rc.frames_since_key++; cpi->rc.frames_to_key--; cpi->rc.frames_to_fwd_kf--; + cpi->rc.frames_since_scene_change++; } } } diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 10a35ec79d..3e20bb9408 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -179,6 +179,37 @@ int av1_get_bpmb_enumerator(FRAME_TYPE frame_type, static int get_init_ratio(double sse) { return (int)(300000 / sse); } +// Adjustment based on spatial content and last encoded keyframe. +// Allow for increase in enumerator to reduce overshoot. +static int adjust_rtc_keyframe(const RATE_CONTROL *rc, int enumerator) { + // Don't adjust if most of the image is flat. + if (rc->perc_flat_blocks_keyframe > 70) return enumerator; + if (rc->last_encoded_size_keyframe == 0 || + rc->frames_since_scene_change < rc->frames_since_key) { + // Very first frame, or if scene change happened after last keyframe. + if (rc->spatial_variance_keyframe > 1000) + return enumerator << 3; + else if (rc->spatial_variance_keyframe > 500 && + rc->perc_flat_blocks_keyframe < 10) + return enumerator << 2; + else if (rc->spatial_variance_keyframe > 400) + return enumerator << 1; + } else if (rc->frames_since_scene_change >= rc->frames_since_key) { + // There was no scene change before previous encoded keyframe, so + // use the last_encoded/target_size_keyframe. + if (rc->last_encoded_size_keyframe > 4 * rc->last_target_size_keyframe && + rc->spatial_variance_keyframe > 500) + return enumerator << 3; + else if (rc->last_encoded_size_keyframe > + 2 * rc->last_target_size_keyframe && + rc->spatial_variance_keyframe > 200) + return enumerator << 2; + else if (rc->last_encoded_size_keyframe > rc->last_target_size_keyframe) + return enumerator << 1; + } + return enumerator; +} + int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, double correction_factor, int accurate_estimate) { const AV1_COMMON *const cm = &cpi->common; @@ -200,8 +231,12 @@ int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, : cpi->rc.bit_est_ratio; // Clamp the enumerator to lower the q fluctuations. 
enumerator = AOMMIN(AOMMAX((int)(ratio * sse_sqrt), 20000), 170000); + } else if (cpi->oxcf.rc_cfg.mode == AOM_CBR && frame_type == KEY_FRAME && + cpi->sf.rt_sf.rc_adjust_keyframe && bit_depth == 8 && + cpi->oxcf.rc_cfg.max_intra_bitrate_pct > 0 && + cpi->svc.spatial_layer_id == 0) { + enumerator = adjust_rtc_keyframe(&cpi->rc, enumerator); } - // q based adjustment to baseline enumerator return (int)(enumerator * correction_factor / q); } @@ -444,6 +479,7 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) { rc->percent_blocks_inactive = 0; rc->force_max_q = 0; rc->postencode_drop = 0; + rc->frames_since_scene_change = 0; } static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level, @@ -2324,6 +2360,10 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { p_rc->last_q[KEY_FRAME] = qindex; p_rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); + if (cpi->svc.spatial_layer_id == 0) { + rc->last_encoded_size_keyframe = rc->projected_frame_size; + rc->last_target_size_keyframe = rc->this_frame_target; + } } else { if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) || cpi->rc.rtc_external_ratectrl || @@ -2412,7 +2452,10 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { p_rc->temp_rate_correction_factors[i] = p_rc->rate_correction_factors[i]; } #endif - if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0; + if (current_frame->frame_type == KEY_FRAME) { + rc->frames_since_key = 0; + rc->frames_since_scene_change = 0; + } if (cpi->refresh_frame.golden_frame) rc->frame_num_last_gf_refresh = current_frame->frame_number; rc->prev_coded_width = cm->width; @@ -3268,6 +3311,7 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, if (num_samples > 0) rc->percent_blocks_with_motion = ((num_samples - num_zero_temp_sad) * 100) / num_samples; + if (rc->high_source_sad) cpi->rc.frames_since_scene_change = 0; // Update the high_motion_content_screen_rtc flag on TL0. Avoid the update // if too many consecutive frame drops occurred. const uint64_t thresh_high_motion = 9 * 64 * 64; @@ -3330,6 +3374,67 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, } } +// This is used as a reference when computing the source variance. +static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +/*!\brief Compute spatial activity for keyframe, 1 pass real-time mode. + * + * Compute average spatial activity/variance for source frame over a + * subset of superblocks. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] src_y Input source buffer for y channel. + * \param[in] src_ystride Input source stride for y channel. + * + * \remark Nothing is returned. Instead the average spatial variance + * computed is stored in flag \c cpi->rc.spatial_variance_keyframe. 
+ */ +static void rc_spatial_act_keyframe_onepass_rt(AV1_COMP *cpi, uint8_t *src_y, + int src_ystride) { + AV1_COMMON *const cm = &cpi->common; + int num_mi_cols = cm->mi_params.mi_cols; + int num_mi_rows = cm->mi_params.mi_rows; + const BLOCK_SIZE bsize = BLOCK_64X64; + // Loop over sub-sample of frame, compute average over 64x64 blocks. + uint64_t avg_variance = 0; + int num_samples = 0; + int num_zero_var_blocks = 0; + cpi->rc.perc_flat_blocks_keyframe = 0; + const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) + ? (cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; + const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; + const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb; + for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { + for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { + unsigned int sse; + const unsigned int var = + cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, AV1_VAR_OFFS, 0, &sse); + avg_variance += var; + num_samples++; + if (var == 0) num_zero_var_blocks++; + src_y += 64; + } + src_y += (src_ystride << 6) - (sb_cols << 6); + } + if (num_samples > 0) { + cpi->rc.perc_flat_blocks_keyframe = 100 * num_zero_var_blocks / num_samples; + avg_variance = avg_variance / num_samples; + } + cpi->rc.spatial_variance_keyframe = avg_variance >> 12; +} + /*!\brief Set the GF baseline interval for 1 pass real-time mode. * * @@ -3640,6 +3745,11 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type, cpi->src_sad_blk_64x64 = NULL; } } + if (*frame_type == KEY_FRAME && cpi->sf.rt_sf.rc_adjust_keyframe && + svc->spatial_layer_id == 0 && cm->seq_params->bit_depth == 8 && + cpi->oxcf.rc_cfg.max_intra_bitrate_pct > 0) + rc_spatial_act_keyframe_onepass_rt(cpi, frame_input->source->y_buffer, + frame_input->source->y_stride); // Check for dynamic resize, for single spatial layer for now. // For temporal layers only check on base temporal layer. if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) { diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h index c38d884607..69aad47201 100644 --- a/av1/encoder/ratectrl.h +++ b/av1/encoder/ratectrl.h @@ -194,6 +194,11 @@ typedef struct { uint64_t avg_source_sad; uint64_t prev_avg_source_sad; uint64_t frame_source_sad; + uint64_t spatial_variance_keyframe; + int last_encoded_size_keyframe; + int last_target_size_keyframe; + int frames_since_scene_change; + int perc_flat_blocks_keyframe; int avg_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation used for any frame diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index f27a1c3787..82d8fdc908 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -2273,6 +2273,7 @@ static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { rt_sf->source_metrics_sb_nonrd = 0; rt_sf->overshoot_detection_cbr = NO_DETECTION; rt_sf->check_scene_detection = 0; + rt_sf->rc_adjust_keyframe = 0; rt_sf->prefer_large_partition_blocks = 0; rt_sf->use_temporal_noise_estimate = 0; rt_sf->fullpel_search_step_param = 0; diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h index e81891447b..5aec86f028 100644 --- a/av1/encoder/speed_features.h +++ b/av1/encoder/speed_features.h @@ -1648,6 +1648,9 @@ typedef struct REAL_TIME_SPEED_FEATURES { // Check for scene/content change detection on every frame before encoding. int check_scene_detection; + // For keyframes in rtc: adjust the rc_bits_per_mb, to reduce overshoot. 
+  int rc_adjust_keyframe;
+
   // For nonrd mode: Prefer larger partition blks in variance based partitioning
   // 0: disabled, 1-3: increasing aggressiveness
   int prefer_large_partition_blocks;
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index 1b709c84a3..e8856986f3 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -225,6 +225,9 @@ void av1_restore_layer_context(AV1_COMP *const cpi) {
   LAYER_CONTEXT *const lc = get_layer_context(cpi);
   const int old_frame_since_key = cpi->rc.frames_since_key;
   const int old_frame_to_key = cpi->rc.frames_to_key;
+  const int frames_since_scene_change = cpi->rc.frames_since_scene_change;
+  const int last_encoded_size_keyframe = cpi->rc.last_encoded_size_keyframe;
+  const int last_target_size_keyframe = cpi->rc.last_target_size_keyframe;
   const int max_consec_drop = cpi->rc.max_consec_drop;
   const int postencode_drop = cpi->rc.postencode_drop;
   // Restore layer rate control.
@@ -235,11 +238,13 @@ void av1_restore_layer_context(AV1_COMP *const cpi) {
   cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude;
   if (cpi->mv_search_params.max_mv_magnitude == 0)
     cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height);
-  // Reset the frames_since_key and frames_to_key counters to their values
-  // before the layer restore. Keep these defined for the stream (not layer).
+  // Reset the following parameters to their values before
+  // the layer restore. Keep these defined for the stream (not layer).
   cpi->rc.frames_since_key = old_frame_since_key;
   cpi->rc.frames_to_key = old_frame_to_key;
-  // Reset to value before the layer restore.
+  cpi->rc.frames_since_scene_change = frames_since_scene_change;
+  cpi->rc.last_encoded_size_keyframe = last_encoded_size_keyframe;
+  cpi->rc.last_target_size_keyframe = last_target_size_keyframe;
   cpi->rc.max_consec_drop = max_consec_drop;
   cpi->rc.postencode_drop = postencode_drop;
   // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
-- 
GitLab


From 8bb4ee31e7f817141939000426813acb2b065e36 Mon Sep 17 00:00:00 2001
From: Salome Thirot <salome.thirot@arm.com>
Date: Tue, 23 Jul 2024 10:06:57 +0100
Subject: [PATCH 292/391] Fix SIGBUS in av1_highbd_warp_affine_neon

When compiling for Armv7, LLVM generates an incorrect alignment hint on
a call to the vld1q_u16_x2 intrinsic, causing a bus error when
alignment checks are enabled. Replace this intrinsic with a pair of
vld1q_u16, which doesn't generate the bad alignment hint.

This also reverts commit 8f8e1e1bdd169997e2e808cad9cdc5bfa0e0b4fe,
re-enabling the function for armv7 targets.

Bug: b/349455146
Change-Id: Iec5e31d60ec86cd39b177a18c52c8744d0f4ca01
---
 av1/av1.cmake | 6 +-----
 av1/common/arm/highbd_warp_plane_neon.h | 14 ++++++++++++--
 av1/common/av1_rtcd_defs.pl | 7 +------
 test/warp_filter_test.cc | 5 -----
 4 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/av1/av1.cmake b/av1/av1.cmake
index 6713f14626..bed6ab9220 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -487,12 +487,8 @@ if(CONFIG_AV1_HIGHBITDEPTH)
       "${AOM_ROOT}/av1/common/arm/highbd_convolve_scale_neon.c"
       "${AOM_ROOT}/av1/common/arm/highbd_reconinter_neon.c"
       "${AOM_ROOT}/av1/common/arm/highbd_reconintra_neon.c"
+      "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c"
       "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c")
-  # TODO(aomedia:349455146): enable this for armv7 after SIGBUS is fixed.
- if(AOM_ARCH_AARCH64) - list(APPEND AOM_AV1_COMMON_INTRIN_NEON - "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c") - endif() list(APPEND AOM_AV1_COMMON_INTRIN_SVE2 "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_sve2.c" diff --git a/av1/common/arm/highbd_warp_plane_neon.h b/av1/common/arm/highbd_warp_plane_neon.h index b90213d2b2..766abffff0 100644 --- a/av1/common/arm/highbd_warp_plane_neon.h +++ b/av1/common/arm/highbd_warp_plane_neon.h @@ -165,7 +165,12 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(const uint16_t *ref, if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ for (int k = 0; k < 15; ++k) { \ const int iy = clamp(iy4 + k - 7, 0, height - 1); \ - uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + const uint16_t *idx = ref + iy * stride + ix4 - 7; \ + /* We don't use vld1q_u16_x2 here as LLVM generates an incorrect \ + * alignment hint for this intrinsic that causes a SIGBUS on Armv7 \ + * targets when alignment checks are enabled. \ + * (See bug: b/349455146) */ \ + uint16x8x2_t src_1 = { { vld1q_u16(idx), vld1q_u16(idx + 8) } }; \ src_1 = clamp_horizontal(src_1, out_of_boundary_left, \ out_of_boundary_right, ref, iy, stride, \ width, indx0, indx1); \ @@ -197,7 +202,12 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(const uint16_t *ref, if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ for (int k = 0; k < 15; ++k) { \ const int iy = clamp(iy4 + k - 7, 0, height - 1); \ - uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + const uint16_t *idx = ref + iy * stride + ix4 - 7; \ + /* We don't use vld1q_u16_x2 here as LLVM generates an incorrect \ + * alignment hint for this intrinsic that causes a SIGBUS on Armv7 \ + * targets when alignment checks are enabled. \ + * (See bug: b/349455146) */ \ + uint16x8x2_t src_1 = { { vld1q_u16(idx), vld1q_u16(idx + 8) } }; \ src_1 = clamp_horizontal(src_1, out_of_boundary_left, \ out_of_boundary_right, ref, iy, stride, \ width, indx0, indx1); \ diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 5233325624..1963751fab 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -519,12 +519,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) { # WARPED_MOTION / GLOBAL_MOTION functions if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; - # TODO(aomedia:349455146): enable NEON for armv7 after SIGBUS is fixed. 
-  if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") {
-    specialize qw/av1_highbd_warp_affine sse4_1 avx2 sve/;
-  } else {
-    specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
-  }
+  specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
 }
 
 add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc
index 56e1022dd6..bade6799b9 100644
--- a/test/warp_filter_test.cc
+++ b/test/warp_filter_test.cc
@@ -72,14 +72,9 @@ INSTANTIATE_TEST_SUITE_P(
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon));
 
 #if CONFIG_AV1_HIGHBITDEPTH
-#if AOM_ARCH_AARCH64
-// TODO(aomedia:349455146): enable for armv7 after SIGBUS is fixed.
 INSTANTIATE_TEST_SUITE_P(
     NEON, AV1HighbdWarpFilterTest,
     libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_neon));
-#else  // !AOM_ARCH_AARCH64
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdWarpFilterTest);
-#endif  // AOM_ARCH_AARCH64
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_NEON
-- 
GitLab


From 125fab09fced31c732b9a157ec3259755c59d3a9 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Thu, 25 Jul 2024 14:16:25 -0700
Subject: [PATCH 293/391] rtc: Adjust condition in rc_bits_per_mb on keyframe

For RTC when the feature sf->rc_adjust_keyframe is used: use
flat_blocks_keyframe for a more aggressive setting of the enumerator,
to reduce overshoot on first keyframes or keyframes after a scene
change.

Change-Id: I79dc6786fbe794188b13fe74cd857961a1540529
---
 av1/encoder/ratectrl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index 3e20bb9408..b9b7e28561 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -187,7 +187,9 @@ static int adjust_rtc_keyframe(const RATE_CONTROL *rc, int enumerator) {
   if (rc->last_encoded_size_keyframe == 0 ||
       rc->frames_since_scene_change < rc->frames_since_key) {
     // Very first frame, or if scene change happened after last keyframe.
-    if (rc->spatial_variance_keyframe > 1000)
+    if (rc->spatial_variance_keyframe > 1000 ||
+        (rc->spatial_variance_keyframe > 500 &&
+         rc->perc_flat_blocks_keyframe == 0))
       return enumerator << 3;
     else if (rc->spatial_variance_keyframe > 500 &&
              rc->perc_flat_blocks_keyframe < 10)
-- 
GitLab


From 74c99dfe6d63535b9a033322c85581391eba913c Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Mon, 29 Jul 2024 10:18:16 -0700
Subject: [PATCH 294/391] Do not add -lm to Libs.private if WIN32 or APPLE

In Windows and Apple operating systems, the functions declared in
<math.h> are defined in the main standard C library rather than a
separate libm. Do not add -lm to the Libs.private field in aom.pc if
WIN32 or APPLE is true. This condition matches the condition under
which we link the aom and aom_av1_rc libraries with -lm in
CMakeLists.txt.
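For illustration only (the exact value of CMAKE_THREAD_LIBS_INIT is
toolchain-dependent, so these sample lines are indicative, not exact),
the Libs.private field written into aom.pc would now look roughly like:

  Libs.private: -lm -lpthread   (Linux pthreads build)
  Libs.private:                 (Windows/Apple: -lm omitted; the threads
                                 entry is often empty there as well)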
Bug: 356153293 Change-Id: Ic42c84b0583aafa4491e393d691939c293415986 --- build/cmake/pkg_config.cmake | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/build/cmake/pkg_config.cmake b/build/cmake/pkg_config.cmake index 424b91119d..7fb94e7241 100644 --- a/build/cmake/pkg_config.cmake +++ b/build/cmake/pkg_config.cmake @@ -60,10 +60,11 @@ if(CONFIG_TUNE_BUTTERAUGLI) endif() file(APPEND "${pkgconfig_file}" "\nConflicts:\n") file(APPEND "${pkgconfig_file}" "Libs: -L\${libdir} -l${pkg_name}\n") +file(APPEND "${pkgconfig_file}" "Libs.private:") +if(NOT WIN32 AND NOT APPLE) + file(APPEND "${pkgconfig_file}" " -lm") +endif() if(CONFIG_MULTITHREAD AND CMAKE_THREAD_LIBS_INIT) - file(APPEND "${pkgconfig_file}" - "Libs.private: -lm ${CMAKE_THREAD_LIBS_INIT}\n") -else() - file(APPEND "${pkgconfig_file}" "Libs.private: -lm\n") + file(APPEND "${pkgconfig_file}" " ${CMAKE_THREAD_LIBS_INIT}") endif() -file(APPEND "${pkgconfig_file}" "Cflags: -I\${includedir}\n") +file(APPEND "${pkgconfig_file}" "\nCflags: -I\${includedir}\n") -- GitLab From 593a7f3ac66bca1d46e142af84b0e8ae55a44356 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Fri, 26 Jul 2024 13:32:28 -0700 Subject: [PATCH 295/391] Fix double linking with -lvmaf Also make the same changes for libjxl and libhwy. In the linker command line for libaom.so.3.9.1, I see -L/home/wtc/tmp/prefix/lib/x86_64-linux-gnu -lvmaf -lvmaf So we link with -lvmaf twice. The relevant variables in CMakeCache.txt are: VMAF_LDFLAGS:INTERNAL=-L/home/wtc/tmp/prefix/lib/x86_64-linux-gnu;-lvmaf VMAF_LDFLAGS_OTHER:INTERNAL= VMAF_LIBDIR:INTERNAL=/home/wtc/tmp/prefix/lib/x86_64-linux-gnu VMAF_LIBRARIES:INTERNAL=vmaf VMAF_LIBRARY_DIRS:INTERNAL=/home/wtc/tmp/prefix/lib/x86_64-linux-gnu So passing ${VMAF_LDFLAGS} to target_link_libraries() should suffice. It is not necessary to also pass ${VMAF_LIBRARIES}. Alternatively, we could pass ${VMAF_LIBRARY_DIRS} ${VMAF_LIBRARIES}. 
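In other words, given the cache values above, ${VMAF_LDFLAGS} already
expands to "-L/home/wtc/tmp/prefix/lib/x86_64-linux-gnu -lvmaf", so
additionally passing ${VMAF_LIBRARIES} (which is just "vmaf") is what
produced the duplicated -lvmaf on the link line.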
Change-Id: I2a42a6c34d69452a42d936138f0971eeb956b0ed --- CMakeLists.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 47f2a8ccf9..1aa87f27fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -572,13 +572,13 @@ if(CONFIG_AV1_ENCODER) target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS}) else() pkg_check_modules(LIBJXL REQUIRED libjxl) - target_link_libraries(aom PRIVATE ${LIBJXL_LDFLAGS} ${LIBJXL_LIBRARIES}) + target_link_libraries(aom PRIVATE ${LIBJXL_LDFLAGS}) target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS}) if(LIBJXL_CFLAGS) append_compiler_flag("${LIBJXL_CFLAGS}") endif() pkg_check_modules(LIBHWY REQUIRED libhwy) - target_link_libraries(aom PRIVATE ${LIBHWY_LDFLAGS} ${LIBHWY_LIBRARIES}) + target_link_libraries(aom PRIVATE ${LIBHWY_LDFLAGS}) target_include_directories(aom_dsp_encoder PRIVATE ${LIBLIBHWY_INCLUDE_DIRS}) if(LIBHWY_CFLAGS) @@ -644,10 +644,9 @@ if(CONFIG_AV1_ENCODER) if(PKG_CONFIG_FOUND) pkg_check_modules(VMAF REQUIRED libvmaf) if(BUILD_SHARED_LIBS) - target_link_libraries(aom_static - PRIVATE ${VMAF_LDFLAGS} ${VMAF_LIBRARIES}) + target_link_libraries(aom_static PRIVATE ${VMAF_LDFLAGS}) endif() - target_link_libraries(aom PRIVATE ${VMAF_LDFLAGS} ${VMAF_LIBRARIES}) + target_link_libraries(aom PRIVATE ${VMAF_LDFLAGS}) target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS}) if(VMAF_CFLAGS) foreach(flag "${VMAF_CFLAGS}") -- GitLab From 5632ebed19fc472dae0cbe26a89552af0ee814dd Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 16 Jul 2024 15:40:50 -0700 Subject: [PATCH 296/391] Fix a copy-paste error in get_partition() Fix Coverity defect CID 329379: original: sshigh * 2 == bhigh looks like the original copy. CID 329379: (#1 of 1): Copy-paste error (COPY_PASTE_ERROR) copy_paste_error: bhigh in sswide * 2 == bhigh looks like a copy-paste error. Should it say bwide instead? This bug was introduced in https://aomedia-review.googlesource.com/c/aom/+/20901. Change-Id: I289fd7bb8bfe3856d6ab2a5bcb10faadb8ff9cdd --- av1/common/av1_common_int.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h index 17afbf40dd..2d4ff64042 100644 --- a/av1/common/av1_common_int.h +++ b/av1/common/av1_common_int.h @@ -1802,7 +1802,7 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, // PARTITION_VERT_B. To distinguish the latter two, check if the right // half was split. if (sswide * 4 == bwide) return PARTITION_VERT_4; - assert(sswide * 2 == bhigh); + assert(sswide * 2 == bwide); if (mbmi_right->bsize == subsize) return PARTITION_VERT; -- GitLab From 73c232fc7225d290074d35217d2fd0f32e130685 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 30 Jul 2024 18:05:43 -0700 Subject: [PATCH 297/391] Fix GCC 13.2.0 -Wignored-attributes warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the compiler warning: aom/tools/dump_obu.cc:115:58: warning: ignoring attributes on template argument ‘int (*)(FILE*)’ [-Wignored-attributes] 115 | using FilePtr = std::unique_ptr<FILE, decltype(&fclose)>; | ^ Port the custom deleter example in https://en.cppreference.com/w/cpp/memory/unique_ptr. 
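(For context, as a best-effort explanation rather than something stated
in the GCC documentation: recent glibc declares fclose with function
attributes, for example __attribute__((__nonnull__)), which are not
part of the C++ function type, so GCC warns that they are dropped when
decltype(&fclose) is used as a template argument. A plain wrapper
function such as the CloseFile added below carries no such attributes,
so the warning goes away.)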
Change-Id: I389d42b7e1fcfe00855b813de2a86da07c4a8ead --- tools/dump_obu.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/dump_obu.cc b/tools/dump_obu.cc index f1f6c62309..75e2b84932 100644 --- a/tools/dump_obu.cc +++ b/tools/dump_obu.cc @@ -101,6 +101,8 @@ bool ReadTemporalUnit(InputContext *ctx, size_t *unit_size) { return true; } +void CloseFile(FILE *stream) { fclose(stream); } + } // namespace int main(int argc, const char *argv[]) { @@ -112,8 +114,8 @@ int main(int argc, const char *argv[]) { const std::string filename = argv[1]; - using FilePtr = std::unique_ptr<FILE, decltype(&fclose)>; - FilePtr input_file(fopen(filename.c_str(), "rb"), &fclose); + using FilePtr = std::unique_ptr<FILE, decltype(&CloseFile)>; + FilePtr input_file(fopen(filename.c_str(), "rb"), &CloseFile); if (input_file.get() == nullptr) { input_file.release(); fprintf(stderr, "Error: Cannot open input file.\n"); -- GitLab From 966205d22fe463e220c539f352a56339025840f5 Mon Sep 17 00:00:00 2001 From: Denis Nikitin <denik@google.com> Date: Mon, 29 Jul 2024 18:17:32 -0700 Subject: [PATCH 298/391] Reduce stack size on Arm NEON av1 selfguided restoration on neon uses a couple of large temporary arrays with the total size up to 95kB on stack which may cause stack overflow with a small default thread stack size, for example 64KB or 128KB. Allocate large buffers on the heap like it is done on x86. BUG=aomedia:355499165 Change-Id: I0f98693928d2b4f4e750a4adef7137a8b774d6cd --- av1/common/arm/selfguided_neon.c | 95 +++++++++++++++++++------------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c index e613ecb98e..8597d2426c 100644 --- a/av1/common/arm/selfguided_neon.c +++ b/av1/common/arm/selfguided_neon.c @@ -1174,23 +1174,27 @@ static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, } while (h > 0); } -static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, - int height, int dgd_stride, - int32_t *dst, int dst_stride, - int bit_depth, int sgr_params_idx, - int radius_idx) { +static INLINE int restoration_fast_internal(uint16_t *dgd16, int width, + int height, int dgd_stride, + int32_t *dst, int dst_stride, + int bit_depth, int sgr_params_idx, + int radius_idx) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; - const int buf_stride = ((width_ext + 3) & ~3) + 16; - int32_t A_[RESTORATION_PROC_UNIT_PELS]; - uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; - int32_t B_[RESTORATION_PROC_UNIT_PELS]; - int32_t *square_sum_buf = A_; - int32_t *sum_buf = B_; - uint16_t *tmp16_buf = A16_; + + const size_t buf_size = 3 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS; + int32_t *buf = aom_memalign(8, buf_size); + if (!buf) return -1; + + int32_t *square_sum_buf = buf; + int32_t *sum_buf = square_sum_buf + RESTORATION_PROC_UNIT_PELS; + uint16_t *tmp16_buf = (uint16_t *)(sum_buf + RESTORATION_PROC_UNIT_PELS); + assert((char *)(sum_buf + RESTORATION_PROC_UNIT_PELS) <= + (char *)buf + buf_size && + "Allocated buffer is too small. 
Resize the buffer."); assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && @@ -1236,26 +1240,32 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, #endif final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); + aom_free(buf); + return 0; } -static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, - int dgd_stride, int32_t *dst, - int dst_stride, int bit_depth, - int sgr_params_idx, int radius_idx) { +static INLINE int restoration_internal(uint16_t *dgd16, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int buf_stride = ((width_ext + 3) & ~3) + 16; + + const size_t buf_size = 3 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS; + int32_t *buf = aom_memalign(8, buf_size); + if (!buf) return -1; - int buf_stride = ((width_ext + 3) & ~3) + 16; - int32_t A_[RESTORATION_PROC_UNIT_PELS]; - uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; - uint16_t B16_[RESTORATION_PROC_UNIT_PELS]; - int32_t B_[RESTORATION_PROC_UNIT_PELS]; - int32_t *square_sum_buf = A_; - uint16_t *sum_buf = B16_; - uint16_t *A16 = A16_; - int32_t *B = B_; + int32_t *square_sum_buf = buf; + int32_t *B = square_sum_buf + RESTORATION_PROC_UNIT_PELS; + uint16_t *A16 = (uint16_t *)(B + RESTORATION_PROC_UNIT_PELS); + uint16_t *sum_buf = A16 + RESTORATION_PROC_UNIT_PELS; + + assert((char *)(sum_buf + RESTORATION_PROC_UNIT_PELS) <= + (char *)buf + buf_size && + "Allocated buffer is too small. 
Resize the buffer."); assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && @@ -1300,6 +1310,8 @@ static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, #endif final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); + aom_free(buf); + return 0; } static INLINE void src_convert_u8_to_u16(const uint8_t *src, @@ -1440,12 +1452,17 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, dgd16_stride, width_ext, height_ext); #endif - if (params->r[0] > 0) - restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, - flt_stride, bit_depth, sgr_params_idx, 0); - if (params->r[1] > 0) - restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride, - bit_depth, sgr_params_idx, 1); + if (params->r[0] > 0) { + int ret = + restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, + flt_stride, bit_depth, sgr_params_idx, 0); + if (ret != 0) return ret; + } + if (params->r[1] > 0) { + int ret = restoration_internal(dgd16, width, height, dgd16_stride, flt1, + flt_stride, bit_depth, sgr_params_idx, 1); + if (ret != 0) return ret; + } return 0; } @@ -1491,12 +1508,16 @@ int av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); #endif - if (params->r[0] > 0) - restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width, - bit_depth, eps, 0); - if (params->r[1] > 0) - restoration_internal(dgd16, width, height, dgd16_stride, flt1, width, - bit_depth, eps, 1); + if (params->r[0] > 0) { + int ret = restoration_fast_internal(dgd16, width, height, dgd16_stride, + flt0, width, bit_depth, eps, 0); + if (ret != 0) return ret; + } + if (params->r[1] > 0) { + int ret = restoration_internal(dgd16, width, height, dgd16_stride, flt1, + width, bit_depth, eps, 1); + if (ret != 0) return ret; + } av1_decode_xq(xqd, xq, params); -- GitLab From 52de8996066ff8b3130aa53ae4ce1527254b930b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 1 Aug 2024 06:02:22 -0700 Subject: [PATCH 299/391] Remove third_party/googletest/src/googlemock No tests are using GoogleMock now, so it can be removed. 
Change-Id: I8000f5fe428a4df828a53715772059088f28d803 --- test/test.cmake | 10 - third_party/googletest/README.libaom | 6 - .../googletest/src/googlemock/CMakeLists.txt | 218 - .../googletest/src/googlemock/README.md | 40 - .../src/googlemock/cmake/gmock.pc.in | 10 - .../src/googlemock/cmake/gmock_main.pc.in | 10 - .../googlemock/include/gmock/gmock-actions.h | 2298 ------- .../include/gmock/gmock-cardinalities.h | 159 - .../include/gmock/gmock-function-mocker.h | 514 -- .../googlemock/include/gmock/gmock-matchers.h | 5610 ----------------- .../include/gmock/gmock-more-actions.h | 662 -- .../include/gmock/gmock-more-matchers.h | 91 - .../include/gmock/gmock-nice-strict.h | 277 - .../include/gmock/gmock-spec-builders.h | 2083 ------ .../src/googlemock/include/gmock/gmock.h | 96 - .../include/gmock/internal/custom/README.md | 18 - .../internal/custom/gmock-generated-actions.h | 7 - .../gmock/internal/custom/gmock-matchers.h | 37 - .../gmock/internal/custom/gmock-port.h | 40 - .../gmock/internal/gmock-internal-utils.h | 476 -- .../include/gmock/internal/gmock-port.h | 139 - .../include/gmock/internal/gmock-pp.h | 279 - .../src/googlemock/src/gmock-all.cc | 46 - .../src/googlemock/src/gmock-cardinalities.cc | 155 - .../googlemock/src/gmock-internal-utils.cc | 250 - .../src/googlemock/src/gmock-matchers.cc | 462 -- .../src/googlemock/src/gmock-spec-builders.cc | 781 --- .../googletest/src/googlemock/src/gmock.cc | 223 - .../src/googlemock/src/gmock_main.cc | 72 - 29 files changed, 15069 deletions(-) delete mode 100644 third_party/googletest/src/googlemock/CMakeLists.txt delete mode 100644 third_party/googletest/src/googlemock/README.md delete mode 100644 third_party/googletest/src/googlemock/cmake/gmock.pc.in delete mode 100644 third_party/googletest/src/googlemock/cmake/gmock_main.pc.in delete mode 100644 third_party/googletest/src/googlemock/include/gmock/gmock-actions.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/gmock.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md delete mode 100644 third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h delete mode 100644 third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h delete mode 100644 third_party/googletest/src/googlemock/src/gmock-all.cc delete mode 100644 third_party/googletest/src/googlemock/src/gmock-cardinalities.cc delete mode 100644 
third_party/googletest/src/googlemock/src/gmock-internal-utils.cc delete mode 100644 third_party/googletest/src/googlemock/src/gmock-matchers.cc delete mode 100644 third_party/googletest/src/googlemock/src/gmock-spec-builders.cc delete mode 100644 third_party/googletest/src/googlemock/src/gmock.cc delete mode 100644 third_party/googletest/src/googlemock/src/gmock_main.cc diff --git a/test/test.cmake b/test/test.cmake index a3e0d6abec..f697db1c9c 100644 --- a/test/test.cmake +++ b/test/test.cmake @@ -419,16 +419,6 @@ if(ENABLE_TESTS) target_compile_definitions(aom_gtest PUBLIC GTEST_HAS_PTHREAD=0) endif() endif() - - add_library( - aom_gmock STATIC - "${AOM_ROOT}/third_party/googletest/src/googlemock/src/gmock-all.cc") - set_property(TARGET aom_gmock PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER}) - target_include_directories( - aom_gmock - PUBLIC "${AOM_ROOT}/third_party/googletest/src/googlemock/include" - PRIVATE "${AOM_ROOT}/third_party/googletest/src/googlemock") - target_link_libraries(aom_gmock ${AOM_LIB_LINK_TYPE} aom_gtest) endif() # Setup testdata download targets, test build targets, and test run targets. The diff --git a/third_party/googletest/README.libaom b/third_party/googletest/README.libaom index 5e429d4dae..3b21c8efca 100644 --- a/third_party/googletest/README.libaom +++ b/third_party/googletest/README.libaom @@ -16,12 +16,6 @@ Local Modifications: .clang-format CMakeLists.txt CONTRIBUTORS - googlemock/ - cmake - CMakeLists.txt - include - README.md - src googletest/ cmake CMakeLists.txt diff --git a/third_party/googletest/src/googlemock/CMakeLists.txt b/third_party/googletest/src/googlemock/CMakeLists.txt deleted file mode 100644 index 5c1f0dafea..0000000000 --- a/third_party/googletest/src/googlemock/CMakeLists.txt +++ /dev/null @@ -1,218 +0,0 @@ -######################################################################## -# Note: CMake support is community-based. The maintainers do not use CMake -# internally. -# -# CMake build script for Google Mock. -# -# To run the tests for Google Mock itself on Linux, use 'make test' or -# ctest. You can select which tests to run using 'ctest -R regex'. -# For more options, run 'ctest --help'. - -option(gmock_build_tests "Build all of Google Mock's own tests." OFF) - -# A directory to find Google Test sources. -if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/gtest/CMakeLists.txt") - set(gtest_dir gtest) -else() - set(gtest_dir ../googletest) -endif() - -# Defines pre_project_set_up_hermetic_build() and set_up_hermetic_build(). -include("${gtest_dir}/cmake/hermetic_build.cmake" OPTIONAL) - -if (COMMAND pre_project_set_up_hermetic_build) - # Google Test also calls hermetic setup functions from add_subdirectory, - # although its changes will not affect things at the current scope. - pre_project_set_up_hermetic_build() -endif() - -######################################################################## -# -# Project-wide settings - -# Name of the project. -# -# CMake files in this project can refer to the root source directory -# as ${gmock_SOURCE_DIR} and to the root binary directory as -# ${gmock_BINARY_DIR}. -# Language "C" is required for find_package(Threads). -cmake_minimum_required(VERSION 3.5) -cmake_policy(SET CMP0048 NEW) -project(gmock VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C) - -if (COMMAND set_up_hermetic_build) - set_up_hermetic_build() -endif() - -# Instructs CMake to process Google Test's CMakeLists.txt and add its -# targets to the current scope. 
We are placing Google Test's binary -# directory in a subdirectory of our own as VC compilation may break -# if they are the same (the default). -add_subdirectory("${gtest_dir}" "${gmock_BINARY_DIR}/${gtest_dir}") - - -# These commands only run if this is the main project -if(CMAKE_PROJECT_NAME STREQUAL "gmock" OR CMAKE_PROJECT_NAME STREQUAL "googletest-distribution") - # BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to - # make it prominent in the GUI. - option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF) -else() - mark_as_advanced(gmock_build_tests) -endif() - -# Although Google Test's CMakeLists.txt calls this function, the -# changes there don't affect the current scope. Therefore we have to -# call it again here. -config_compiler_and_linker() # from ${gtest_dir}/cmake/internal_utils.cmake - -# Adds Google Mock's and Google Test's header directories to the search path. -set(gmock_build_include_dirs - "${gmock_SOURCE_DIR}/include" - "${gmock_SOURCE_DIR}" - "${gtest_SOURCE_DIR}/include" - # This directory is needed to build directly from Google Test sources. - "${gtest_SOURCE_DIR}") -include_directories(${gmock_build_include_dirs}) - -######################################################################## -# -# Defines the gmock & gmock_main libraries. User tests should link -# with one of them. - -# Google Mock libraries. We build them using more strict warnings than what -# are used for other targets, to ensure that Google Mock can be compiled by -# a user aggressive about warnings. -if (MSVC) - cxx_library(gmock - "${cxx_strict}" - "${gtest_dir}/src/gtest-all.cc" - src/gmock-all.cc) - - cxx_library(gmock_main - "${cxx_strict}" - "${gtest_dir}/src/gtest-all.cc" - src/gmock-all.cc - src/gmock_main.cc) -else() - cxx_library(gmock "${cxx_strict}" src/gmock-all.cc) - target_link_libraries(gmock PUBLIC gtest) - set_target_properties(gmock PROPERTIES VERSION ${GOOGLETEST_VERSION}) - cxx_library(gmock_main "${cxx_strict}" src/gmock_main.cc) - target_link_libraries(gmock_main PUBLIC gmock) - set_target_properties(gmock_main PROPERTIES VERSION ${GOOGLETEST_VERSION}) -endif() -# If the CMake version supports it, attach header directory information -# to the targets for when we are part of a parent build (ie being pulled -# in via add_subdirectory() rather than being a standalone build). -if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11") - string(REPLACE ";" "$<SEMICOLON>" dirs "${gmock_build_include_dirs}") - target_include_directories(gmock SYSTEM INTERFACE - "$<BUILD_INTERFACE:${dirs}>" - "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>") - target_include_directories(gmock_main SYSTEM INTERFACE - "$<BUILD_INTERFACE:${dirs}>" - "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>") -endif() - -######################################################################## -# -# Install rules -install_project(gmock gmock_main) - -######################################################################## -# -# Google Mock's own tests. -# -# You can skip this section if you aren't interested in testing -# Google Mock itself. -# -# The tests are not built by default. To build them, set the -# gmock_build_tests option to ON. You can do it by running ccmake -# or specifying the -Dgmock_build_tests=ON flag when running cmake. - -if (gmock_build_tests) - # This must be set in the root directory for the tests to be run by - # 'make test' or ctest. 
- enable_testing() - - if (MINGW OR CYGWIN) - if (CMAKE_VERSION VERSION_LESS "2.8.12") - add_compile_options("-Wa,-mbig-obj") - else() - add_definitions("-Wa,-mbig-obj") - endif() - endif() - - ############################################################ - # C++ tests built with standard compiler flags. - - cxx_test(gmock-actions_test gmock_main) - cxx_test(gmock-cardinalities_test gmock_main) - cxx_test(gmock_ex_test gmock_main) - cxx_test(gmock-function-mocker_test gmock_main) - cxx_test(gmock-internal-utils_test gmock_main) - cxx_test(gmock-matchers-arithmetic_test gmock_main) - cxx_test(gmock-matchers-comparisons_test gmock_main) - cxx_test(gmock-matchers-containers_test gmock_main) - cxx_test(gmock-matchers-misc_test gmock_main) - cxx_test(gmock-more-actions_test gmock_main) - cxx_test(gmock-nice-strict_test gmock_main) - cxx_test(gmock-port_test gmock_main) - cxx_test(gmock-spec-builders_test gmock_main) - cxx_test(gmock_link_test gmock_main test/gmock_link2_test.cc) - cxx_test(gmock_test gmock_main) - - if (DEFINED GTEST_HAS_PTHREAD) - cxx_test(gmock_stress_test gmock) - endif() - - # gmock_all_test is commented to save time building and running tests. - # Uncomment if necessary. - # cxx_test(gmock_all_test gmock_main) - - ############################################################ - # C++ tests built with non-standard compiler flags. - - if (MSVC) - cxx_library(gmock_main_no_exception "${cxx_no_exception}" - "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc) - - cxx_library(gmock_main_no_rtti "${cxx_no_rtti}" - "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc) - - else() - cxx_library(gmock_main_no_exception "${cxx_no_exception}" src/gmock_main.cc) - target_link_libraries(gmock_main_no_exception PUBLIC gmock) - - cxx_library(gmock_main_no_rtti "${cxx_no_rtti}" src/gmock_main.cc) - target_link_libraries(gmock_main_no_rtti PUBLIC gmock) - endif() - cxx_test_with_flags(gmock-more-actions_no_exception_test "${cxx_no_exception}" - gmock_main_no_exception test/gmock-more-actions_test.cc) - - cxx_test_with_flags(gmock_no_rtti_test "${cxx_no_rtti}" - gmock_main_no_rtti test/gmock-spec-builders_test.cc) - - cxx_shared_library(shared_gmock_main "${cxx_default}" - "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc) - - # Tests that a binary can be built with Google Mock as a shared library. On - # some system configurations, it may not possible to run the binary without - # knowing more details about the system configurations. We do not try to run - # this binary. To get a more robust shared library coverage, configure with - # -DBUILD_SHARED_LIBS=ON. - cxx_executable_with_flags(shared_gmock_test_ "${cxx_default}" - shared_gmock_main test/gmock-spec-builders_test.cc) - set_target_properties(shared_gmock_test_ - PROPERTIES - COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1") - - ############################################################ - # Python tests. - - cxx_executable(gmock_leak_test_ test gmock_main) - py_test(gmock_leak_test) - - cxx_executable(gmock_output_test_ test gmock) - py_test(gmock_output_test) -endif() diff --git a/third_party/googletest/src/googlemock/README.md b/third_party/googletest/src/googlemock/README.md deleted file mode 100644 index 7da60655db..0000000000 --- a/third_party/googletest/src/googlemock/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Googletest Mocking (gMock) Framework - -### Overview - -Google's framework for writing and using C++ mock classes. 
It can help you -derive better designs of your system and write better tests. - -It is inspired by: - -* [jMock](http://www.jmock.org/) -* [EasyMock](http://www.easymock.org/) -* [Hamcrest](http://code.google.com/p/hamcrest/) - -It is designed with C++'s specifics in mind. - -gMock: - -- Provides a declarative syntax for defining mocks. -- Can define partial (hybrid) mocks, which are a cross of real and mock - objects. -- Handles functions of arbitrary types and overloaded functions. -- Comes with a rich set of matchers for validating function arguments. -- Uses an intuitive syntax for controlling the behavior of a mock. -- Does automatic verification of expectations (no record-and-replay needed). -- Allows arbitrary (partial) ordering constraints on function calls to be - expressed. -- Lets a user extend it by defining new matchers and actions. -- Does not use exceptions. -- Is easy to learn and use. - -Details and examples can be found here: - -* [gMock for Dummies](https://google.github.io/googletest/gmock_for_dummies.html) -* [Legacy gMock FAQ](https://google.github.io/googletest/gmock_faq.html) -* [gMock Cookbook](https://google.github.io/googletest/gmock_cook_book.html) -* [gMock Cheat Sheet](https://google.github.io/googletest/gmock_cheat_sheet.html) - -GoogleMock is a part of -[GoogleTest C++ testing framework](http://github.com/google/googletest/) and a -subject to the same requirements. diff --git a/third_party/googletest/src/googlemock/cmake/gmock.pc.in b/third_party/googletest/src/googlemock/cmake/gmock.pc.in deleted file mode 100644 index 23c67b5c88..0000000000 --- a/third_party/googletest/src/googlemock/cmake/gmock.pc.in +++ /dev/null @@ -1,10 +0,0 @@ -libdir=@CMAKE_INSTALL_FULL_LIBDIR@ -includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ - -Name: gmock -Description: GoogleMock (without main() function) -Version: @PROJECT_VERSION@ -URL: https://github.com/google/googletest -Requires: gtest = @PROJECT_VERSION@ -Libs: -L${libdir} -lgmock @CMAKE_THREAD_LIBS_INIT@ -Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ diff --git a/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in b/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in deleted file mode 100644 index 66ffea7f44..0000000000 --- a/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in +++ /dev/null @@ -1,10 +0,0 @@ -libdir=@CMAKE_INSTALL_FULL_LIBDIR@ -includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ - -Name: gmock_main -Description: GoogleMock (with main() function) -Version: @PROJECT_VERSION@ -URL: https://github.com/google/googletest -Requires: gmock = @PROJECT_VERSION@ -Libs: -L${libdir} -lgmock_main @CMAKE_THREAD_LIBS_INIT@ -Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h b/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h deleted file mode 100644 index c785ad8abb..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h +++ /dev/null @@ -1,2298 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// The ACTION* family of macros can be used in a namespace scope to -// define custom actions easily. The syntax: -// -// ACTION(name) { statements; } -// -// will define an action with the given name that executes the -// statements. The value returned by the statements will be used as -// the return value of the action. Inside the statements, you can -// refer to the K-th (0-based) argument of the mock function by -// 'argK', and refer to its type by 'argK_type'. For example: -// -// ACTION(IncrementArg1) { -// arg1_type temp = arg1; -// return ++(*temp); -// } -// -// allows you to write -// -// ...WillOnce(IncrementArg1()); -// -// You can also refer to the entire argument tuple and its type by -// 'args' and 'args_type', and refer to the mock function type and its -// return type by 'function_type' and 'return_type'. -// -// Note that you don't need to specify the types of the mock function -// arguments. However rest assured that your code is still type-safe: -// you'll get a compiler error if *arg1 doesn't support the ++ -// operator, or if the type of ++(*arg1) isn't compatible with the -// mock function's return type, for example. -// -// Sometimes you'll want to parameterize the action. For that you can use -// another macro: -// -// ACTION_P(name, param_name) { statements; } -// -// For example: -// -// ACTION_P(Add, n) { return arg0 + n; } -// -// will allow you to write: -// -// ...WillOnce(Add(5)); -// -// Note that you don't need to provide the type of the parameter -// either. If you need to reference the type of a parameter named -// 'foo', you can write 'foo_type'. For example, in the body of -// ACTION_P(Add, n) above, you can write 'n_type' to refer to the type -// of 'n'. -// -// We also provide ACTION_P2, ACTION_P3, ..., up to ACTION_P10 to support -// multi-parameter actions. -// -// For the purpose of typing, you can view -// -// ACTION_Pk(Foo, p1, ..., pk) { ... } -// -// as shorthand for -// -// template <typename p1_type, ..., typename pk_type> -// FooActionPk<p1_type, ..., pk_type> Foo(p1_type p1, ..., pk_type pk) { ... 
} -// -// In particular, you can provide the template type arguments -// explicitly when invoking Foo(), as in Foo<long, bool>(5, false); -// although usually you can rely on the compiler to infer the types -// for you automatically. You can assign the result of expression -// Foo(p1, ..., pk) to a variable of type FooActionPk<p1_type, ..., -// pk_type>. This can be useful when composing actions. -// -// You can also overload actions with different numbers of parameters: -// -// ACTION_P(Plus, a) { ... } -// ACTION_P2(Plus, a, b) { ... } -// -// While it's tempting to always use the ACTION* macros when defining -// a new action, you should also consider implementing ActionInterface -// or using MakePolymorphicAction() instead, especially if you need to -// use the action a lot. While these approaches require more work, -// they give you more control on the types of the mock function -// arguments and the action parameters, which in general leads to -// better compiler error messages that pay off in the long run. They -// also allow overloading actions based on parameter types (as opposed -// to just based on the number of parameters). -// -// CAVEAT: -// -// ACTION*() can only be used in a namespace scope as templates cannot be -// declared inside of a local class. -// Users can, however, define any local functors (e.g. a lambda) that -// can be used as actions. -// -// MORE INFORMATION: -// -// To learn more about using these macros, please search for 'ACTION' on -// https://github.com/google/googletest/blob/master/docs/gmock_cook_book.md - -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ - -#ifndef _WIN32_WCE -#include <errno.h> -#endif - -#include <algorithm> -#include <functional> -#include <memory> -#include <string> -#include <tuple> -#include <type_traits> -#include <utility> - -#include "gmock/internal/gmock-internal-utils.h" -#include "gmock/internal/gmock-port.h" -#include "gmock/internal/gmock-pp.h" - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4100) -#endif - -namespace testing { - -// To implement an action Foo, define: -// 1. a class FooAction that implements the ActionInterface interface, and -// 2. a factory function that creates an Action object from a -// const FooAction*. -// -// The two-level delegation design follows that of Matcher, providing -// consistency for extension developers. It also eases ownership -// management as Action objects can now be copied like plain values. - -namespace internal { - -// BuiltInDefaultValueGetter<T, true>::Get() returns a -// default-constructed T value. BuiltInDefaultValueGetter<T, -// false>::Get() crashes with an error. -// -// This primary template is used when kDefaultConstructible is true. -template <typename T, bool kDefaultConstructible> -struct BuiltInDefaultValueGetter { - static T Get() { return T(); } -}; -template <typename T> -struct BuiltInDefaultValueGetter<T, false> { - static T Get() { - Assert(false, __FILE__, __LINE__, - "Default action undefined for the function return type."); - return internal::Invalid<T>(); - // The above statement will never be reached, but is required in - // order for this function to compile. - } -}; - -// BuiltInDefaultValue<T>::Get() returns the "built-in" default value -// for type T, which is NULL when T is a raw pointer type, 0 when T is -// a numeric type, false when T is bool, or "" when T is string or -// std::string. 
In addition, in C++11 and above, it turns a -// default-constructed T value if T is default constructible. For any -// other type T, the built-in default T value is undefined, and the -// function will abort the process. -template <typename T> -class BuiltInDefaultValue { - public: - // This function returns true if and only if type T has a built-in default - // value. - static bool Exists() { return ::std::is_default_constructible<T>::value; } - - static T Get() { - return BuiltInDefaultValueGetter< - T, ::std::is_default_constructible<T>::value>::Get(); - } -}; - -// This partial specialization says that we use the same built-in -// default value for T and const T. -template <typename T> -class BuiltInDefaultValue<const T> { - public: - static bool Exists() { return BuiltInDefaultValue<T>::Exists(); } - static T Get() { return BuiltInDefaultValue<T>::Get(); } -}; - -// This partial specialization defines the default values for pointer -// types. -template <typename T> -class BuiltInDefaultValue<T*> { - public: - static bool Exists() { return true; } - static T* Get() { return nullptr; } -}; - -// The following specializations define the default values for -// specific types we care about. -#define GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(type, value) \ - template <> \ - class BuiltInDefaultValue<type> { \ - public: \ - static bool Exists() { return true; } \ - static type Get() { return value; } \ - } - -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(void, ); // NOLINT -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(::std::string, ""); -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(bool, false); -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned char, '\0'); -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed char, '\0'); -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(char, '\0'); - -// There's no need for a default action for signed wchar_t, as that -// type is the same as wchar_t for gcc, and invalid for MSVC. -// -// There's also no need for a default action for unsigned wchar_t, as -// that type is the same as unsigned int for gcc, and invalid for -// MSVC. -#if GMOCK_WCHAR_T_IS_NATIVE_ -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(wchar_t, 0U); // NOLINT -#endif - -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned short, 0U); // NOLINT -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed short, 0); // NOLINT -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned int, 0U); -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed int, 0); -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long, 0UL); // NOLINT -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long, 0L); // NOLINT -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long long, 0); // NOLINT -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long long, 0); // NOLINT -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(float, 0); -GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(double, 0); - -#undef GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_ - -// Partial implementations of metaprogramming types from the standard library -// not available in C++11. - -template <typename P> -struct negation - // NOLINTNEXTLINE - : std::integral_constant<bool, bool(!P::value)> {}; - -// Base case: with zero predicates the answer is always true. -template <typename...> -struct conjunction : std::true_type {}; - -// With a single predicate, the answer is that predicate. 
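// ---------------------------------------------------------------------------
// [Editor's sketch] A minimal, hypothetical usage example for the ACTION()
// and ACTION_P() macros documented above; it is not part of the deleted file.
// The action names and the mock-function signature are invented for
// illustration. Assumes <gmock/gmock.h> and <gtest/gtest.h> are available.
#include <gmock/gmock.h>
#include <gtest/gtest.h>

ACTION(DoubleArg0) { return arg0 * 2; }  // arg0 is the first mock argument
ACTION_P(AddN, n) { return arg0 + n; }   // n's type is inferred (n_type)

TEST(ActionMacroSketch, DefinesCustomActions) {
  testing::MockFunction<int(int)> mock;
  EXPECT_CALL(mock, Call(5)).WillOnce(DoubleArg0());
  EXPECT_CALL(mock, Call(7)).WillOnce(AddN(3));
  EXPECT_EQ(10, mock.AsStdFunction()(5));
  EXPECT_EQ(10, mock.AsStdFunction()(7));
}
// ---------------------------------------------------------------------------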
-template <typename P1> -struct conjunction<P1> : P1 {}; - -// With multiple predicates the answer is the first predicate if that is false, -// and we recurse otherwise. -template <typename P1, typename... Ps> -struct conjunction<P1, Ps...> - : std::conditional<bool(P1::value), conjunction<Ps...>, P1>::type {}; - -template <typename...> -struct disjunction : std::false_type {}; - -template <typename P1> -struct disjunction<P1> : P1 {}; - -template <typename P1, typename... Ps> -struct disjunction<P1, Ps...> - // NOLINTNEXTLINE - : std::conditional<!bool(P1::value), disjunction<Ps...>, P1>::type {}; - -template <typename...> -using void_t = void; - -// Detects whether an expression of type `From` can be implicitly converted to -// `To` according to [conv]. In C++17, [conv]/3 defines this as follows: -// -// An expression e can be implicitly converted to a type T if and only if -// the declaration T t=e; is well-formed, for some invented temporary -// variable t ([dcl.init]). -// -// [conv]/2 implies we can use function argument passing to detect whether this -// initialization is valid. -// -// Note that this is distinct from is_convertible, which requires this be valid: -// -// To test() { -// return declval<From>(); -// } -// -// In particular, is_convertible doesn't give the correct answer when `To` and -// `From` are the same non-moveable type since `declval<From>` will be an rvalue -// reference, defeating the guaranteed copy elision that would otherwise make -// this function work. -// -// REQUIRES: `From` is not cv void. -template <typename From, typename To> -struct is_implicitly_convertible { - private: - // A function that accepts a parameter of type T. This can be called with type - // U successfully only if U is implicitly convertible to T. - template <typename T> - static void Accept(T); - - // A function that creates a value of type T. - template <typename T> - static T Make(); - - // An overload be selected when implicit conversion from T to To is possible. - template <typename T, typename = decltype(Accept<To>(Make<T>()))> - static std::true_type TestImplicitConversion(int); - - // A fallback overload selected in all other cases. - template <typename T> - static std::false_type TestImplicitConversion(...); - - public: - using type = decltype(TestImplicitConversion<From>(0)); - static constexpr bool value = type::value; -}; - -// Like std::invoke_result_t from C++17, but works only for objects with call -// operators (not e.g. member function pointers, which we don't need specific -// support for in OnceAction because std::function deals with them). -template <typename F, typename... Args> -using call_result_t = decltype(std::declval<F>()(std::declval<Args>()...)); - -template <typename Void, typename R, typename F, typename... Args> -struct is_callable_r_impl : std::false_type {}; - -// Specialize the struct for those template arguments where call_result_t is -// well-formed. When it's not, the generic template above is chosen, resulting -// in std::false_type. -template <typename R, typename F, typename... Args> -struct is_callable_r_impl<void_t<call_result_t<F, Args...>>, R, F, Args...> - : std::conditional< - std::is_void<R>::value, // - std::true_type, // - is_implicitly_convertible<call_result_t<F, Args...>, R>>::type {}; - -// Like std::is_invocable_r from C++17, but works only for objects with call -// operators. See the note on call_result_t. -template <typename R, typename F, typename... 
Args> -using is_callable_r = is_callable_r_impl<void, R, F, Args...>; - -// Like std::as_const from C++17. -template <typename T> -typename std::add_const<T>::type& as_const(T& t) { - return t; -} - -} // namespace internal - -// Specialized for function types below. -template <typename F> -class OnceAction; - -// An action that can only be used once. -// -// This is accepted by WillOnce, which doesn't require the underlying action to -// be copy-constructible (only move-constructible), and promises to invoke it as -// an rvalue reference. This allows the action to work with move-only types like -// std::move_only_function in a type-safe manner. -// -// For example: -// -// // Assume we have some API that needs to accept a unique pointer to some -// // non-copyable object Foo. -// void AcceptUniquePointer(std::unique_ptr<Foo> foo); -// -// // We can define an action that provides a Foo to that API. Because It -// // has to give away its unique pointer, it must not be called more than -// // once, so its call operator is &&-qualified. -// struct ProvideFoo { -// std::unique_ptr<Foo> foo; -// -// void operator()() && { -// AcceptUniquePointer(std::move(Foo)); -// } -// }; -// -// // This action can be used with WillOnce. -// EXPECT_CALL(mock, Call) -// .WillOnce(ProvideFoo{std::make_unique<Foo>(...)}); -// -// // But a call to WillRepeatedly will fail to compile. This is correct, -// // since the action cannot correctly be used repeatedly. -// EXPECT_CALL(mock, Call) -// .WillRepeatedly(ProvideFoo{std::make_unique<Foo>(...)}); -// -// A less-contrived example would be an action that returns an arbitrary type, -// whose &&-qualified call operator is capable of dealing with move-only types. -template <typename Result, typename... Args> -class OnceAction<Result(Args...)> final { - private: - // True iff we can use the given callable type (or lvalue reference) directly - // via StdFunctionAdaptor. - template <typename Callable> - using IsDirectlyCompatible = internal::conjunction< - // It must be possible to capture the callable in StdFunctionAdaptor. - std::is_constructible<typename std::decay<Callable>::type, Callable>, - // The callable must be compatible with our signature. - internal::is_callable_r<Result, typename std::decay<Callable>::type, - Args...>>; - - // True iff we can use the given callable type via StdFunctionAdaptor once we - // ignore incoming arguments. - template <typename Callable> - using IsCompatibleAfterIgnoringArguments = internal::conjunction< - // It must be possible to capture the callable in a lambda. - std::is_constructible<typename std::decay<Callable>::type, Callable>, - // The callable must be invocable with zero arguments, returning something - // convertible to Result. - internal::is_callable_r<Result, typename std::decay<Callable>::type>>; - - public: - // Construct from a callable that is directly compatible with our mocked - // signature: it accepts our function type's arguments and returns something - // convertible to our result type. - template <typename Callable, - typename std::enable_if< - internal::conjunction< - // Teach clang on macOS that we're not talking about a - // copy/move constructor here. Otherwise it gets confused - // when checking the is_constructible requirement of our - // traits above. 
- internal::negation<std::is_same< - OnceAction, typename std::decay<Callable>::type>>, - IsDirectlyCompatible<Callable>> // - ::value, - int>::type = 0> - OnceAction(Callable&& callable) // NOLINT - : function_(StdFunctionAdaptor<typename std::decay<Callable>::type>( - {}, std::forward<Callable>(callable))) {} - - // As above, but for a callable that ignores the mocked function's arguments. - template <typename Callable, - typename std::enable_if< - internal::conjunction< - // Teach clang on macOS that we're not talking about a - // copy/move constructor here. Otherwise it gets confused - // when checking the is_constructible requirement of our - // traits above. - internal::negation<std::is_same< - OnceAction, typename std::decay<Callable>::type>>, - // Exclude callables for which the overload above works. - // We'd rather provide the arguments if possible. - internal::negation<IsDirectlyCompatible<Callable>>, - IsCompatibleAfterIgnoringArguments<Callable>>::value, - int>::type = 0> - OnceAction(Callable&& callable) // NOLINT - // Call the constructor above with a callable - // that ignores the input arguments. - : OnceAction(IgnoreIncomingArguments<typename std::decay<Callable>::type>{ - std::forward<Callable>(callable)}) {} - - // We are naturally copyable because we store only an std::function, but - // semantically we should not be copyable. - OnceAction(const OnceAction&) = delete; - OnceAction& operator=(const OnceAction&) = delete; - OnceAction(OnceAction&&) = default; - - // Invoke the underlying action callable with which we were constructed, - // handing it the supplied arguments. - Result Call(Args... args) && { - return function_(std::forward<Args>(args)...); - } - - private: - // An adaptor that wraps a callable that is compatible with our signature and - // being invoked as an rvalue reference so that it can be used as an - // StdFunctionAdaptor. This throws away type safety, but that's fine because - // this is only used by WillOnce, which we know calls at most once. - // - // Once we have something like std::move_only_function from C++23, we can do - // away with this. - template <typename Callable> - class StdFunctionAdaptor final { - public: - // A tag indicating that the (otherwise universal) constructor is accepting - // the callable itself, instead of e.g. stealing calls for the move - // constructor. - struct CallableTag final {}; - - template <typename F> - explicit StdFunctionAdaptor(CallableTag, F&& callable) - : callable_(std::make_shared<Callable>(std::forward<F>(callable))) {} - - // Rather than explicitly returning Result, we return whatever the wrapped - // callable returns. This allows for compatibility with existing uses like - // the following, when the mocked function returns void: - // - // EXPECT_CALL(mock_fn_, Call) - // .WillOnce([&] { - // [...] - // return 0; - // }); - // - // Such a callable can be turned into std::function<void()>. If we use an - // explicit return type of Result here then it *doesn't* work with - // std::function, because we'll get a "void function should not return a - // value" error. - // - // We need not worry about incompatible result types because the SFINAE on - // OnceAction already checks this for us. std::is_invocable_r_v itself makes - // the same allowance for void result types. - template <typename... ArgRefs> - internal::call_result_t<Callable, ArgRefs...> operator()( - ArgRefs&&... 
args) const { - return std::move(*callable_)(std::forward<ArgRefs>(args)...); - } - - private: - // We must put the callable on the heap so that we are copyable, which - // std::function needs. - std::shared_ptr<Callable> callable_; - }; - - // An adaptor that makes a callable that accepts zero arguments callable with - // our mocked arguments. - template <typename Callable> - struct IgnoreIncomingArguments { - internal::call_result_t<Callable> operator()(Args&&...) { - return std::move(callable)(); - } - - Callable callable; - }; - - std::function<Result(Args...)> function_; -}; - -// When an unexpected function call is encountered, Google Mock will -// let it return a default value if the user has specified one for its -// return type, or if the return type has a built-in default value; -// otherwise Google Mock won't know what value to return and will have -// to abort the process. -// -// The DefaultValue<T> class allows a user to specify the -// default value for a type T that is both copyable and publicly -// destructible (i.e. anything that can be used as a function return -// type). The usage is: -// -// // Sets the default value for type T to be foo. -// DefaultValue<T>::Set(foo); -template <typename T> -class DefaultValue { - public: - // Sets the default value for type T; requires T to be - // copy-constructable and have a public destructor. - static void Set(T x) { - delete producer_; - producer_ = new FixedValueProducer(x); - } - - // Provides a factory function to be called to generate the default value. - // This method can be used even if T is only move-constructible, but it is not - // limited to that case. - typedef T (*FactoryFunction)(); - static void SetFactory(FactoryFunction factory) { - delete producer_; - producer_ = new FactoryValueProducer(factory); - } - - // Unsets the default value for type T. - static void Clear() { - delete producer_; - producer_ = nullptr; - } - - // Returns true if and only if the user has set the default value for type T. - static bool IsSet() { return producer_ != nullptr; } - - // Returns true if T has a default return value set by the user or there - // exists a built-in default value. - static bool Exists() { - return IsSet() || internal::BuiltInDefaultValue<T>::Exists(); - } - - // Returns the default value for type T if the user has set one; - // otherwise returns the built-in default value. Requires that Exists() - // is true, which ensures that the return value is well-defined. - static T Get() { - return producer_ == nullptr ? internal::BuiltInDefaultValue<T>::Get() - : producer_->Produce(); - } - - private: - class ValueProducer { - public: - virtual ~ValueProducer() {} - virtual T Produce() = 0; - }; - - class FixedValueProducer : public ValueProducer { - public: - explicit FixedValueProducer(T value) : value_(value) {} - T Produce() override { return value_; } - - private: - const T value_; - FixedValueProducer(const FixedValueProducer&) = delete; - FixedValueProducer& operator=(const FixedValueProducer&) = delete; - }; - - class FactoryValueProducer : public ValueProducer { - public: - explicit FactoryValueProducer(FactoryFunction factory) - : factory_(factory) {} - T Produce() override { return factory_(); } - - private: - const FactoryFunction factory_; - FactoryValueProducer(const FactoryValueProducer&) = delete; - FactoryValueProducer& operator=(const FactoryValueProducer&) = delete; - }; - - static ValueProducer* producer_; -}; - -// This partial specialization allows a user to set default values for -// reference types. 
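// ---------------------------------------------------------------------------
// [Editor's sketch] How the DefaultValue<T> API above is typically used: a
// DoDefault() (or unstubbed) call returns the value registered here for a
// return type that has no built-in default. Widget is a hypothetical type;
// this example is not part of the deleted file.
#include <gmock/gmock.h>
#include <gtest/gtest.h>

struct Widget {  // not default-constructible, so no built-in default exists
  explicit Widget(int v) : value(v) {}
  int value;
};

TEST(DefaultValueSketch, SuppliesReturnValueForUnstubbedCalls) {
  testing::DefaultValue<Widget>::Set(Widget(42));
  testing::MockFunction<Widget()> mock;
  EXPECT_CALL(mock, Call()).WillOnce(testing::DoDefault());
  EXPECT_EQ(42, mock.AsStdFunction()().value);
  testing::DefaultValue<Widget>::Clear();  // avoid leaking into other tests
}
// ---------------------------------------------------------------------------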
-template <typename T> -class DefaultValue<T&> { - public: - // Sets the default value for type T&. - static void Set(T& x) { // NOLINT - address_ = &x; - } - - // Unsets the default value for type T&. - static void Clear() { address_ = nullptr; } - - // Returns true if and only if the user has set the default value for type T&. - static bool IsSet() { return address_ != nullptr; } - - // Returns true if T has a default return value set by the user or there - // exists a built-in default value. - static bool Exists() { - return IsSet() || internal::BuiltInDefaultValue<T&>::Exists(); - } - - // Returns the default value for type T& if the user has set one; - // otherwise returns the built-in default value if there is one; - // otherwise aborts the process. - static T& Get() { - return address_ == nullptr ? internal::BuiltInDefaultValue<T&>::Get() - : *address_; - } - - private: - static T* address_; -}; - -// This specialization allows DefaultValue<void>::Get() to -// compile. -template <> -class DefaultValue<void> { - public: - static bool Exists() { return true; } - static void Get() {} -}; - -// Points to the user-set default value for type T. -template <typename T> -typename DefaultValue<T>::ValueProducer* DefaultValue<T>::producer_ = nullptr; - -// Points to the user-set default value for type T&. -template <typename T> -T* DefaultValue<T&>::address_ = nullptr; - -// Implement this interface to define an action for function type F. -template <typename F> -class ActionInterface { - public: - typedef typename internal::Function<F>::Result Result; - typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; - - ActionInterface() {} - virtual ~ActionInterface() {} - - // Performs the action. This method is not const, as in general an - // action can have side effects and be stateful. For example, a - // get-the-next-element-from-the-collection action will need to - // remember the current element. - virtual Result Perform(const ArgumentTuple& args) = 0; - - private: - ActionInterface(const ActionInterface&) = delete; - ActionInterface& operator=(const ActionInterface&) = delete; -}; - -template <typename F> -class Action; - -// An Action<R(Args...)> is a copyable and IMMUTABLE (except by assignment) -// object that represents an action to be taken when a mock function of type -// R(Args...) is called. The implementation of Action<T> is just a -// std::shared_ptr to const ActionInterface<T>. Don't inherit from Action! You -// can view an object implementing ActionInterface<F> as a concrete action -// (including its current state), and an Action<F> object as a handle to it. -template <typename R, typename... Args> -class Action<R(Args...)> { - private: - using F = R(Args...); - - // Adapter class to allow constructing Action from a legacy ActionInterface. - // New code should create Actions from functors instead. - struct ActionAdapter { - // Adapter must be copyable to satisfy std::function requirements. - ::std::shared_ptr<ActionInterface<F>> impl_; - - template <typename... InArgs> - typename internal::Function<F>::Result operator()(InArgs&&... args) { - return impl_->Perform( - ::std::forward_as_tuple(::std::forward<InArgs>(args)...)); - } - }; - - template <typename G> - using IsCompatibleFunctor = std::is_constructible<std::function<F>, G>; - - public: - typedef typename internal::Function<F>::Result Result; - typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; - - // Constructs a null Action. Needed for storing Action objects in - // STL containers. 
- Action() {} - - // Construct an Action from a specified callable. - // This cannot take std::function directly, because then Action would not be - // directly constructible from lambda (it would require two conversions). - template < - typename G, - typename = typename std::enable_if<internal::disjunction< - IsCompatibleFunctor<G>, std::is_constructible<std::function<Result()>, - G>>::value>::type> - Action(G&& fun) { // NOLINT - Init(::std::forward<G>(fun), IsCompatibleFunctor<G>()); - } - - // Constructs an Action from its implementation. - explicit Action(ActionInterface<F>* impl) - : fun_(ActionAdapter{::std::shared_ptr<ActionInterface<F>>(impl)}) {} - - // This constructor allows us to turn an Action<Func> object into an - // Action<F>, as long as F's arguments can be implicitly converted - // to Func's and Func's return type can be implicitly converted to F's. - template <typename Func> - Action(const Action<Func>& action) // NOLINT - : fun_(action.fun_) {} - - // Returns true if and only if this is the DoDefault() action. - bool IsDoDefault() const { return fun_ == nullptr; } - - // Performs the action. Note that this method is const even though - // the corresponding method in ActionInterface is not. The reason - // is that a const Action<F> means that it cannot be re-bound to - // another concrete action, not that the concrete action it binds to - // cannot change state. (Think of the difference between a const - // pointer and a pointer to const.) - Result Perform(ArgumentTuple args) const { - if (IsDoDefault()) { - internal::IllegalDoDefault(__FILE__, __LINE__); - } - return internal::Apply(fun_, ::std::move(args)); - } - - // An action can be used as a OnceAction, since it's obviously safe to call it - // once. - operator OnceAction<F>() const { // NOLINT - // Return a OnceAction-compatible callable that calls Perform with the - // arguments it is provided. We could instead just return fun_, but then - // we'd need to handle the IsDoDefault() case separately. - struct OA { - Action<F> action; - - R operator()(Args... args) && { - return action.Perform( - std::forward_as_tuple(std::forward<Args>(args)...)); - } - }; - - return OA{*this}; - } - - private: - template <typename G> - friend class Action; - - template <typename G> - void Init(G&& g, ::std::true_type) { - fun_ = ::std::forward<G>(g); - } - - template <typename G> - void Init(G&& g, ::std::false_type) { - fun_ = IgnoreArgs<typename ::std::decay<G>::type>{::std::forward<G>(g)}; - } - - template <typename FunctionImpl> - struct IgnoreArgs { - template <typename... InArgs> - Result operator()(const InArgs&...) const { - return function_impl(); - } - - FunctionImpl function_impl; - }; - - // fun_ is an empty function if and only if this is the DoDefault() action. - ::std::function<F> fun_; -}; - -// The PolymorphicAction class template makes it easy to implement a -// polymorphic action (i.e. an action that can be used in mock -// functions of than one type, e.g. Return()). -// -// To define a polymorphic action, a user first provides a COPYABLE -// implementation class that has a Perform() method template: -// -// class FooAction { -// public: -// template <typename Result, typename ArgumentTuple> -// Result Perform(const ArgumentTuple& args) const { -// // Processes the arguments and returns a result, using -// // std::get<N>(args) to get the N-th (0-based) argument in the tuple. -// } -// ... 
-// }; -// -// Then the user creates the polymorphic action using -// MakePolymorphicAction(object) where object has type FooAction. See -// the definition of Return(void) and SetArgumentPointee<N>(value) for -// complete examples. -template <typename Impl> -class PolymorphicAction { - public: - explicit PolymorphicAction(const Impl& impl) : impl_(impl) {} - - template <typename F> - operator Action<F>() const { - return Action<F>(new MonomorphicImpl<F>(impl_)); - } - - private: - template <typename F> - class MonomorphicImpl : public ActionInterface<F> { - public: - typedef typename internal::Function<F>::Result Result; - typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; - - explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {} - - Result Perform(const ArgumentTuple& args) override { - return impl_.template Perform<Result>(args); - } - - private: - Impl impl_; - }; - - Impl impl_; -}; - -// Creates an Action from its implementation and returns it. The -// created Action object owns the implementation. -template <typename F> -Action<F> MakeAction(ActionInterface<F>* impl) { - return Action<F>(impl); -} - -// Creates a polymorphic action from its implementation. This is -// easier to use than the PolymorphicAction<Impl> constructor as it -// doesn't require you to explicitly write the template argument, e.g. -// -// MakePolymorphicAction(foo); -// vs -// PolymorphicAction<TypeOfFoo>(foo); -template <typename Impl> -inline PolymorphicAction<Impl> MakePolymorphicAction(const Impl& impl) { - return PolymorphicAction<Impl>(impl); -} - -namespace internal { - -// Helper struct to specialize ReturnAction to execute a move instead of a copy -// on return. Useful for move-only types, but could be used on any type. -template <typename T> -struct ByMoveWrapper { - explicit ByMoveWrapper(T value) : payload(std::move(value)) {} - T payload; -}; - -// The general implementation of Return(R). Specializations follow below. -template <typename R> -class ReturnAction final { - public: - explicit ReturnAction(R value) : value_(std::move(value)) {} - - template <typename U, typename... Args, - typename = typename std::enable_if<conjunction< - // See the requirements documented on Return. - negation<std::is_same<void, U>>, // - negation<std::is_reference<U>>, // - std::is_convertible<R, U>, // - std::is_move_constructible<U>>::value>::type> - operator OnceAction<U(Args...)>() && { // NOLINT - return Impl<U>(std::move(value_)); - } - - template <typename U, typename... Args, - typename = typename std::enable_if<conjunction< - // See the requirements documented on Return. - negation<std::is_same<void, U>>, // - negation<std::is_reference<U>>, // - std::is_convertible<const R&, U>, // - std::is_copy_constructible<U>>::value>::type> - operator Action<U(Args...)>() const { // NOLINT - return Impl<U>(value_); - } - - private: - // Implements the Return(x) action for a mock function that returns type U. - template <typename U> - class Impl final { - public: - // The constructor used when the return value is allowed to move from the - // input value (i.e. we are converting to OnceAction). - explicit Impl(R&& input_value) - : state_(new State(std::move(input_value))) {} - - // The constructor used when the return value is not allowed to move from - // the input value (i.e. we are converting to Action). 
- explicit Impl(const R& input_value) : state_(new State(input_value)) {} - - U operator()() && { return std::move(state_->value); } - U operator()() const& { return state_->value; } - - private: - // We put our state on the heap so that the compiler-generated copy/move - // constructors work correctly even when U is a reference-like type. This is - // necessary only because we eagerly create State::value (see the note on - // that symbol for details). If we instead had only the input value as a - // member then the default constructors would work fine. - // - // For example, when R is std::string and U is std::string_view, value is a - // reference to the string backed by input_value. The copy constructor would - // copy both, so that we wind up with a new input_value object (with the - // same contents) and a reference to the *old* input_value object rather - // than the new one. - struct State { - explicit State(const R& input_value_in) - : input_value(input_value_in), - // Make an implicit conversion to Result before initializing the U - // object we store, avoiding calling any explicit constructor of U - // from R. - // - // This simulates the language rules: a function with return type U - // that does `return R()` requires R to be implicitly convertible to - // U, and uses that path for the conversion, even U Result has an - // explicit constructor from R. - value(ImplicitCast_<U>(internal::as_const(input_value))) {} - - // As above, but for the case where we're moving from the ReturnAction - // object because it's being used as a OnceAction. - explicit State(R&& input_value_in) - : input_value(std::move(input_value_in)), - // For the same reason as above we make an implicit conversion to U - // before initializing the value. - // - // Unlike above we provide the input value as an rvalue to the - // implicit conversion because this is a OnceAction: it's fine if it - // wants to consume the input value. - value(ImplicitCast_<U>(std::move(input_value))) {} - - // A copy of the value originally provided by the user. We retain this in - // addition to the value of the mock function's result type below in case - // the latter is a reference-like type. See the std::string_view example - // in the documentation on Return. - R input_value; - - // The value we actually return, as the type returned by the mock function - // itself. - // - // We eagerly initialize this here, rather than lazily doing the implicit - // conversion automatically each time Perform is called, for historical - // reasons: in 2009-11, commit a070cbd91c (Google changelist 13540126) - // made the Action<U()> conversion operator eagerly convert the R value to - // U, but without keeping the R alive. This broke the use case discussed - // in the documentation for Return, making reference-like types such as - // std::string_view not safe to use as U where the input type R is a - // value-like type such as std::string. - // - // The example the commit gave was not very clear, nor was the issue - // thread (https://github.com/google/googlemock/issues/86), but it seems - // the worry was about reference-like input types R that flatten to a - // value-like type U when being implicitly converted. An example of this - // is std::vector<bool>::reference, which is often a proxy type with an - // reference to the underlying vector: - // - // // Helper method: have the mock function return bools according - // // to the supplied script. 
- // void SetActions(MockFunction<bool(size_t)>& mock, - // const std::vector<bool>& script) { - // for (size_t i = 0; i < script.size(); ++i) { - // EXPECT_CALL(mock, Call(i)).WillOnce(Return(script[i])); - // } - // } - // - // TEST(Foo, Bar) { - // // Set actions using a temporary vector, whose operator[] - // // returns proxy objects that references that will be - // // dangling once the call to SetActions finishes and the - // // vector is destroyed. - // MockFunction<bool(size_t)> mock; - // SetActions(mock, {false, true}); - // - // EXPECT_FALSE(mock.AsStdFunction()(0)); - // EXPECT_TRUE(mock.AsStdFunction()(1)); - // } - // - // This eager conversion helps with a simple case like this, but doesn't - // fully make these types work in general. For example the following still - // uses a dangling reference: - // - // TEST(Foo, Baz) { - // MockFunction<std::vector<std::string>()> mock; - // - // // Return the same vector twice, and then the empty vector - // // thereafter. - // auto action = Return(std::initializer_list<std::string>{ - // "taco", "burrito", - // }); - // - // EXPECT_CALL(mock, Call) - // .WillOnce(action) - // .WillOnce(action) - // .WillRepeatedly(Return(std::vector<std::string>{})); - // - // EXPECT_THAT(mock.AsStdFunction()(), - // ElementsAre("taco", "burrito")); - // EXPECT_THAT(mock.AsStdFunction()(), - // ElementsAre("taco", "burrito")); - // EXPECT_THAT(mock.AsStdFunction()(), IsEmpty()); - // } - // - U value; - }; - - const std::shared_ptr<State> state_; - }; - - R value_; -}; - -// A specialization of ReturnAction<R> when R is ByMoveWrapper<T> for some T. -// -// This version applies the type system-defeating hack of moving from T even in -// the const call operator, checking at runtime that it isn't called more than -// once, since the user has declared their intent to do so by using ByMove. -template <typename T> -class ReturnAction<ByMoveWrapper<T>> final { - public: - explicit ReturnAction(ByMoveWrapper<T> wrapper) - : state_(new State(std::move(wrapper.payload))) {} - - T operator()() const { - GTEST_CHECK_(!state_->called) - << "A ByMove() action must be performed at most once."; - - state_->called = true; - return std::move(state_->value); - } - - private: - // We store our state on the heap so that we are copyable as required by - // Action, despite the fact that we are stateful and T may not be copyable. - struct State { - explicit State(T&& value_in) : value(std::move(value_in)) {} - - T value; - bool called = false; - }; - - const std::shared_ptr<State> state_; -}; - -// Implements the ReturnNull() action. -class ReturnNullAction { - public: - // Allows ReturnNull() to be used in any pointer-returning function. In C++11 - // this is enforced by returning nullptr, and in non-C++11 by asserting a - // pointer type on compile time. - template <typename Result, typename ArgumentTuple> - static Result Perform(const ArgumentTuple&) { - return nullptr; - } -}; - -// Implements the Return() action. -class ReturnVoidAction { - public: - // Allows Return() to be used in any void-returning function. - template <typename Result, typename ArgumentTuple> - static void Perform(const ArgumentTuple&) { - static_assert(std::is_void<Result>::value, "Result should be void."); - } -}; - -// Implements the polymorphic ReturnRef(x) action, which can be used -// in any function that returns a reference to the type of x, -// regardless of the argument types. 
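// ---------------------------------------------------------------------------
// [Editor's sketch] Usage of the return-action machinery discussed above and
// defined just below: Return(ByMove(x)) hands out a move-only value at most
// once (hence WillOnce), while ReturnRef(x) returns a reference to a
// caller-owned object. Not part of the deleted file.
#include <memory>
#include <gmock/gmock.h>
#include <gtest/gtest.h>

TEST(ReturnSketch, ByMoveAndReturnRef) {
  using testing::ByMove;
  using testing::Return;
  using testing::ReturnRef;

  testing::MockFunction<std::unique_ptr<int>()> make;
  EXPECT_CALL(make, Call())
      .WillOnce(Return(ByMove(std::make_unique<int>(7))));
  EXPECT_EQ(7, *make.AsStdFunction()());

  int shared_state = 0;
  testing::MockFunction<int&()> get;
  EXPECT_CALL(get, Call()).WillRepeatedly(ReturnRef(shared_state));
  get.AsStdFunction()() = 5;  // writes through the returned reference
  EXPECT_EQ(5, shared_state);
}
// ---------------------------------------------------------------------------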
-template <typename T> -class ReturnRefAction { - public: - // Constructs a ReturnRefAction object from the reference to be returned. - explicit ReturnRefAction(T& ref) : ref_(ref) {} // NOLINT - - // This template type conversion operator allows ReturnRef(x) to be - // used in ANY function that returns a reference to x's type. - template <typename F> - operator Action<F>() const { - typedef typename Function<F>::Result Result; - // Asserts that the function return type is a reference. This - // catches the user error of using ReturnRef(x) when Return(x) - // should be used, and generates some helpful error message. - static_assert(std::is_reference<Result>::value, - "use Return instead of ReturnRef to return a value"); - return Action<F>(new Impl<F>(ref_)); - } - - private: - // Implements the ReturnRef(x) action for a particular function type F. - template <typename F> - class Impl : public ActionInterface<F> { - public: - typedef typename Function<F>::Result Result; - typedef typename Function<F>::ArgumentTuple ArgumentTuple; - - explicit Impl(T& ref) : ref_(ref) {} // NOLINT - - Result Perform(const ArgumentTuple&) override { return ref_; } - - private: - T& ref_; - }; - - T& ref_; -}; - -// Implements the polymorphic ReturnRefOfCopy(x) action, which can be -// used in any function that returns a reference to the type of x, -// regardless of the argument types. -template <typename T> -class ReturnRefOfCopyAction { - public: - // Constructs a ReturnRefOfCopyAction object from the reference to - // be returned. - explicit ReturnRefOfCopyAction(const T& value) : value_(value) {} // NOLINT - - // This template type conversion operator allows ReturnRefOfCopy(x) to be - // used in ANY function that returns a reference to x's type. - template <typename F> - operator Action<F>() const { - typedef typename Function<F>::Result Result; - // Asserts that the function return type is a reference. This - // catches the user error of using ReturnRefOfCopy(x) when Return(x) - // should be used, and generates some helpful error message. - static_assert(std::is_reference<Result>::value, - "use Return instead of ReturnRefOfCopy to return a value"); - return Action<F>(new Impl<F>(value_)); - } - - private: - // Implements the ReturnRefOfCopy(x) action for a particular function type F. - template <typename F> - class Impl : public ActionInterface<F> { - public: - typedef typename Function<F>::Result Result; - typedef typename Function<F>::ArgumentTuple ArgumentTuple; - - explicit Impl(const T& value) : value_(value) {} // NOLINT - - Result Perform(const ArgumentTuple&) override { return value_; } - - private: - T value_; - }; - - const T value_; -}; - -// Implements the polymorphic ReturnRoundRobin(v) action, which can be -// used in any function that returns the element_type of v. -template <typename T> -class ReturnRoundRobinAction { - public: - explicit ReturnRoundRobinAction(std::vector<T> values) { - GTEST_CHECK_(!values.empty()) - << "ReturnRoundRobin requires at least one element."; - state_->values = std::move(values); - } - - template <typename... Args> - T operator()(Args&&...) const { - return state_->Next(); - } - - private: - struct State { - T Next() { - T ret_val = values[i++]; - if (i == values.size()) i = 0; - return ret_val; - } - - std::vector<T> values; - size_t i = 0; - }; - std::shared_ptr<State> state_ = std::make_shared<State>(); -}; - -// Implements the polymorphic DoDefault() action. 
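// ---------------------------------------------------------------------------
// [Editor's sketch] DoDefault() (implemented just below) defers to the
// default action configured with ON_CALL, and ReturnRoundRobin (above)
// cycles through a fixed list of values. A hypothetical example, not part of
// the deleted file.
#include <gmock/gmock.h>
#include <gtest/gtest.h>

TEST(DoDefaultSketch, UsesOnCallDefault) {
  testing::MockFunction<int(int)> mock;
  ON_CALL(mock, Call(testing::_)).WillByDefault(testing::Return(-1));
  EXPECT_CALL(mock, Call(3)).WillOnce(testing::DoDefault());
  EXPECT_EQ(-1, mock.AsStdFunction()(3));
}

TEST(RoundRobinSketch, CyclesThroughValues) {
  testing::MockFunction<int()> mock;
  EXPECT_CALL(mock, Call())
      .WillRepeatedly(testing::ReturnRoundRobin<int>({1, 2}));
  EXPECT_EQ(1, mock.AsStdFunction()());
  EXPECT_EQ(2, mock.AsStdFunction()());
  EXPECT_EQ(1, mock.AsStdFunction()());  // wraps around to the start
}
// ---------------------------------------------------------------------------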
-class DoDefaultAction { - public: - // This template type conversion operator allows DoDefault() to be - // used in any function. - template <typename F> - operator Action<F>() const { - return Action<F>(); - } // NOLINT -}; - -// Implements the Assign action to set a given pointer referent to a -// particular value. -template <typename T1, typename T2> -class AssignAction { - public: - AssignAction(T1* ptr, T2 value) : ptr_(ptr), value_(value) {} - - template <typename Result, typename ArgumentTuple> - void Perform(const ArgumentTuple& /* args */) const { - *ptr_ = value_; - } - - private: - T1* const ptr_; - const T2 value_; -}; - -#if !GTEST_OS_WINDOWS_MOBILE - -// Implements the SetErrnoAndReturn action to simulate return from -// various system calls and libc functions. -template <typename T> -class SetErrnoAndReturnAction { - public: - SetErrnoAndReturnAction(int errno_value, T result) - : errno_(errno_value), result_(result) {} - template <typename Result, typename ArgumentTuple> - Result Perform(const ArgumentTuple& /* args */) const { - errno = errno_; - return result_; - } - - private: - const int errno_; - const T result_; -}; - -#endif // !GTEST_OS_WINDOWS_MOBILE - -// Implements the SetArgumentPointee<N>(x) action for any function -// whose N-th argument (0-based) is a pointer to x's type. -template <size_t N, typename A, typename = void> -struct SetArgumentPointeeAction { - A value; - - template <typename... Args> - void operator()(const Args&... args) const { - *::std::get<N>(std::tie(args...)) = value; - } -}; - -// Implements the Invoke(object_ptr, &Class::Method) action. -template <class Class, typename MethodPtr> -struct InvokeMethodAction { - Class* const obj_ptr; - const MethodPtr method_ptr; - - template <typename... Args> - auto operator()(Args&&... args) const - -> decltype((obj_ptr->*method_ptr)(std::forward<Args>(args)...)) { - return (obj_ptr->*method_ptr)(std::forward<Args>(args)...); - } -}; - -// Implements the InvokeWithoutArgs(f) action. The template argument -// FunctionImpl is the implementation type of f, which can be either a -// function pointer or a functor. InvokeWithoutArgs(f) can be used as an -// Action<F> as long as f's type is compatible with F. -template <typename FunctionImpl> -struct InvokeWithoutArgsAction { - FunctionImpl function_impl; - - // Allows InvokeWithoutArgs(f) to be used as any action whose type is - // compatible with f. - template <typename... Args> - auto operator()(const Args&...) -> decltype(function_impl()) { - return function_impl(); - } -}; - -// Implements the InvokeWithoutArgs(object_ptr, &Class::Method) action. -template <class Class, typename MethodPtr> -struct InvokeMethodWithoutArgsAction { - Class* const obj_ptr; - const MethodPtr method_ptr; - - using ReturnType = - decltype((std::declval<Class*>()->*std::declval<MethodPtr>())()); - - template <typename... Args> - ReturnType operator()(const Args&...) const { - return (obj_ptr->*method_ptr)(); - } -}; - -// Implements the IgnoreResult(action) action. -template <typename A> -class IgnoreResultAction { - public: - explicit IgnoreResultAction(const A& action) : action_(action) {} - - template <typename F> - operator Action<F>() const { - // Assert statement belongs here because this is the best place to verify - // conditions on F. It produces the clearest error messages - // in most compilers. - // Impl really belongs in this scope as a local class but can't - // because MSVC produces duplicate symbols in different translation units - // in this case. 
Until MS fixes that bug we put Impl into the class scope
-    // and put the typedef both here (for use in the assert statement) and
-    // in the Impl class. But both definitions must be the same.
-    typedef typename internal::Function<F>::Result Result;
-
-    // Asserts at compile time that F returns void.
-    static_assert(std::is_void<Result>::value, "Result type should be void.");
-
-    return Action<F>(new Impl<F>(action_));
-  }
-
- private:
-  template <typename F>
-  class Impl : public ActionInterface<F> {
-   public:
-    typedef typename internal::Function<F>::Result Result;
-    typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
-
-    explicit Impl(const A& action) : action_(action) {}
-
-    void Perform(const ArgumentTuple& args) override {
-      // Performs the action and ignores its result.
-      action_.Perform(args);
-    }
-
-   private:
-    // Type OriginalFunction is the same as F except that its return
-    // type is IgnoredValue.
-    typedef
-        typename internal::Function<F>::MakeResultIgnoredValue OriginalFunction;
-
-    const Action<OriginalFunction> action_;
-  };
-
-  const A action_;
-};
-
-template <typename InnerAction, size_t... I>
-struct WithArgsAction {
-  InnerAction inner_action;
-
-  // The signature of the function as seen by the inner action, given an outer
-  // action with the given result and argument types.
-  template <typename R, typename... Args>
-  using InnerSignature =
-      R(typename std::tuple_element<I, std::tuple<Args...>>::type...);
-
-  // Rather than a call operator, we must define conversion operators to
-  // particular action types. This is necessary for embedded actions like
-  // DoDefault(), which rely on action conversion operators rather than
-  // providing a call operator because even with a particular set of arguments
-  // they don't have a fixed return type.
-
-  template <typename R, typename... Args,
-            typename std::enable_if<
-                std::is_convertible<
-                    InnerAction,
-                    // Unfortunately we can't use the InnerSignature alias here;
-                    // MSVC complains about the I parameter pack not being
-                    // expanded (error C3520) despite it being expanded in the
-                    // type alias.
-                    OnceAction<R(typename std::tuple_element<
-                                 I, std::tuple<Args...>>::type...)>>::value,
-                int>::type = 0>
-  operator OnceAction<R(Args...)>() && {  // NOLINT
-    struct OA {
-      OnceAction<InnerSignature<R, Args...>> inner_action;
-
-      R operator()(Args&&... args) && {
-        return std::move(inner_action)
-            .Call(std::get<I>(
-                std::forward_as_tuple(std::forward<Args>(args)...))...);
-      }
-    };
-
-    return OA{std::move(inner_action)};
-  }
-
-  template <typename R, typename... Args,
-            typename std::enable_if<
-                std::is_convertible<
-                    const InnerAction&,
-                    // Unfortunately we can't use the InnerSignature alias here;
-                    // MSVC complains about the I parameter pack not being
-                    // expanded (error C3520) despite it being expanded in the
-                    // type alias.
-                    Action<R(typename std::tuple_element<
-                             I, std::tuple<Args...>>::type...)>>::value,
-                int>::type = 0>
-  operator Action<R(Args...)>() const {  // NOLINT
-    Action<InnerSignature<R, Args...>> converted(inner_action);
-
-    return [converted](Args&&... args) -> R {
-      return converted.Perform(std::forward_as_tuple(
-          std::get<I>(std::forward_as_tuple(std::forward<Args>(args)...))...));
-    };
-  }
-};
-
-template <typename... Actions>
-class DoAllAction;
-
-// Base case: only a single action.
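-//
-// The final action alone is asked to produce the mock function's return
-// value; the recursive case below layers void-returning initial actions in
-// front of it. A hypothetical sketch, for a MockFunction<int(int&)> named
-// `mock` (the names are illustrative only):
-//
-//   EXPECT_CALL(mock, Call(_))
-//       .WillOnce(DoAll(SetArgReferee<0>(17),  // initial action: void view
-//                       Return(19)));          // final action: produces int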
-template <typename FinalAction>
-class DoAllAction<FinalAction> {
- public:
-  struct UserConstructorTag {};
-
-  template <typename T>
-  explicit DoAllAction(UserConstructorTag, T&& action)
-      : final_action_(std::forward<T>(action)) {}
-
-  // Rather than a call operator, we must define conversion operators to
-  // particular action types. This is necessary for embedded actions like
-  // DoDefault(), which rely on action conversion operators rather than
-  // providing a call operator because even with a particular set of arguments
-  // they don't have a fixed return type.
-
-  template <typename R, typename... Args,
-            typename std::enable_if<
-                std::is_convertible<FinalAction, OnceAction<R(Args...)>>::value,
-                int>::type = 0>
-  operator OnceAction<R(Args...)>() && {  // NOLINT
-    return std::move(final_action_);
-  }
-
-  template <
-      typename R, typename... Args,
-      typename std::enable_if<
-          std::is_convertible<const FinalAction&, Action<R(Args...)>>::value,
-          int>::type = 0>
-  operator Action<R(Args...)>() const {  // NOLINT
-    return final_action_;
-  }
-
- private:
-  FinalAction final_action_;
-};
-
-// Recursive case: support N actions by calling the initial action and then
-// calling through to the base class containing N-1 actions.
-template <typename InitialAction, typename... OtherActions>
-class DoAllAction<InitialAction, OtherActions...>
-    : private DoAllAction<OtherActions...> {
- private:
-  using Base = DoAllAction<OtherActions...>;
-
-  // The type of reference that should be provided to an initial action for a
-  // mocked function parameter of type T.
-  //
-  // There are two quirks here:
-  //
-  //  *  Unlike most forwarding functions, we pass scalars through by value.
-  //     This isn't strictly necessary because an lvalue reference would work
-  //     fine too and be consistent with other non-reference types, but it's
-  //     perhaps less surprising.
-  //
-  //     For example if the mocked function has signature void(int), then it
-  //     might seem surprising for the user's initial action to need to be
-  //     convertible to Action<void(const int&)>. This is perhaps less
-  //     surprising for a non-scalar type where there may be a performance
-  //     impact, or it might even be impossible, to pass by value.
-  //
-  //  *  More surprisingly, `const T&` is often not a const reference type.
-  //     By the reference collapsing rules in C++17 [dcl.ref]/6, if T refers to
-  //     U& or U&& for some non-scalar type U, then InitialActionArgType<T> is
-  //     U&. In other words, we may hand over a non-const reference.
-  //
-  //     So for example, given some non-scalar type Obj we have the following
-  //     mappings:
-  //
-  //     T               InitialActionArgType<T>
-  //     -------         -----------------------
-  //     Obj             const Obj&
-  //     Obj&            Obj&
-  //     Obj&&           Obj&
-  //     const Obj       const Obj&
-  //     const Obj&      const Obj&
-  //     const Obj&&     const Obj&
-  //
-  //     In other words, the initial actions get a mutable view of a non-scalar
-  //     argument if and only if the mock function itself accepts a non-const
-  //     reference type. They are never given an rvalue reference to a
-  //     non-scalar type.
-  //
-  //     This situation makes sense if you imagine use with an action that is
-  //     designed to write through a reference. 
For example, if the caller wants - // to fill in a reference argument and then return a canned value: - // - // EXPECT_CALL(mock, Call) - // .WillOnce(DoAll(SetArgReferee<0>(17), Return(19))); - // - template <typename T> - using InitialActionArgType = - typename std::conditional<std::is_scalar<T>::value, T, const T&>::type; - - public: - struct UserConstructorTag {}; - - template <typename T, typename... U> - explicit DoAllAction(UserConstructorTag, T&& initial_action, - U&&... other_actions) - : Base({}, std::forward<U>(other_actions)...), - initial_action_(std::forward<T>(initial_action)) {} - - template <typename R, typename... Args, - typename std::enable_if< - conjunction< - // Both the initial action and the rest must support - // conversion to OnceAction. - std::is_convertible< - InitialAction, - OnceAction<void(InitialActionArgType<Args>...)>>, - std::is_convertible<Base, OnceAction<R(Args...)>>>::value, - int>::type = 0> - operator OnceAction<R(Args...)>() && { // NOLINT - // Return an action that first calls the initial action with arguments - // filtered through InitialActionArgType, then forwards arguments directly - // to the base class to deal with the remaining actions. - struct OA { - OnceAction<void(InitialActionArgType<Args>...)> initial_action; - OnceAction<R(Args...)> remaining_actions; - - R operator()(Args... args) && { - std::move(initial_action) - .Call(static_cast<InitialActionArgType<Args>>(args)...); - - return std::move(remaining_actions).Call(std::forward<Args>(args)...); - } - }; - - return OA{ - std::move(initial_action_), - std::move(static_cast<Base&>(*this)), - }; - } - - template < - typename R, typename... Args, - typename std::enable_if< - conjunction< - // Both the initial action and the rest must support conversion to - // Action. - std::is_convertible<const InitialAction&, - Action<void(InitialActionArgType<Args>...)>>, - std::is_convertible<const Base&, Action<R(Args...)>>>::value, - int>::type = 0> - operator Action<R(Args...)>() const { // NOLINT - // Return an action that first calls the initial action with arguments - // filtered through InitialActionArgType, then forwards arguments directly - // to the base class to deal with the remaining actions. - struct OA { - Action<void(InitialActionArgType<Args>...)> initial_action; - Action<R(Args...)> remaining_actions; - - R operator()(Args... args) const { - initial_action.Perform(std::forward_as_tuple( - static_cast<InitialActionArgType<Args>>(args)...)); - - return remaining_actions.Perform( - std::forward_as_tuple(std::forward<Args>(args)...)); - } - }; - - return OA{ - initial_action_, - static_cast<const Base&>(*this), - }; - } - - private: - InitialAction initial_action_; -}; - -template <typename T, typename... Params> -struct ReturnNewAction { - T* operator()() const { - return internal::Apply( - [](const Params&... unpacked_params) { - return new T(unpacked_params...); - }, - params); - } - std::tuple<Params...> params; -}; - -template <size_t k> -struct ReturnArgAction { - template <typename... Args, - typename = typename std::enable_if<(k < sizeof...(Args))>::type> - auto operator()(Args&&... args) const -> decltype(std::get<k>( - std::forward_as_tuple(std::forward<Args>(args)...))) { - return std::get<k>(std::forward_as_tuple(std::forward<Args>(args)...)); - } -}; - -template <size_t k, typename Ptr> -struct SaveArgAction { - Ptr pointer; - - template <typename... Args> - void operator()(const Args&... 
args) const { - *pointer = std::get<k>(std::tie(args...)); - } -}; - -template <size_t k, typename Ptr> -struct SaveArgPointeeAction { - Ptr pointer; - - template <typename... Args> - void operator()(const Args&... args) const { - *pointer = *std::get<k>(std::tie(args...)); - } -}; - -template <size_t k, typename T> -struct SetArgRefereeAction { - T value; - - template <typename... Args> - void operator()(Args&&... args) const { - using argk_type = - typename ::std::tuple_element<k, std::tuple<Args...>>::type; - static_assert(std::is_lvalue_reference<argk_type>::value, - "Argument must be a reference type."); - std::get<k>(std::tie(args...)) = value; - } -}; - -template <size_t k, typename I1, typename I2> -struct SetArrayArgumentAction { - I1 first; - I2 last; - - template <typename... Args> - void operator()(const Args&... args) const { - auto value = std::get<k>(std::tie(args...)); - for (auto it = first; it != last; ++it, (void)++value) { - *value = *it; - } - } -}; - -template <size_t k> -struct DeleteArgAction { - template <typename... Args> - void operator()(const Args&... args) const { - delete std::get<k>(std::tie(args...)); - } -}; - -template <typename Ptr> -struct ReturnPointeeAction { - Ptr pointer; - template <typename... Args> - auto operator()(const Args&...) const -> decltype(*pointer) { - return *pointer; - } -}; - -#if GTEST_HAS_EXCEPTIONS -template <typename T> -struct ThrowAction { - T exception; - // We use a conversion operator to adapt to any return type. - template <typename R, typename... Args> - operator Action<R(Args...)>() const { // NOLINT - T copy = exception; - return [copy](Args...) -> R { throw copy; }; - } -}; -#endif // GTEST_HAS_EXCEPTIONS - -} // namespace internal - -// An Unused object can be implicitly constructed from ANY value. -// This is handy when defining actions that ignore some or all of the -// mock function arguments. For example, given -// -// MOCK_METHOD3(Foo, double(const string& label, double x, double y)); -// MOCK_METHOD3(Bar, double(int index, double x, double y)); -// -// instead of -// -// double DistanceToOriginWithLabel(const string& label, double x, double y) { -// return sqrt(x*x + y*y); -// } -// double DistanceToOriginWithIndex(int index, double x, double y) { -// return sqrt(x*x + y*y); -// } -// ... -// EXPECT_CALL(mock, Foo("abc", _, _)) -// .WillOnce(Invoke(DistanceToOriginWithLabel)); -// EXPECT_CALL(mock, Bar(5, _, _)) -// .WillOnce(Invoke(DistanceToOriginWithIndex)); -// -// you could write -// -// // We can declare any uninteresting argument as Unused. -// double DistanceToOrigin(Unused, double x, double y) { -// return sqrt(x*x + y*y); -// } -// ... -// EXPECT_CALL(mock, Foo("abc", _, _)).WillOnce(Invoke(DistanceToOrigin)); -// EXPECT_CALL(mock, Bar(5, _, _)).WillOnce(Invoke(DistanceToOrigin)); -typedef internal::IgnoredValue Unused; - -// Creates an action that does actions a1, a2, ..., sequentially in -// each invocation. All but the last action will have a readonly view of the -// arguments. -template <typename... Action> -internal::DoAllAction<typename std::decay<Action>::type...> DoAll( - Action&&... action) { - return internal::DoAllAction<typename std::decay<Action>::type...>( - {}, std::forward<Action>(action)...); -} - -// WithArg<k>(an_action) creates an action that passes the k-th -// (0-based) argument of the mock function to an_action and performs -// it. It adapts an action accepting one argument to one that accepts -// multiple arguments. 
For convenience, we also provide -// WithArgs<k>(an_action) (defined below) as a synonym. -template <size_t k, typename InnerAction> -internal::WithArgsAction<typename std::decay<InnerAction>::type, k> WithArg( - InnerAction&& action) { - return {std::forward<InnerAction>(action)}; -} - -// WithArgs<N1, N2, ..., Nk>(an_action) creates an action that passes -// the selected arguments of the mock function to an_action and -// performs it. It serves as an adaptor between actions with -// different argument lists. -template <size_t k, size_t... ks, typename InnerAction> -internal::WithArgsAction<typename std::decay<InnerAction>::type, k, ks...> -WithArgs(InnerAction&& action) { - return {std::forward<InnerAction>(action)}; -} - -// WithoutArgs(inner_action) can be used in a mock function with a -// non-empty argument list to perform inner_action, which takes no -// argument. In other words, it adapts an action accepting no -// argument to one that accepts (and ignores) arguments. -template <typename InnerAction> -internal::WithArgsAction<typename std::decay<InnerAction>::type> WithoutArgs( - InnerAction&& action) { - return {std::forward<InnerAction>(action)}; -} - -// Creates an action that returns a value. -// -// The returned type can be used with a mock function returning a non-void, -// non-reference type U as follows: -// -// * If R is convertible to U and U is move-constructible, then the action can -// be used with WillOnce. -// -// * If const R& is convertible to U and U is copy-constructible, then the -// action can be used with both WillOnce and WillRepeatedly. -// -// The mock expectation contains the R value from which the U return value is -// constructed (a move/copy of the argument to Return). This means that the R -// value will survive at least until the mock object's expectations are cleared -// or the mock object is destroyed, meaning that U can safely be a -// reference-like type such as std::string_view: -// -// // The mock function returns a view of a copy of the string fed to -// // Return. The view is valid even after the action is performed. -// MockFunction<std::string_view()> mock; -// EXPECT_CALL(mock, Call).WillOnce(Return(std::string("taco"))); -// const std::string_view result = mock.AsStdFunction()(); -// EXPECT_EQ("taco", result); -// -template <typename R> -internal::ReturnAction<R> Return(R value) { - return internal::ReturnAction<R>(std::move(value)); -} - -// Creates an action that returns NULL. -inline PolymorphicAction<internal::ReturnNullAction> ReturnNull() { - return MakePolymorphicAction(internal::ReturnNullAction()); -} - -// Creates an action that returns from a void function. -inline PolymorphicAction<internal::ReturnVoidAction> Return() { - return MakePolymorphicAction(internal::ReturnVoidAction()); -} - -// Creates an action that returns the reference to a variable. -template <typename R> -inline internal::ReturnRefAction<R> ReturnRef(R& x) { // NOLINT - return internal::ReturnRefAction<R>(x); -} - -// Prevent using ReturnRef on reference to temporary. -template <typename R, R* = nullptr> -internal::ReturnRefAction<R> ReturnRef(R&&) = delete; - -// Creates an action that returns the reference to a copy of the -// argument. The copy is created when the action is constructed and -// lives as long as the action. -template <typename R> -inline internal::ReturnRefOfCopyAction<R> ReturnRefOfCopy(const R& x) { - return internal::ReturnRefOfCopyAction<R>(x); -} - -// DEPRECATED: use Return(x) directly with WillOnce. 
-//
-// Modifies the parent action (a Return() action) to perform a move of the
-// argument instead of a copy.
-// Return(ByMove()) actions can only be executed once and will assert this
-// invariant.
-template <typename R>
-internal::ByMoveWrapper<R> ByMove(R x) {
-  return internal::ByMoveWrapper<R>(std::move(x));
-}
-
-// Creates an action that returns an element of `vals`. Each call to the
-// action returns the next value from `vals`, restarting from the beginning
-// once the end is reached.
-template <typename T>
-internal::ReturnRoundRobinAction<T> ReturnRoundRobin(std::vector<T> vals) {
-  return internal::ReturnRoundRobinAction<T>(std::move(vals));
-}
-
-// Creates an action that returns an element of `vals`. Each call to the
-// action returns the next value from `vals`, restarting from the beginning
-// once the end is reached.
-template <typename T>
-internal::ReturnRoundRobinAction<T> ReturnRoundRobin(
-    std::initializer_list<T> vals) {
-  return internal::ReturnRoundRobinAction<T>(std::vector<T>(vals));
-}
-
-// Creates an action that does the default action for the given mock function.
-inline internal::DoDefaultAction DoDefault() {
-  return internal::DoDefaultAction();
-}
-
-// Creates an action that sets the variable pointed to by the N-th
-// (0-based) function argument to 'value'.
-template <size_t N, typename T>
-internal::SetArgumentPointeeAction<N, T> SetArgPointee(T value) {
-  return {std::move(value)};
-}
-
-// The following version is DEPRECATED.
-template <size_t N, typename T>
-internal::SetArgumentPointeeAction<N, T> SetArgumentPointee(T value) {
-  return {std::move(value)};
-}
-
-// Creates an action that sets a pointer referent to a given value.
-template <typename T1, typename T2>
-PolymorphicAction<internal::AssignAction<T1, T2>> Assign(T1* ptr, T2 val) {
-  return MakePolymorphicAction(internal::AssignAction<T1, T2>(ptr, val));
-}
-
-#if !GTEST_OS_WINDOWS_MOBILE
-
-// Creates an action that sets errno and returns the appropriate error.
-template <typename T>
-PolymorphicAction<internal::SetErrnoAndReturnAction<T>> SetErrnoAndReturn(
-    int errval, T result) {
-  return MakePolymorphicAction(
-      internal::SetErrnoAndReturnAction<T>(errval, result));
-}
-
-#endif  // !GTEST_OS_WINDOWS_MOBILE
-
-// Various overloads for Invoke().
-
-// Legacy function.
-// Actions can now be implicitly constructed from callables. No need to create
-// wrapper objects.
-// This function exists for backwards compatibility.
-template <typename FunctionImpl>
-typename std::decay<FunctionImpl>::type Invoke(FunctionImpl&& function_impl) {
-  return std::forward<FunctionImpl>(function_impl);
-}
-
-// Creates an action that invokes the given method on the given object
-// with the mock function's arguments.
-template <class Class, typename MethodPtr>
-internal::InvokeMethodAction<Class, MethodPtr> Invoke(Class* obj_ptr,
-                                                      MethodPtr method_ptr) {
-  return {obj_ptr, method_ptr};
-}
-
-// Creates an action that invokes 'function_impl' with no argument.
-template <typename FunctionImpl>
-internal::InvokeWithoutArgsAction<typename std::decay<FunctionImpl>::type>
-InvokeWithoutArgs(FunctionImpl function_impl) {
-  return {std::move(function_impl)};
-}
-
-// Creates an action that invokes the given method on the given object
-// with no argument.
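-//
-// For example (a hypothetical sketch; `Helper` and `mock` are illustrative
-// names, with `mock` a MockFunction<void(int)> and Helper::Reset a void()
-// member function):
-//
-//   Helper helper;
-//   EXPECT_CALL(mock, Call(_))
-//       .WillOnce(InvokeWithoutArgs(&helper, &Helper::Reset));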
-template <class Class, typename MethodPtr>
-internal::InvokeMethodWithoutArgsAction<Class, MethodPtr> InvokeWithoutArgs(
-    Class* obj_ptr, MethodPtr method_ptr) {
-  return {obj_ptr, method_ptr};
-}
-
-// Creates an action that performs an_action and throws away its
-// result. In other words, it changes the return type of an_action to
-// void. an_action MUST NOT return void, or the code won't compile.
-template <typename A>
-inline internal::IgnoreResultAction<A> IgnoreResult(const A& an_action) {
-  return internal::IgnoreResultAction<A>(an_action);
-}
-
-// Creates a reference wrapper for the given L-value. If necessary,
-// you can explicitly specify the type of the reference. For example,
-// suppose 'derived' is an object of type Derived; then ByRef(derived)
-// would wrap a Derived&. If you want to wrap a const Base& instead,
-// where Base is a base class of Derived, just write:
-//
-//   ByRef<const Base>(derived)
-//
-// N.B. ByRef is redundant with std::ref, std::cref and std::reference_wrapper.
-// However, it may still be used for consistency with ByMove().
-template <typename T>
-inline ::std::reference_wrapper<T> ByRef(T& l_value) {  // NOLINT
-  return ::std::reference_wrapper<T>(l_value);
-}
-
-// The ReturnNew<T>(a1, a2, ..., a_k) action returns a pointer to a new
-// instance of type T, constructed on the heap with constructor arguments
-// a1, a2, ..., and a_k. The caller assumes ownership of the returned value.
-template <typename T, typename... Params>
-internal::ReturnNewAction<T, typename std::decay<Params>::type...> ReturnNew(
-    Params&&... params) {
-  return {std::forward_as_tuple(std::forward<Params>(params)...)};
-}
-
-// Action ReturnArg<k>() returns the k-th argument of the mock function.
-template <size_t k>
-internal::ReturnArgAction<k> ReturnArg() {
-  return {};
-}
-
-// Action SaveArg<k>(pointer) saves the k-th (0-based) argument of the
-// mock function to *pointer.
-template <size_t k, typename Ptr>
-internal::SaveArgAction<k, Ptr> SaveArg(Ptr pointer) {
-  return {pointer};
-}
-
-// Action SaveArgPointee<k>(pointer) saves the value pointed to
-// by the k-th (0-based) argument of the mock function to *pointer.
-template <size_t k, typename Ptr>
-internal::SaveArgPointeeAction<k, Ptr> SaveArgPointee(Ptr pointer) {
-  return {pointer};
-}
-
-// Action SetArgReferee<k>(value) assigns 'value' to the variable
-// referenced by the k-th (0-based) argument of the mock function.
-template <size_t k, typename T>
-internal::SetArgRefereeAction<k, typename std::decay<T>::type> SetArgReferee(
-    T&& value) {
-  return {std::forward<T>(value)};
-}
-
-// Action SetArrayArgument<k>(first, last) copies the elements in
-// source range [first, last) to the array pointed to by the k-th
-// (0-based) argument, which can be either a pointer or an
-// iterator. The action does not take ownership of the elements in the
-// source range.
-template <size_t k, typename I1, typename I2>
-internal::SetArrayArgumentAction<k, I1, I2> SetArrayArgument(I1 first,
-                                                             I2 last) {
-  return {first, last};
-}
-
-// Action DeleteArg<k>() deletes the k-th (0-based) argument of the mock
-// function.
-template <size_t k>
-internal::DeleteArgAction<k> DeleteArg() {
-  return {};
-}
-
-// This action returns the value pointed to by 'pointer'.
-template <typename Ptr>
-internal::ReturnPointeeAction<Ptr> ReturnPointee(Ptr pointer) {
-  return {pointer};
-}
-
-// Action Throw(exception) can be used in a mock function of any type
-// to throw the given exception. Any copyable value can be thrown.
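-//
-// For instance (a hypothetical sketch, assuming exceptions are enabled and
-// `mock` is a MockFunction<int()>):
-//
-//   EXPECT_CALL(mock, Call).WillOnce(Throw(std::runtime_error("oops")));
-//   EXPECT_THROW(mock.AsStdFunction()(), std::runtime_error);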
-#if GTEST_HAS_EXCEPTIONS
-template <typename T>
-internal::ThrowAction<typename std::decay<T>::type> Throw(T&& exception) {
-  return {std::forward<T>(exception)};
-}
-#endif  // GTEST_HAS_EXCEPTIONS
-
-namespace internal {
-
-// A macro from the ACTION* family (defined later in this file)
-// defines an action that can be used in a mock function. Typically,
-// these actions only care about a subset of the arguments of the mock
-// function. For example, if such an action only uses the second
-// argument, it can be used in any mock function that takes >= 2
-// arguments where the type of the second argument is compatible.
-//
-// Therefore, the action implementation must be prepared to take more
-// arguments than it needs. The ExcessiveArg type is used to
-// represent those excessive arguments. In order to keep the compiler
-// error messages tractable, we define it in the testing namespace
-// instead of testing::internal. However, this is an INTERNAL TYPE
-// and subject to change without notice, so a user MUST NOT USE THIS
-// TYPE DIRECTLY.
-struct ExcessiveArg {};
-
-// Builds an implementation of an Action<> for some particular signature, using
-// a class defined by an ACTION* macro.
-template <typename F, typename Impl>
-struct ActionImpl;
-
-template <typename Impl>
-struct ImplBase {
-  struct Holder {
-    // Allows each copy of the Action<> to get to the Impl.
-    explicit operator const Impl&() const { return *ptr; }
-    std::shared_ptr<Impl> ptr;
-  };
-  using type = typename std::conditional<std::is_constructible<Impl>::value,
-                                         Impl, Holder>::type;
-};
-
-template <typename R, typename... Args, typename Impl>
-struct ActionImpl<R(Args...), Impl> : ImplBase<Impl>::type {
-  using Base = typename ImplBase<Impl>::type;
-  using function_type = R(Args...);
-  using args_type = std::tuple<Args...>;
-
-  ActionImpl() = default;  // Only defined if appropriate for Base.
-  explicit ActionImpl(std::shared_ptr<Impl> impl) : Base{std::move(impl)} {}
-
-  R operator()(Args&&... arg) const {
-    static constexpr size_t kMaxArgs =
-        sizeof...(Args) <= 10 ? sizeof...(Args) : 10;
-    return Apply(MakeIndexSequence<kMaxArgs>{},
-                 MakeIndexSequence<10 - kMaxArgs>{},
-                 args_type{std::forward<Args>(arg)...});
-  }
-
-  template <std::size_t... arg_id, std::size_t... excess_id>
-  R Apply(IndexSequence<arg_id...>, IndexSequence<excess_id...>,
-          const args_type& args) const {
-    // Impl need not be specific to the signature of action being implemented;
-    // only the implementing function body needs to have all of the specific
-    // types instantiated. Up to 10 of the args that are provided by the
-    // args_type get passed, followed by a dummy of unspecified type for the
-    // remainder up to 10 explicit args.
-    static constexpr ExcessiveArg kExcessArg{};
-    return static_cast<const Impl&>(*this)
-        .template gmock_PerformImpl<
-            /*function_type=*/function_type, /*return_type=*/R,
-            /*args_type=*/args_type,
-            /*argN_type=*/
-            typename std::tuple_element<arg_id, args_type>::type...>(
-            /*args=*/args, std::get<arg_id>(args)...,
-            ((void)excess_id, kExcessArg)...);
-  }
-};
-
-// Stores a default-constructed Impl as part of the Action<>'s
-// std::function<>. The Impl should be trivial to copy.
-template <typename F, typename Impl>
-::testing::Action<F> MakeAction() {
-  return ::testing::Action<F>(ActionImpl<F, Impl>());
-}
-
-// Stores just the one given instance of Impl.
-template <typename F, typename Impl> -::testing::Action<F> MakeAction(std::shared_ptr<Impl> impl) { - return ::testing::Action<F>(ActionImpl<F, Impl>(std::move(impl))); -} - -#define GMOCK_INTERNAL_ARG_UNUSED(i, data, el) \ - , const arg##i##_type& arg##i GTEST_ATTRIBUTE_UNUSED_ -#define GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_ \ - const args_type& args GTEST_ATTRIBUTE_UNUSED_ GMOCK_PP_REPEAT( \ - GMOCK_INTERNAL_ARG_UNUSED, , 10) - -#define GMOCK_INTERNAL_ARG(i, data, el) , const arg##i##_type& arg##i -#define GMOCK_ACTION_ARG_TYPES_AND_NAMES_ \ - const args_type& args GMOCK_PP_REPEAT(GMOCK_INTERNAL_ARG, , 10) - -#define GMOCK_INTERNAL_TEMPLATE_ARG(i, data, el) , typename arg##i##_type -#define GMOCK_ACTION_TEMPLATE_ARGS_NAMES_ \ - GMOCK_PP_TAIL(GMOCK_PP_REPEAT(GMOCK_INTERNAL_TEMPLATE_ARG, , 10)) - -#define GMOCK_INTERNAL_TYPENAME_PARAM(i, data, param) , typename param##_type -#define GMOCK_ACTION_TYPENAME_PARAMS_(params) \ - GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_TYPENAME_PARAM, , params)) - -#define GMOCK_INTERNAL_TYPE_PARAM(i, data, param) , param##_type -#define GMOCK_ACTION_TYPE_PARAMS_(params) \ - GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_TYPE_PARAM, , params)) - -#define GMOCK_INTERNAL_TYPE_GVALUE_PARAM(i, data, param) \ - , param##_type gmock_p##i -#define GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params) \ - GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_TYPE_GVALUE_PARAM, , params)) - -#define GMOCK_INTERNAL_GVALUE_PARAM(i, data, param) \ - , std::forward<param##_type>(gmock_p##i) -#define GMOCK_ACTION_GVALUE_PARAMS_(params) \ - GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_GVALUE_PARAM, , params)) - -#define GMOCK_INTERNAL_INIT_PARAM(i, data, param) \ - , param(::std::forward<param##_type>(gmock_p##i)) -#define GMOCK_ACTION_INIT_PARAMS_(params) \ - GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_INIT_PARAM, , params)) - -#define GMOCK_INTERNAL_FIELD_PARAM(i, data, param) param##_type param; -#define GMOCK_ACTION_FIELD_PARAMS_(params) \ - GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_FIELD_PARAM, , params) - -#define GMOCK_INTERNAL_ACTION(name, full_name, params) \ - template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \ - class full_name { \ - public: \ - explicit full_name(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) \ - : impl_(std::make_shared<gmock_Impl>( \ - GMOCK_ACTION_GVALUE_PARAMS_(params))) {} \ - full_name(const full_name&) = default; \ - full_name(full_name&&) noexcept = default; \ - template <typename F> \ - operator ::testing::Action<F>() const { \ - return ::testing::internal::MakeAction<F>(impl_); \ - } \ - \ - private: \ - class gmock_Impl { \ - public: \ - explicit gmock_Impl(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) \ - : GMOCK_ACTION_INIT_PARAMS_(params) {} \ - template <typename function_type, typename return_type, \ - typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \ - return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \ - GMOCK_ACTION_FIELD_PARAMS_(params) \ - }; \ - std::shared_ptr<const gmock_Impl> impl_; \ - }; \ - template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \ - inline full_name<GMOCK_ACTION_TYPE_PARAMS_(params)> name( \ - GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) GTEST_MUST_USE_RESULT_; \ - template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \ - inline full_name<GMOCK_ACTION_TYPE_PARAMS_(params)> name( \ - GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) { \ - return full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>( \ - GMOCK_ACTION_GVALUE_PARAMS_(params)); \ - } \ - template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \ - template <typename 
function_type, typename return_type, typename args_type, \ - GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \ - return_type \ - full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>::gmock_Impl::gmock_PerformImpl( \ - GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const - -} // namespace internal - -// Similar to GMOCK_INTERNAL_ACTION, but no bound parameters are stored. -#define ACTION(name) \ - class name##Action { \ - public: \ - explicit name##Action() noexcept {} \ - name##Action(const name##Action&) noexcept {} \ - template <typename F> \ - operator ::testing::Action<F>() const { \ - return ::testing::internal::MakeAction<F, gmock_Impl>(); \ - } \ - \ - private: \ - class gmock_Impl { \ - public: \ - template <typename function_type, typename return_type, \ - typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \ - return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \ - }; \ - }; \ - inline name##Action name() GTEST_MUST_USE_RESULT_; \ - inline name##Action name() { return name##Action(); } \ - template <typename function_type, typename return_type, typename args_type, \ - GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \ - return_type name##Action::gmock_Impl::gmock_PerformImpl( \ - GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const - -#define ACTION_P(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP, (__VA_ARGS__)) - -#define ACTION_P2(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP2, (__VA_ARGS__)) - -#define ACTION_P3(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP3, (__VA_ARGS__)) - -#define ACTION_P4(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP4, (__VA_ARGS__)) - -#define ACTION_P5(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP5, (__VA_ARGS__)) - -#define ACTION_P6(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP6, (__VA_ARGS__)) - -#define ACTION_P7(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP7, (__VA_ARGS__)) - -#define ACTION_P8(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP8, (__VA_ARGS__)) - -#define ACTION_P9(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP9, (__VA_ARGS__)) - -#define ACTION_P10(name, ...) \ - GMOCK_INTERNAL_ACTION(name, name##ActionP10, (__VA_ARGS__)) - -} // namespace testing - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h b/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h deleted file mode 100644 index b6ab648e50..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file implements some commonly used cardinalities. More -// cardinalities can be defined by the user implementing the -// CardinalityInterface interface if necessary. - -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_ - -#include <limits.h> - -#include <memory> -#include <ostream> // NOLINT - -#include "gmock/internal/gmock-port.h" -#include "gtest/gtest.h" - -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ -/* class A needs to have dll-interface to be used by clients of class B */) - -namespace testing { - -// To implement a cardinality Foo, define: -// 1. a class FooCardinality that implements the -// CardinalityInterface interface, and -// 2. a factory function that creates a Cardinality object from a -// const FooCardinality*. -// -// The two-level delegation design follows that of Matcher, providing -// consistency for extension developers. It also eases ownership -// management as Cardinality objects can now be copied like plain values. - -// The implementation of a cardinality. -class CardinalityInterface { - public: - virtual ~CardinalityInterface() {} - - // Conservative estimate on the lower/upper bound of the number of - // calls allowed. - virtual int ConservativeLowerBound() const { return 0; } - virtual int ConservativeUpperBound() const { return INT_MAX; } - - // Returns true if and only if call_count calls will satisfy this - // cardinality. - virtual bool IsSatisfiedByCallCount(int call_count) const = 0; - - // Returns true if and only if call_count calls will saturate this - // cardinality. - virtual bool IsSaturatedByCallCount(int call_count) const = 0; - - // Describes self to an ostream. - virtual void DescribeTo(::std::ostream* os) const = 0; -}; - -// A Cardinality is a copyable and IMMUTABLE (except by assignment) -// object that specifies how many times a mock function is expected to -// be called. The implementation of Cardinality is just a std::shared_ptr -// to const CardinalityInterface. Don't inherit from Cardinality! -class GTEST_API_ Cardinality { - public: - // Constructs a null cardinality. Needed for storing Cardinality - // objects in STL containers. - Cardinality() {} - - // Constructs a Cardinality from its implementation. - explicit Cardinality(const CardinalityInterface* impl) : impl_(impl) {} - - // Conservative estimate on the lower/upper bound of the number of - // calls allowed. 
- int ConservativeLowerBound() const { return impl_->ConservativeLowerBound(); } - int ConservativeUpperBound() const { return impl_->ConservativeUpperBound(); } - - // Returns true if and only if call_count calls will satisfy this - // cardinality. - bool IsSatisfiedByCallCount(int call_count) const { - return impl_->IsSatisfiedByCallCount(call_count); - } - - // Returns true if and only if call_count calls will saturate this - // cardinality. - bool IsSaturatedByCallCount(int call_count) const { - return impl_->IsSaturatedByCallCount(call_count); - } - - // Returns true if and only if call_count calls will over-saturate this - // cardinality, i.e. exceed the maximum number of allowed calls. - bool IsOverSaturatedByCallCount(int call_count) const { - return impl_->IsSaturatedByCallCount(call_count) && - !impl_->IsSatisfiedByCallCount(call_count); - } - - // Describes self to an ostream - void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); } - - // Describes the given actual call count to an ostream. - static void DescribeActualCallCountTo(int actual_call_count, - ::std::ostream* os); - - private: - std::shared_ptr<const CardinalityInterface> impl_; -}; - -// Creates a cardinality that allows at least n calls. -GTEST_API_ Cardinality AtLeast(int n); - -// Creates a cardinality that allows at most n calls. -GTEST_API_ Cardinality AtMost(int n); - -// Creates a cardinality that allows any number of calls. -GTEST_API_ Cardinality AnyNumber(); - -// Creates a cardinality that allows between min and max calls. -GTEST_API_ Cardinality Between(int min, int max); - -// Creates a cardinality that allows exactly n calls. -GTEST_API_ Cardinality Exactly(int n); - -// Creates a cardinality from its implementation. -inline Cardinality MakeCardinality(const CardinalityInterface* c) { - return Cardinality(c); -} - -} // namespace testing - -GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h b/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h deleted file mode 100644 index f565d980c5..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h +++ /dev/null @@ -1,514 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file implements MOCK_METHOD. - -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_ // NOLINT -#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_ // NOLINT - -#include <type_traits> // IWYU pragma: keep -#include <utility> // IWYU pragma: keep - -#include "gmock/gmock-spec-builders.h" -#include "gmock/internal/gmock-internal-utils.h" -#include "gmock/internal/gmock-pp.h" - -namespace testing { -namespace internal { -template <typename T> -using identity_t = T; - -template <typename Pattern> -struct ThisRefAdjuster { - template <typename T> - using AdjustT = typename std::conditional< - std::is_const<typename std::remove_reference<Pattern>::type>::value, - typename std::conditional<std::is_lvalue_reference<Pattern>::value, - const T&, const T&&>::type, - typename std::conditional<std::is_lvalue_reference<Pattern>::value, T&, - T&&>::type>::type; - - template <typename MockType> - static AdjustT<MockType> Adjust(const MockType& mock) { - return static_cast<AdjustT<MockType>>(const_cast<MockType&>(mock)); - } -}; - -constexpr bool PrefixOf(const char* a, const char* b) { - return *a == 0 || (*a == *b && internal::PrefixOf(a + 1, b + 1)); -} - -template <int N, int M> -constexpr bool StartsWith(const char (&prefix)[N], const char (&str)[M]) { - return N <= M && internal::PrefixOf(prefix, str); -} - -template <int N, int M> -constexpr bool EndsWith(const char (&suffix)[N], const char (&str)[M]) { - return N <= M && internal::PrefixOf(suffix, str + M - N); -} - -template <int N, int M> -constexpr bool Equals(const char (&a)[N], const char (&b)[M]) { - return N == M && internal::PrefixOf(a, b); -} - -template <int N> -constexpr bool ValidateSpec(const char (&spec)[N]) { - return internal::Equals("const", spec) || - internal::Equals("override", spec) || - internal::Equals("final", spec) || - internal::Equals("noexcept", spec) || - (internal::StartsWith("noexcept(", spec) && - internal::EndsWith(")", spec)) || - internal::Equals("ref(&)", spec) || - internal::Equals("ref(&&)", spec) || - (internal::StartsWith("Calltype(", spec) && - internal::EndsWith(")", spec)); -} - -} // namespace internal - -// The style guide prohibits "using" statements in a namespace scope -// inside a header file. However, the FunctionMocker class template -// is meant to be defined in the ::testing namespace. The following -// line is just a trick for working around a bug in MSVC 8.0, which -// cannot handle it if we define FunctionMocker in ::testing. -using internal::FunctionMocker; -} // namespace testing - -#define MOCK_METHOD(...) \ - GMOCK_PP_VARIADIC_CALL(GMOCK_INTERNAL_MOCK_METHOD_ARG_, __VA_ARGS__) - -#define GMOCK_INTERNAL_MOCK_METHOD_ARG_1(...) \ - GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__) - -#define GMOCK_INTERNAL_MOCK_METHOD_ARG_2(...) 
\ - GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__) - -#define GMOCK_INTERNAL_MOCK_METHOD_ARG_3(_Ret, _MethodName, _Args) \ - GMOCK_INTERNAL_MOCK_METHOD_ARG_4(_Ret, _MethodName, _Args, ()) - -#define GMOCK_INTERNAL_MOCK_METHOD_ARG_4(_Ret, _MethodName, _Args, _Spec) \ - GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Args); \ - GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Spec); \ - GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE( \ - GMOCK_PP_NARG0 _Args, GMOCK_INTERNAL_SIGNATURE(_Ret, _Args)); \ - GMOCK_INTERNAL_ASSERT_VALID_SPEC(_Spec) \ - GMOCK_INTERNAL_MOCK_METHOD_IMPL( \ - GMOCK_PP_NARG0 _Args, _MethodName, GMOCK_INTERNAL_HAS_CONST(_Spec), \ - GMOCK_INTERNAL_HAS_OVERRIDE(_Spec), GMOCK_INTERNAL_HAS_FINAL(_Spec), \ - GMOCK_INTERNAL_GET_NOEXCEPT_SPEC(_Spec), \ - GMOCK_INTERNAL_GET_CALLTYPE_SPEC(_Spec), \ - GMOCK_INTERNAL_GET_REF_SPEC(_Spec), \ - (GMOCK_INTERNAL_SIGNATURE(_Ret, _Args))) - -#define GMOCK_INTERNAL_MOCK_METHOD_ARG_5(...) \ - GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__) - -#define GMOCK_INTERNAL_MOCK_METHOD_ARG_6(...) \ - GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__) - -#define GMOCK_INTERNAL_MOCK_METHOD_ARG_7(...) \ - GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__) - -#define GMOCK_INTERNAL_WRONG_ARITY(...) \ - static_assert( \ - false, \ - "MOCK_METHOD must be called with 3 or 4 arguments. _Ret, " \ - "_MethodName, _Args and optionally _Spec. _Args and _Spec must be " \ - "enclosed in parentheses. If _Ret is a type with unprotected commas, " \ - "it must also be enclosed in parentheses.") - -#define GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Tuple) \ - static_assert( \ - GMOCK_PP_IS_ENCLOSED_PARENS(_Tuple), \ - GMOCK_PP_STRINGIZE(_Tuple) " should be enclosed in parentheses.") - -#define GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE(_N, ...) \ - static_assert( \ - std::is_function<__VA_ARGS__>::value, \ - "Signature must be a function type, maybe return type contains " \ - "unprotected comma."); \ - static_assert( \ - ::testing::tuple_size<typename ::testing::internal::Function< \ - __VA_ARGS__>::ArgumentTuple>::value == _N, \ - "This method does not take " GMOCK_PP_STRINGIZE( \ - _N) " arguments. 
Parenthesize all types with unprotected commas.") - -#define GMOCK_INTERNAL_ASSERT_VALID_SPEC(_Spec) \ - GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT, ~, _Spec) - -#define GMOCK_INTERNAL_MOCK_METHOD_IMPL(_N, _MethodName, _Constness, \ - _Override, _Final, _NoexceptSpec, \ - _CallType, _RefSpec, _Signature) \ - typename ::testing::internal::Function<GMOCK_PP_REMOVE_PARENS( \ - _Signature)>::Result \ - GMOCK_INTERNAL_EXPAND(_CallType) \ - _MethodName(GMOCK_PP_REPEAT(GMOCK_INTERNAL_PARAMETER, _Signature, _N)) \ - GMOCK_PP_IF(_Constness, const, ) _RefSpec _NoexceptSpec \ - GMOCK_PP_IF(_Override, override, ) GMOCK_PP_IF(_Final, final, ) { \ - GMOCK_MOCKER_(_N, _Constness, _MethodName) \ - .SetOwnerAndName(this, #_MethodName); \ - return GMOCK_MOCKER_(_N, _Constness, _MethodName) \ - .Invoke(GMOCK_PP_REPEAT(GMOCK_INTERNAL_FORWARD_ARG, _Signature, _N)); \ - } \ - ::testing::MockSpec<GMOCK_PP_REMOVE_PARENS(_Signature)> gmock_##_MethodName( \ - GMOCK_PP_REPEAT(GMOCK_INTERNAL_MATCHER_PARAMETER, _Signature, _N)) \ - GMOCK_PP_IF(_Constness, const, ) _RefSpec { \ - GMOCK_MOCKER_(_N, _Constness, _MethodName).RegisterOwner(this); \ - return GMOCK_MOCKER_(_N, _Constness, _MethodName) \ - .With(GMOCK_PP_REPEAT(GMOCK_INTERNAL_MATCHER_ARGUMENT, , _N)); \ - } \ - ::testing::MockSpec<GMOCK_PP_REMOVE_PARENS(_Signature)> gmock_##_MethodName( \ - const ::testing::internal::WithoutMatchers&, \ - GMOCK_PP_IF(_Constness, const, )::testing::internal::Function< \ - GMOCK_PP_REMOVE_PARENS(_Signature)>*) const _RefSpec _NoexceptSpec { \ - return ::testing::internal::ThisRefAdjuster<GMOCK_PP_IF( \ - _Constness, const, ) int _RefSpec>::Adjust(*this) \ - .gmock_##_MethodName(GMOCK_PP_REPEAT( \ - GMOCK_INTERNAL_A_MATCHER_ARGUMENT, _Signature, _N)); \ - } \ - mutable ::testing::FunctionMocker<GMOCK_PP_REMOVE_PARENS(_Signature)> \ - GMOCK_MOCKER_(_N, _Constness, _MethodName) - -#define GMOCK_INTERNAL_EXPAND(...) __VA_ARGS__ - -// Valid modifiers. 
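-//
-// For reference, hypothetical MOCK_METHOD declarations exercising some of
-// these modifiers (the class and method names are illustrative only):
-//
-//   MOCK_METHOD(int, Describe, (int), (const, override));
-//   MOCK_METHOD(void, Reset, (), (noexcept));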
-#define GMOCK_INTERNAL_HAS_CONST(_Tuple) \ - GMOCK_PP_HAS_COMMA(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_CONST, ~, _Tuple)) - -#define GMOCK_INTERNAL_HAS_OVERRIDE(_Tuple) \ - GMOCK_PP_HAS_COMMA( \ - GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_OVERRIDE, ~, _Tuple)) - -#define GMOCK_INTERNAL_HAS_FINAL(_Tuple) \ - GMOCK_PP_HAS_COMMA(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_FINAL, ~, _Tuple)) - -#define GMOCK_INTERNAL_GET_NOEXCEPT_SPEC(_Tuple) \ - GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_NOEXCEPT_SPEC_IF_NOEXCEPT, ~, _Tuple) - -#define GMOCK_INTERNAL_NOEXCEPT_SPEC_IF_NOEXCEPT(_i, _, _elem) \ - GMOCK_PP_IF( \ - GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem)), \ - _elem, ) - -#define GMOCK_INTERNAL_GET_CALLTYPE_SPEC(_Tuple) \ - GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_CALLTYPE_SPEC_IF_CALLTYPE, ~, _Tuple) - -#define GMOCK_INTERNAL_CALLTYPE_SPEC_IF_CALLTYPE(_i, _, _elem) \ - GMOCK_PP_IF( \ - GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CALLTYPE(_i, _, _elem)), \ - GMOCK_PP_CAT(GMOCK_INTERNAL_UNPACK_, _elem), ) - -#define GMOCK_INTERNAL_GET_REF_SPEC(_Tuple) \ - GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_REF_SPEC_IF_REF, ~, _Tuple) - -#define GMOCK_INTERNAL_REF_SPEC_IF_REF(_i, _, _elem) \ - GMOCK_PP_IF(GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_REF(_i, _, _elem)), \ - GMOCK_PP_CAT(GMOCK_INTERNAL_UNPACK_, _elem), ) - -#ifdef GMOCK_INTERNAL_STRICT_SPEC_ASSERT -#define GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT(_i, _, _elem) \ - static_assert( \ - ::testing::internal::ValidateSpec(GMOCK_PP_STRINGIZE(_elem)), \ - "Token \'" GMOCK_PP_STRINGIZE( \ - _elem) "\' cannot be recognized as a valid specification " \ - "modifier. Is a ',' missing?"); -#else -#define GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT(_i, _, _elem) \ - static_assert( \ - (GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CONST(_i, _, _elem)) + \ - GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_OVERRIDE(_i, _, _elem)) + \ - GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_FINAL(_i, _, _elem)) + \ - GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem)) + \ - GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_REF(_i, _, _elem)) + \ - GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CALLTYPE(_i, _, _elem))) == 1, \ - GMOCK_PP_STRINGIZE( \ - _elem) " cannot be recognized as a valid specification modifier."); -#endif // GMOCK_INTERNAL_STRICT_SPEC_ASSERT - -// Modifiers implementation. -#define GMOCK_INTERNAL_DETECT_CONST(_i, _, _elem) \ - GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_CONST_I_, _elem) - -#define GMOCK_INTERNAL_DETECT_CONST_I_const , - -#define GMOCK_INTERNAL_DETECT_OVERRIDE(_i, _, _elem) \ - GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_OVERRIDE_I_, _elem) - -#define GMOCK_INTERNAL_DETECT_OVERRIDE_I_override , - -#define GMOCK_INTERNAL_DETECT_FINAL(_i, _, _elem) \ - GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_FINAL_I_, _elem) - -#define GMOCK_INTERNAL_DETECT_FINAL_I_final , - -#define GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem) \ - GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_NOEXCEPT_I_, _elem) - -#define GMOCK_INTERNAL_DETECT_NOEXCEPT_I_noexcept , - -#define GMOCK_INTERNAL_DETECT_REF(_i, _, _elem) \ - GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_REF_I_, _elem) - -#define GMOCK_INTERNAL_DETECT_REF_I_ref , - -#define GMOCK_INTERNAL_UNPACK_ref(x) x - -#define GMOCK_INTERNAL_DETECT_CALLTYPE(_i, _, _elem) \ - GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_CALLTYPE_I_, _elem) - -#define GMOCK_INTERNAL_DETECT_CALLTYPE_I_Calltype , - -#define GMOCK_INTERNAL_UNPACK_Calltype(...) __VA_ARGS__ - -// Note: The use of `identity_t` here allows _Ret to represent return types that -// would normally need to be specified in a different way. 
For example, a method -// returning a function pointer must be written as -// -// fn_ptr_return_t (*method(method_args_t...))(fn_ptr_args_t...) -// -// But we only support placing the return type at the beginning. To handle this, -// we wrap all calls in identity_t, so that a declaration will be expanded to -// -// identity_t<fn_ptr_return_t (*)(fn_ptr_args_t...)> method(method_args_t...) -// -// This allows us to work around the syntactic oddities of function/method -// types. -#define GMOCK_INTERNAL_SIGNATURE(_Ret, _Args) \ - ::testing::internal::identity_t<GMOCK_PP_IF(GMOCK_PP_IS_BEGIN_PARENS(_Ret), \ - GMOCK_PP_REMOVE_PARENS, \ - GMOCK_PP_IDENTITY)(_Ret)>( \ - GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_GET_TYPE, _, _Args)) - -#define GMOCK_INTERNAL_GET_TYPE(_i, _, _elem) \ - GMOCK_PP_COMMA_IF(_i) \ - GMOCK_PP_IF(GMOCK_PP_IS_BEGIN_PARENS(_elem), GMOCK_PP_REMOVE_PARENS, \ - GMOCK_PP_IDENTITY) \ - (_elem) - -#define GMOCK_INTERNAL_PARAMETER(_i, _Signature, _) \ - GMOCK_PP_COMMA_IF(_i) \ - GMOCK_INTERNAL_ARG_O(_i, GMOCK_PP_REMOVE_PARENS(_Signature)) \ - gmock_a##_i - -#define GMOCK_INTERNAL_FORWARD_ARG(_i, _Signature, _) \ - GMOCK_PP_COMMA_IF(_i) \ - ::std::forward<GMOCK_INTERNAL_ARG_O( \ - _i, GMOCK_PP_REMOVE_PARENS(_Signature))>(gmock_a##_i) - -#define GMOCK_INTERNAL_MATCHER_PARAMETER(_i, _Signature, _) \ - GMOCK_PP_COMMA_IF(_i) \ - GMOCK_INTERNAL_MATCHER_O(_i, GMOCK_PP_REMOVE_PARENS(_Signature)) \ - gmock_a##_i - -#define GMOCK_INTERNAL_MATCHER_ARGUMENT(_i, _1, _2) \ - GMOCK_PP_COMMA_IF(_i) \ - gmock_a##_i - -#define GMOCK_INTERNAL_A_MATCHER_ARGUMENT(_i, _Signature, _) \ - GMOCK_PP_COMMA_IF(_i) \ - ::testing::A<GMOCK_INTERNAL_ARG_O(_i, GMOCK_PP_REMOVE_PARENS(_Signature))>() - -#define GMOCK_INTERNAL_ARG_O(_i, ...) \ - typename ::testing::internal::Function<__VA_ARGS__>::template Arg<_i>::type - -#define GMOCK_INTERNAL_MATCHER_O(_i, ...) \ - const ::testing::Matcher<typename ::testing::internal::Function< \ - __VA_ARGS__>::template Arg<_i>::type>& - -#define MOCK_METHOD0(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 0, __VA_ARGS__) -#define MOCK_METHOD1(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 1, __VA_ARGS__) -#define MOCK_METHOD2(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 2, __VA_ARGS__) -#define MOCK_METHOD3(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 3, __VA_ARGS__) -#define MOCK_METHOD4(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 4, __VA_ARGS__) -#define MOCK_METHOD5(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 5, __VA_ARGS__) -#define MOCK_METHOD6(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 6, __VA_ARGS__) -#define MOCK_METHOD7(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 7, __VA_ARGS__) -#define MOCK_METHOD8(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 8, __VA_ARGS__) -#define MOCK_METHOD9(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 9, __VA_ARGS__) -#define MOCK_METHOD10(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, , m, 10, __VA_ARGS__) - -#define MOCK_CONST_METHOD0(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 0, __VA_ARGS__) -#define MOCK_CONST_METHOD1(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 1, __VA_ARGS__) -#define MOCK_CONST_METHOD2(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 2, __VA_ARGS__) -#define MOCK_CONST_METHOD3(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 3, __VA_ARGS__) -#define MOCK_CONST_METHOD4(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 4, __VA_ARGS__) -#define MOCK_CONST_METHOD5(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 5, __VA_ARGS__) -#define MOCK_CONST_METHOD6(m, ...) 
\ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 6, __VA_ARGS__) -#define MOCK_CONST_METHOD7(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 7, __VA_ARGS__) -#define MOCK_CONST_METHOD8(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 8, __VA_ARGS__) -#define MOCK_CONST_METHOD9(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 9, __VA_ARGS__) -#define MOCK_CONST_METHOD10(m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, , m, 10, __VA_ARGS__) - -#define MOCK_METHOD0_T(m, ...) MOCK_METHOD0(m, __VA_ARGS__) -#define MOCK_METHOD1_T(m, ...) MOCK_METHOD1(m, __VA_ARGS__) -#define MOCK_METHOD2_T(m, ...) MOCK_METHOD2(m, __VA_ARGS__) -#define MOCK_METHOD3_T(m, ...) MOCK_METHOD3(m, __VA_ARGS__) -#define MOCK_METHOD4_T(m, ...) MOCK_METHOD4(m, __VA_ARGS__) -#define MOCK_METHOD5_T(m, ...) MOCK_METHOD5(m, __VA_ARGS__) -#define MOCK_METHOD6_T(m, ...) MOCK_METHOD6(m, __VA_ARGS__) -#define MOCK_METHOD7_T(m, ...) MOCK_METHOD7(m, __VA_ARGS__) -#define MOCK_METHOD8_T(m, ...) MOCK_METHOD8(m, __VA_ARGS__) -#define MOCK_METHOD9_T(m, ...) MOCK_METHOD9(m, __VA_ARGS__) -#define MOCK_METHOD10_T(m, ...) MOCK_METHOD10(m, __VA_ARGS__) - -#define MOCK_CONST_METHOD0_T(m, ...) MOCK_CONST_METHOD0(m, __VA_ARGS__) -#define MOCK_CONST_METHOD1_T(m, ...) MOCK_CONST_METHOD1(m, __VA_ARGS__) -#define MOCK_CONST_METHOD2_T(m, ...) MOCK_CONST_METHOD2(m, __VA_ARGS__) -#define MOCK_CONST_METHOD3_T(m, ...) MOCK_CONST_METHOD3(m, __VA_ARGS__) -#define MOCK_CONST_METHOD4_T(m, ...) MOCK_CONST_METHOD4(m, __VA_ARGS__) -#define MOCK_CONST_METHOD5_T(m, ...) MOCK_CONST_METHOD5(m, __VA_ARGS__) -#define MOCK_CONST_METHOD6_T(m, ...) MOCK_CONST_METHOD6(m, __VA_ARGS__) -#define MOCK_CONST_METHOD7_T(m, ...) MOCK_CONST_METHOD7(m, __VA_ARGS__) -#define MOCK_CONST_METHOD8_T(m, ...) MOCK_CONST_METHOD8(m, __VA_ARGS__) -#define MOCK_CONST_METHOD9_T(m, ...) MOCK_CONST_METHOD9(m, __VA_ARGS__) -#define MOCK_CONST_METHOD10_T(m, ...) MOCK_CONST_METHOD10(m, __VA_ARGS__) - -#define MOCK_METHOD0_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 0, __VA_ARGS__) -#define MOCK_METHOD1_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 1, __VA_ARGS__) -#define MOCK_METHOD2_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 2, __VA_ARGS__) -#define MOCK_METHOD3_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 3, __VA_ARGS__) -#define MOCK_METHOD4_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 4, __VA_ARGS__) -#define MOCK_METHOD5_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 5, __VA_ARGS__) -#define MOCK_METHOD6_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 6, __VA_ARGS__) -#define MOCK_METHOD7_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 7, __VA_ARGS__) -#define MOCK_METHOD8_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 8, __VA_ARGS__) -#define MOCK_METHOD9_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 9, __VA_ARGS__) -#define MOCK_METHOD10_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 10, __VA_ARGS__) - -#define MOCK_CONST_METHOD0_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 0, __VA_ARGS__) -#define MOCK_CONST_METHOD1_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 1, __VA_ARGS__) -#define MOCK_CONST_METHOD2_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 2, __VA_ARGS__) -#define MOCK_CONST_METHOD3_WITH_CALLTYPE(ct, m, ...) 
\ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 3, __VA_ARGS__) -#define MOCK_CONST_METHOD4_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 4, __VA_ARGS__) -#define MOCK_CONST_METHOD5_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 5, __VA_ARGS__) -#define MOCK_CONST_METHOD6_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 6, __VA_ARGS__) -#define MOCK_CONST_METHOD7_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 7, __VA_ARGS__) -#define MOCK_CONST_METHOD8_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 8, __VA_ARGS__) -#define MOCK_CONST_METHOD9_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 9, __VA_ARGS__) -#define MOCK_CONST_METHOD10_WITH_CALLTYPE(ct, m, ...) \ - GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 10, __VA_ARGS__) - -#define MOCK_METHOD0_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD0_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD1_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD1_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD2_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD2_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD3_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD3_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD4_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD4_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD5_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD5_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD6_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD6_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD7_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD7_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD8_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD8_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD9_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD9_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_METHOD10_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_METHOD10_WITH_CALLTYPE(ct, m, __VA_ARGS__) - -#define MOCK_CONST_METHOD0_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD0_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD1_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD1_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD2_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD2_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD3_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD3_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD4_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD4_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD5_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD5_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD6_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD6_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD7_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD7_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD8_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD8_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD9_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD9_WITH_CALLTYPE(ct, m, __VA_ARGS__) -#define MOCK_CONST_METHOD10_T_WITH_CALLTYPE(ct, m, ...) \ - MOCK_CONST_METHOD10_WITH_CALLTYPE(ct, m, __VA_ARGS__) - -#define GMOCK_INTERNAL_MOCK_METHODN(constness, ct, Method, args_num, ...) 
\ - GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE( \ - args_num, ::testing::internal::identity_t<__VA_ARGS__>); \ - GMOCK_INTERNAL_MOCK_METHOD_IMPL( \ - args_num, Method, GMOCK_PP_NARG0(constness), 0, 0, , ct, , \ - (::testing::internal::identity_t<__VA_ARGS__>)) - -#define GMOCK_MOCKER_(arity, constness, Method) \ - GTEST_CONCAT_TOKEN_(gmock##constness##arity##_##Method##_, __LINE__) - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h b/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h deleted file mode 100644 index 6282901145..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h +++ /dev/null @@ -1,5610 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// The MATCHER* family of macros can be used in a namespace scope to -// define custom matchers easily. -// -// Basic Usage -// =========== -// -// The syntax -// -// MATCHER(name, description_string) { statements; } -// -// defines a matcher with the given name that executes the statements, -// which must return a bool to indicate if the match succeeds. Inside -// the statements, you can refer to the value being matched by 'arg', -// and refer to its type by 'arg_type'. -// -// The description string documents what the matcher does, and is used -// to generate the failure message when the match fails. Since a -// MATCHER() is usually defined in a header file shared by multiple -// C++ source files, we require the description to be a C-string -// literal to avoid possible side effects. It can be empty, in which -// case we'll use the sequence of words in the matcher name as the -// description. -// -// For example: -// -// MATCHER(IsEven, "") { return (arg % 2) == 0; } -// -// allows you to write -// -// // Expects mock_foo.Bar(n) to be called where n is even. 
-// EXPECT_CALL(mock_foo, Bar(IsEven())); -// -// or, -// -// // Verifies that the value of some_expression is even. -// EXPECT_THAT(some_expression, IsEven()); -// -// If the above assertion fails, it will print something like: -// -// Value of: some_expression -// Expected: is even -// Actual: 7 -// -// where the description "is even" is automatically calculated from the -// matcher name IsEven. -// -// Argument Type -// ============= -// -// Note that the type of the value being matched (arg_type) is -// determined by the context in which you use the matcher and is -// supplied to you by the compiler, so you don't need to worry about -// declaring it (nor can you). This allows the matcher to be -// polymorphic. For example, IsEven() can be used to match any type -// where the value of "(arg % 2) == 0" can be implicitly converted to -// a bool. In the "Bar(IsEven())" example above, if method Bar() -// takes an int, 'arg_type' will be int; if it takes an unsigned long, -// 'arg_type' will be unsigned long; and so on. -// -// Parameterizing Matchers -// ======================= -// -// Sometimes you'll want to parameterize the matcher. For that you -// can use another macro: -// -// MATCHER_P(name, param_name, description_string) { statements; } -// -// For example: -// -// MATCHER_P(HasAbsoluteValue, value, "") { return abs(arg) == value; } -// -// will allow you to write: -// -// EXPECT_THAT(Blah("a"), HasAbsoluteValue(n)); -// -// which may lead to this message (assuming n is 10): -// -// Value of: Blah("a") -// Expected: has absolute value 10 -// Actual: -9 -// -// Note that both the matcher description and its parameter are -// printed, making the message human-friendly. -// -// In the matcher definition body, you can write 'foo_type' to -// reference the type of a parameter named 'foo'. For example, in the -// body of MATCHER_P(HasAbsoluteValue, value) above, you can write -// 'value_type' to refer to the type of 'value'. -// -// We also provide MATCHER_P2, MATCHER_P3, ..., up to MATCHER_P$n to -// support multi-parameter matchers. -// -// Describing Parameterized Matchers -// ================================= -// -// The last argument to MATCHER*() is a string-typed expression. The -// expression can reference all of the matcher's parameters and a -// special bool-typed variable named 'negation'. When 'negation' is -// false, the expression should evaluate to the matcher's description; -// otherwise it should evaluate to the description of the negation of -// the matcher. For example, -// -// using testing::PrintToString; -// -// MATCHER_P2(InClosedRange, low, hi, -// std::string(negation ? "is not" : "is") + " in range [" + -// PrintToString(low) + ", " + PrintToString(hi) + "]") { -// return low <= arg && arg <= hi; -// } -// ... -// EXPECT_THAT(3, InClosedRange(4, 6)); -// EXPECT_THAT(3, Not(InClosedRange(2, 4))); -// -// would generate two failures that contain the text: -// -// Expected: is in range [4, 6] -// ... -// Expected: is not in range [2, 4] -// -// If you specify "" as the description, the failure message will -// contain the sequence of words in the matcher name followed by the -// parameter values printed as a tuple. For example, -// -// MATCHER_P2(InClosedRange, low, hi, "") { ... } -// ... -// EXPECT_THAT(3, InClosedRange(4, 6)); -// EXPECT_THAT(3, Not(InClosedRange(2, 4))); -// -// would generate two failures that contain the text: -// -// Expected: in closed range (4, 6) -// ... 
-// Expected: not (in closed range (2, 4))
-//
-// Types of Matcher Parameters
-// ===========================
-//
-// For the purpose of typing, you can view
-//
-//   MATCHER_Pk(Foo, p1, ..., pk, description_string) { ... }
-//
-// as shorthand for
-//
-//   template <typename p1_type, ..., typename pk_type>
-//   FooMatcherPk<p1_type, ..., pk_type>
-//   Foo(p1_type p1, ..., pk_type pk) { ... }
-//
-// When you write Foo(v1, ..., vk), the compiler infers the types of
-// the parameters v1, ..., and vk for you.  If you are not happy with
-// the result of the type inference, you can specify the types by
-// explicitly instantiating the template, as in Foo<long, bool>(5,
-// false).  As said earlier, you don't get to (or need to) specify
-// 'arg_type' as that's determined by the context in which the matcher
-// is used.  You can assign the result of expression Foo(p1, ..., pk)
-// to a variable of type FooMatcherPk<p1_type, ..., pk_type>.  This
-// can be useful when composing matchers.
-//
-// While you can instantiate a matcher template with reference types,
-// passing the parameters by pointer usually makes your code more
-// readable.  If, however, you still want to pass a parameter by
-// reference, be aware that in the failure message generated by the
-// matcher you will see the value of the referenced object but not its
-// address.
-//
-// Explaining Match Results
-// ========================
-//
-// Sometimes the matcher description alone isn't enough to explain why
-// the match has failed or succeeded.  For example, when expecting a
-// long string, it can be very helpful to also print the diff between
-// the expected string and the actual one.  To achieve that, you can
-// optionally stream additional information to a special variable
-// named result_listener, whose type is a pointer to class
-// MatchResultListener:
-//
-//   MATCHER_P(EqualsLongString, str, "") {
-//     if (arg == str) return true;
-//
-//     *result_listener << "the difference: "
-//                      << DiffStrings(str, arg);
-//     return false;
-//   }
-//
-// Overloading Matchers
-// ====================
-//
-// You can overload matchers with different numbers of parameters:
-//
-//   MATCHER_P(Blah, a, description_string1) { ... }
-//   MATCHER_P2(Blah, a, b, description_string2) { ... }
-//
-// Caveats
-// =======
-//
-// When defining a new matcher, you should also consider implementing
-// MatcherInterface or using MakePolymorphicMatcher().  These
-// approaches require more work than the MATCHER* macros, but also
-// give you more control over the types of the value being matched and
-// the matcher parameters, which may lead to better compiler error
-// messages when the matcher is used incorrectly.  They also allow
-// overloading matchers based on parameter types (as opposed to just
-// based on the number of parameters).
-//
-// MATCHER*() can only be used in a namespace scope as templates cannot be
-// declared inside of a local class.
-//
-// More Information
-// ================
-//
-// To learn more about using these macros, please search for 'MATCHER'
-// on
-// https://github.com/google/googletest/blob/master/docs/gmock_cook_book.md
-//
-// This file also implements some commonly used argument matchers.  More
-// matchers can be defined by the user implementing the
-// MatcherInterface<T> interface if necessary.
-//
-// See googletest/include/gtest/gtest-matchers.h for the definition of class
-// Matcher, class MatcherInterface, and others.
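-//
-// A minimal usage sketch (hypothetical names; assumes "gmock/gmock.h" and
-// "gtest/gtest.h" are included): a parameterized matcher that streams extra
-// detail to result_listener, and a test exercising it.
-//
-//   MATCHER_P(IsDivisibleBy, n, "") {
-//     if ((arg % n) == 0) return true;
-//     *result_listener << "the remainder is " << (arg % n);
-//     return false;
-//   }
-//
-//   TEST(MatcherSketch, Divisibility) {
-//     EXPECT_THAT(9, IsDivisibleBy(3));                   // Matches.
-//     EXPECT_THAT(10, ::testing::Not(IsDivisibleBy(3)));  // Matches.
-//   }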
- -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_ - -#include <algorithm> -#include <cmath> -#include <initializer_list> -#include <iterator> -#include <limits> -#include <memory> -#include <ostream> // NOLINT -#include <sstream> -#include <string> -#include <type_traits> -#include <utility> -#include <vector> - -#include "gmock/internal/gmock-internal-utils.h" -#include "gmock/internal/gmock-port.h" -#include "gmock/internal/gmock-pp.h" -#include "gtest/gtest.h" - -// MSVC warning C5046 is new as of VS2017 version 15.8. -#if defined(_MSC_VER) && _MSC_VER >= 1915 -#define GMOCK_MAYBE_5046_ 5046 -#else -#define GMOCK_MAYBE_5046_ -#endif - -GTEST_DISABLE_MSC_WARNINGS_PUSH_( - 4251 GMOCK_MAYBE_5046_ /* class A needs to have dll-interface to be used by - clients of class B */ - /* Symbol involving type with internal linkage not defined */) - -namespace testing { - -// To implement a matcher Foo for type T, define: -// 1. a class FooMatcherImpl that implements the -// MatcherInterface<T> interface, and -// 2. a factory function that creates a Matcher<T> object from a -// FooMatcherImpl*. -// -// The two-level delegation design makes it possible to allow a user -// to write "v" instead of "Eq(v)" where a Matcher is expected, which -// is impossible if we pass matchers by pointers. It also eases -// ownership management as Matcher objects can now be copied like -// plain values. - -// A match result listener that stores the explanation in a string. -class StringMatchResultListener : public MatchResultListener { - public: - StringMatchResultListener() : MatchResultListener(&ss_) {} - - // Returns the explanation accumulated so far. - std::string str() const { return ss_.str(); } - - // Clears the explanation accumulated so far. - void Clear() { ss_.str(""); } - - private: - ::std::stringstream ss_; - - StringMatchResultListener(const StringMatchResultListener&) = delete; - StringMatchResultListener& operator=(const StringMatchResultListener&) = - delete; -}; - -// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION -// and MUST NOT BE USED IN USER CODE!!! -namespace internal { - -// The MatcherCastImpl class template is a helper for implementing -// MatcherCast(). We need this helper in order to partially -// specialize the implementation of MatcherCast() (C++ allows -// class/struct templates to be partially specialized, but not -// function templates.). - -// This general version is used when MatcherCast()'s argument is a -// polymorphic matcher (i.e. something that can be converted to a -// Matcher but is not one yet; for example, Eq(value)) or a value (for -// example, "hello"). -template <typename T, typename M> -class MatcherCastImpl { - public: - static Matcher<T> Cast(const M& polymorphic_matcher_or_value) { - // M can be a polymorphic matcher, in which case we want to use - // its conversion operator to create Matcher<T>. Or it can be a value - // that should be passed to the Matcher<T>'s constructor. - // - // We can't call Matcher<T>(polymorphic_matcher_or_value) when M is a - // polymorphic matcher because it'll be ambiguous if T has an implicit - // constructor from M (this usually happens when T has an implicit - // constructor from any type). 
- // - // It won't work to unconditionally implicit_cast - // polymorphic_matcher_or_value to Matcher<T> because it won't trigger - // a user-defined conversion from M to T if one exists (assuming M is - // a value). - return CastImpl(polymorphic_matcher_or_value, - std::is_convertible<M, Matcher<T>>{}, - std::is_convertible<M, T>{}); - } - - private: - template <bool Ignore> - static Matcher<T> CastImpl(const M& polymorphic_matcher_or_value, - std::true_type /* convertible_to_matcher */, - std::integral_constant<bool, Ignore>) { - // M is implicitly convertible to Matcher<T>, which means that either - // M is a polymorphic matcher or Matcher<T> has an implicit constructor - // from M. In both cases using the implicit conversion will produce a - // matcher. - // - // Even if T has an implicit constructor from M, it won't be called because - // creating Matcher<T> would require a chain of two user-defined conversions - // (first to create T from M and then to create Matcher<T> from T). - return polymorphic_matcher_or_value; - } - - // M can't be implicitly converted to Matcher<T>, so M isn't a polymorphic - // matcher. It's a value of a type implicitly convertible to T. Use direct - // initialization to create a matcher. - static Matcher<T> CastImpl(const M& value, - std::false_type /* convertible_to_matcher */, - std::true_type /* convertible_to_T */) { - return Matcher<T>(ImplicitCast_<T>(value)); - } - - // M can't be implicitly converted to either Matcher<T> or T. Attempt to use - // polymorphic matcher Eq(value) in this case. - // - // Note that we first attempt to perform an implicit cast on the value and - // only fall back to the polymorphic Eq() matcher afterwards because the - // latter calls bool operator==(const Lhs& lhs, const Rhs& rhs) in the end - // which might be undefined even when Rhs is implicitly convertible to Lhs - // (e.g. std::pair<const int, int> vs. std::pair<int, int>). - // - // We don't define this method inline as we need the declaration of Eq(). - static Matcher<T> CastImpl(const M& value, - std::false_type /* convertible_to_matcher */, - std::false_type /* convertible_to_T */); -}; - -// This more specialized version is used when MatcherCast()'s argument -// is already a Matcher. This only compiles when type T can be -// statically converted to type U. -template <typename T, typename U> -class MatcherCastImpl<T, Matcher<U>> { - public: - static Matcher<T> Cast(const Matcher<U>& source_matcher) { - return Matcher<T>(new Impl(source_matcher)); - } - - private: - class Impl : public MatcherInterface<T> { - public: - explicit Impl(const Matcher<U>& source_matcher) - : source_matcher_(source_matcher) {} - - // We delegate the matching logic to the source matcher. - bool MatchAndExplain(T x, MatchResultListener* listener) const override { - using FromType = typename std::remove_cv<typename std::remove_pointer< - typename std::remove_reference<T>::type>::type>::type; - using ToType = typename std::remove_cv<typename std::remove_pointer< - typename std::remove_reference<U>::type>::type>::type; - // Do not allow implicitly converting base*/& to derived*/&. - static_assert( - // Do not trigger if only one of them is a pointer. That implies a - // regular conversion and not a down_cast. 
- (std::is_pointer<typename std::remove_reference<T>::type>::value != - std::is_pointer<typename std::remove_reference<U>::type>::value) || - std::is_same<FromType, ToType>::value || - !std::is_base_of<FromType, ToType>::value, - "Can't implicitly convert from <base> to <derived>"); - - // Do the cast to `U` explicitly if necessary. - // Otherwise, let implicit conversions do the trick. - using CastType = - typename std::conditional<std::is_convertible<T&, const U&>::value, - T&, U>::type; - - return source_matcher_.MatchAndExplain(static_cast<CastType>(x), - listener); - } - - void DescribeTo(::std::ostream* os) const override { - source_matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - source_matcher_.DescribeNegationTo(os); - } - - private: - const Matcher<U> source_matcher_; - }; -}; - -// This even more specialized version is used for efficiently casting -// a matcher to its own type. -template <typename T> -class MatcherCastImpl<T, Matcher<T>> { - public: - static Matcher<T> Cast(const Matcher<T>& matcher) { return matcher; } -}; - -// Template specialization for parameterless Matcher. -template <typename Derived> -class MatcherBaseImpl { - public: - MatcherBaseImpl() = default; - - template <typename T> - operator ::testing::Matcher<T>() const { // NOLINT(runtime/explicit) - return ::testing::Matcher<T>(new - typename Derived::template gmock_Impl<T>()); - } -}; - -// Template specialization for Matcher with parameters. -template <template <typename...> class Derived, typename... Ts> -class MatcherBaseImpl<Derived<Ts...>> { - public: - // Mark the constructor explicit for single argument T to avoid implicit - // conversions. - template <typename E = std::enable_if<sizeof...(Ts) == 1>, - typename E::type* = nullptr> - explicit MatcherBaseImpl(Ts... params) - : params_(std::forward<Ts>(params)...) {} - template <typename E = std::enable_if<sizeof...(Ts) != 1>, - typename = typename E::type> - MatcherBaseImpl(Ts... params) // NOLINT - : params_(std::forward<Ts>(params)...) {} - - template <typename F> - operator ::testing::Matcher<F>() const { // NOLINT(runtime/explicit) - return Apply<F>(MakeIndexSequence<sizeof...(Ts)>{}); - } - - private: - template <typename F, std::size_t... tuple_ids> - ::testing::Matcher<F> Apply(IndexSequence<tuple_ids...>) const { - return ::testing::Matcher<F>( - new typename Derived<Ts...>::template gmock_Impl<F>( - std::get<tuple_ids>(params_)...)); - } - - const std::tuple<Ts...> params_; -}; - -} // namespace internal - -// In order to be safe and clear, casting between different matcher -// types is done explicitly via MatcherCast<T>(m), which takes a -// matcher m and returns a Matcher<T>. It compiles only when T can be -// statically converted to the argument type of m. -template <typename T, typename M> -inline Matcher<T> MatcherCast(const M& matcher) { - return internal::MatcherCastImpl<T, M>::Cast(matcher); -} - -// This overload handles polymorphic matchers and values only since -// monomorphic matchers are handled by the next one. -template <typename T, typename M> -inline Matcher<T> SafeMatcherCast(const M& polymorphic_matcher_or_value) { - return MatcherCast<T>(polymorphic_matcher_or_value); -} - -// This overload handles monomorphic matchers. -// -// In general, if type T can be implicitly converted to type U, we can -// safely convert a Matcher<U> to a Matcher<T> (i.e. 
Matcher is -// contravariant): just keep a copy of the original Matcher<U>, convert the -// argument from type T to U, and then pass it to the underlying Matcher<U>. -// The only exception is when U is a reference and T is not, as the -// underlying Matcher<U> may be interested in the argument's address, which -// is not preserved in the conversion from T to U. -template <typename T, typename U> -inline Matcher<T> SafeMatcherCast(const Matcher<U>& matcher) { - // Enforce that T can be implicitly converted to U. - static_assert(std::is_convertible<const T&, const U&>::value, - "T must be implicitly convertible to U"); - // Enforce that we are not converting a non-reference type T to a reference - // type U. - static_assert(std::is_reference<T>::value || !std::is_reference<U>::value, - "cannot convert non reference arg to reference"); - // In case both T and U are arithmetic types, enforce that the - // conversion is not lossy. - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(T) RawT; - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(U) RawU; - constexpr bool kTIsOther = GMOCK_KIND_OF_(RawT) == internal::kOther; - constexpr bool kUIsOther = GMOCK_KIND_OF_(RawU) == internal::kOther; - static_assert( - kTIsOther || kUIsOther || - (internal::LosslessArithmeticConvertible<RawT, RawU>::value), - "conversion of arithmetic types must be lossless"); - return MatcherCast<T>(matcher); -} - -// A<T>() returns a matcher that matches any value of type T. -template <typename T> -Matcher<T> A(); - -// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION -// and MUST NOT BE USED IN USER CODE!!! -namespace internal { - -// If the explanation is not empty, prints it to the ostream. -inline void PrintIfNotEmpty(const std::string& explanation, - ::std::ostream* os) { - if (explanation != "" && os != nullptr) { - *os << ", " << explanation; - } -} - -// Returns true if the given type name is easy to read by a human. -// This is used to decide whether printing the type of a value might -// be helpful. -inline bool IsReadableTypeName(const std::string& type_name) { - // We consider a type name readable if it's short or doesn't contain - // a template or function type. - return (type_name.length() <= 20 || - type_name.find_first_of("<(") == std::string::npos); -} - -// Matches the value against the given matcher, prints the value and explains -// the match result to the listener. Returns the match result. -// 'listener' must not be NULL. -// Value cannot be passed by const reference, because some matchers take a -// non-const argument. -template <typename Value, typename T> -bool MatchPrintAndExplain(Value& value, const Matcher<T>& matcher, - MatchResultListener* listener) { - if (!listener->IsInterested()) { - // If the listener is not interested, we do not need to construct the - // inner explanation. - return matcher.Matches(value); - } - - StringMatchResultListener inner_listener; - const bool match = matcher.MatchAndExplain(value, &inner_listener); - - UniversalPrint(value, listener->stream()); -#if GTEST_HAS_RTTI - const std::string& type_name = GetTypeName<Value>(); - if (IsReadableTypeName(type_name)) - *listener->stream() << " (of type " << type_name << ")"; -#endif - PrintIfNotEmpty(inner_listener.str(), listener->stream()); - - return match; -} - -// An internal helper class for doing compile-time loop on a tuple's -// fields. 
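-// For instance, a hypothetical TuplePrefix<2>::Matches(matchers, values)
-// evaluates std::get<0>(matchers).Matches(std::get<0>(values)) and
-// std::get<1>(matchers).Matches(std::get<1>(values)), recursing down to the
-// trivially-true TuplePrefix<0> base case below.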
-template <size_t N>
-class TuplePrefix {
- public:
-  // TuplePrefix<N>::Matches(matcher_tuple, value_tuple) returns true
-  // if and only if the first N fields of matcher_tuple match
-  // the first N fields of value_tuple, respectively.
-  template <typename MatcherTuple, typename ValueTuple>
-  static bool Matches(const MatcherTuple& matcher_tuple,
-                      const ValueTuple& value_tuple) {
-    return TuplePrefix<N - 1>::Matches(matcher_tuple, value_tuple) &&
-           std::get<N - 1>(matcher_tuple).Matches(std::get<N - 1>(value_tuple));
-  }
-
-  // TuplePrefix<N>::ExplainMatchFailuresTo(matchers, values, os)
-  // describes failures in matching the first N fields of matchers
-  // against the first N fields of values.  If there is no failure,
-  // nothing will be streamed to os.
-  template <typename MatcherTuple, typename ValueTuple>
-  static void ExplainMatchFailuresTo(const MatcherTuple& matchers,
-                                     const ValueTuple& values,
-                                     ::std::ostream* os) {
-    // First, describes failures in the first N - 1 fields.
-    TuplePrefix<N - 1>::ExplainMatchFailuresTo(matchers, values, os);
-
-    // Then describes the failure (if any) in the (N - 1)-th (0-based)
-    // field.
-    typename std::tuple_element<N - 1, MatcherTuple>::type matcher =
-        std::get<N - 1>(matchers);
-    typedef typename std::tuple_element<N - 1, ValueTuple>::type Value;
-    const Value& value = std::get<N - 1>(values);
-    StringMatchResultListener listener;
-    if (!matcher.MatchAndExplain(value, &listener)) {
-      *os << "  Expected arg #" << N - 1 << ": ";
-      std::get<N - 1>(matchers).DescribeTo(os);
-      *os << "\n           Actual: ";
-      // We remove the reference in type Value to prevent the
-      // universal printer from printing the address of value, which
-      // isn't interesting to the user most of the time.  The
-      // matcher's MatchAndExplain() method handles the case when
-      // the address is interesting.
-      internal::UniversalPrint(value, os);
-      PrintIfNotEmpty(listener.str(), os);
-      *os << "\n";
-    }
-  }
-};
-
-// The base case.
-template <>
-class TuplePrefix<0> {
- public:
-  template <typename MatcherTuple, typename ValueTuple>
-  static bool Matches(const MatcherTuple& /* matcher_tuple */,
-                      const ValueTuple& /* value_tuple */) {
-    return true;
-  }
-
-  template <typename MatcherTuple, typename ValueTuple>
-  static void ExplainMatchFailuresTo(const MatcherTuple& /* matchers */,
-                                     const ValueTuple& /* values */,
-                                     ::std::ostream* /* os */) {}
-};
-
-// TupleMatches(matcher_tuple, value_tuple) returns true if and only if
-// all matchers in matcher_tuple match the corresponding fields in
-// value_tuple.  It is a compiler error if matcher_tuple and
-// value_tuple have different numbers of fields or incompatible field
-// types.
-template <typename MatcherTuple, typename ValueTuple>
-bool TupleMatches(const MatcherTuple& matcher_tuple,
-                  const ValueTuple& value_tuple) {
-  // Makes sure that matcher_tuple and value_tuple have the same
-  // number of fields.
-  static_assert(std::tuple_size<MatcherTuple>::value ==
-                    std::tuple_size<ValueTuple>::value,
-                "matcher and value have different numbers of fields");
-  return TuplePrefix<std::tuple_size<ValueTuple>::value>::Matches(matcher_tuple,
-                                                                  value_tuple);
-}
-
-// Describes failures in matching matchers against values.  If there
-// is no failure, nothing will be streamed to os.
-template <typename MatcherTuple, typename ValueTuple>
-void ExplainMatchFailureTupleTo(const MatcherTuple& matchers,
-                                const ValueTuple& values, ::std::ostream* os) {
-  TuplePrefix<std::tuple_size<MatcherTuple>::value>::ExplainMatchFailuresTo(
-      matchers, values, os);
-}
-
-// TransformTupleValues and its helper.
-//
-// TransformTupleValuesHelper hides the internal machinery that
-// TransformTupleValues uses to implement a tuple traversal.
-template <typename Tuple, typename Func, typename OutIter>
-class TransformTupleValuesHelper {
- private:
-  typedef ::std::tuple_size<Tuple> TupleSize;
-
- public:
-  // For each member of tuple 't', taken in order, evaluates '*out++ = f(t)'.
-  // Returns the final value of 'out' in case the caller needs it.
-  static OutIter Run(Func f, const Tuple& t, OutIter out) {
-    return IterateOverTuple<Tuple, TupleSize::value>()(f, t, out);
-  }
-
- private:
-  template <typename Tup, size_t kRemainingSize>
-  struct IterateOverTuple {
-    OutIter operator()(Func f, const Tup& t, OutIter out) const {
-      *out++ = f(::std::get<TupleSize::value - kRemainingSize>(t));
-      return IterateOverTuple<Tup, kRemainingSize - 1>()(f, t, out);
-    }
-  };
-  template <typename Tup>
-  struct IterateOverTuple<Tup, 0> {
-    OutIter operator()(Func /* f */, const Tup& /* t */, OutIter out) const {
-      return out;
-    }
-  };
-};
-
-// Successively invokes 'f(element)' on each element of the tuple 't',
-// appending each result to the 'out' iterator.  Returns the final value
-// of 'out'.
-template <typename Tuple, typename Func, typename OutIter>
-OutIter TransformTupleValues(Func f, const Tuple& t, OutIter out) {
-  return TransformTupleValuesHelper<Tuple, Func, OutIter>::Run(f, t, out);
-}
-
-// Implements _, a matcher that matches any value of any
-// type.  This is a polymorphic matcher, so we need a template type
-// conversion operator to make it appear as a Matcher<T> for any
-// type T.
-class AnythingMatcher {
- public:
-  using is_gtest_matcher = void;
-
-  template <typename T>
-  bool MatchAndExplain(const T& /* x */, std::ostream* /* listener */) const {
-    return true;
-  }
-  void DescribeTo(std::ostream* os) const { *os << "is anything"; }
-  void DescribeNegationTo(::std::ostream* os) const {
-    // This is mostly for completeness' sake, as it's not very useful
-    // to write Not(A<bool>()).  However we cannot completely rule out
-    // such a possibility, and it doesn't hurt to be prepared.
-    *os << "never matches";
-  }
-};
-
-// Implements the polymorphic IsNull() matcher, which matches any raw or smart
-// pointer that is NULL.
-class IsNullMatcher {
- public:
-  template <typename Pointer>
-  bool MatchAndExplain(const Pointer& p,
-                       MatchResultListener* /* listener */) const {
-    return p == nullptr;
-  }
-
-  void DescribeTo(::std::ostream* os) const { *os << "is NULL"; }
-  void DescribeNegationTo(::std::ostream* os) const { *os << "isn't NULL"; }
-};
-
-// Implements the polymorphic NotNull() matcher, which matches any raw or smart
-// pointer that is not NULL.
-class NotNullMatcher {
- public:
-  template <typename Pointer>
-  bool MatchAndExplain(const Pointer& p,
-                       MatchResultListener* /* listener */) const {
-    return p != nullptr;
-  }
-
-  void DescribeTo(::std::ostream* os) const { *os << "isn't NULL"; }
-  void DescribeNegationTo(::std::ostream* os) const { *os << "is NULL"; }
-};
-
-// Ref(variable) matches any argument that is a reference to
-// 'variable'.  This matcher is polymorphic as it can match any
-// super type of the type of 'variable'.
-//
-// The RefMatcher template class implements Ref(variable).  It can
-// only be instantiated with a reference type.  This prevents a user
-// from mistakenly using Ref(x) to match a non-reference function
-// argument.  For example, the following will righteously cause a
-// compiler error:
-//
-//   int n;
-//   Matcher<int> m1 = Ref(n);   // This won't compile.
-//   Matcher<int&> m2 = Ref(n);  // This will compile.
-template <typename T>
-class RefMatcher;
-
-template <typename T>
-class RefMatcher<T&> {
-  // Google Mock is a generic framework and thus needs to support
-  // mocking any function types, including those that take non-const
-  // reference arguments.  Therefore the template parameter T (and
-  // Super below) can be instantiated to either a const type or a
-  // non-const type.
- public:
-  // RefMatcher() takes a T& instead of const T&, as we want the
-  // compiler to catch using Ref(const_value) as a matcher for a
-  // non-const reference.
-  explicit RefMatcher(T& x) : object_(x) {}  // NOLINT
-
-  template <typename Super>
-  operator Matcher<Super&>() const {
-    // By passing object_ (type T&) to Impl(), which expects a Super&,
-    // we make sure that Super is a super type of T.  In particular,
-    // this catches using Ref(const_value) as a matcher for a
-    // non-const reference, as you cannot implicitly convert a const
-    // reference to a non-const reference.
-    return MakeMatcher(new Impl<Super>(object_));
-  }
-
- private:
-  template <typename Super>
-  class Impl : public MatcherInterface<Super&> {
-   public:
-    explicit Impl(Super& x) : object_(x) {}  // NOLINT
-
-    // MatchAndExplain() takes a Super& (as opposed to const Super&)
-    // in order to match the interface MatcherInterface<Super&>.
-    bool MatchAndExplain(Super& x,
-                         MatchResultListener* listener) const override {
-      *listener << "which is located @" << static_cast<const void*>(&x);
-      return &x == &object_;
-    }
-
-    void DescribeTo(::std::ostream* os) const override {
-      *os << "references the variable ";
-      UniversalPrinter<Super&>::Print(object_, os);
-    }
-
-    void DescribeNegationTo(::std::ostream* os) const override {
-      *os << "does not reference the variable ";
-      UniversalPrinter<Super&>::Print(object_, os);
-    }
-
-   private:
-    const Super& object_;
-  };
-
-  T& object_;
-};
-
-// Polymorphic helper functions for narrow and wide string matchers.
-inline bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
-  return String::CaseInsensitiveCStringEquals(lhs, rhs);
-}
-
-inline bool CaseInsensitiveCStringEquals(const wchar_t* lhs,
-                                         const wchar_t* rhs) {
-  return String::CaseInsensitiveWideCStringEquals(lhs, rhs);
-}
-
-// String comparison for narrow or wide strings that can have embedded NUL
-// characters.
-template <typename StringType>
-bool CaseInsensitiveStringEquals(const StringType& s1, const StringType& s2) {
-  // Are the heads equal?
-  if (!CaseInsensitiveCStringEquals(s1.c_str(), s2.c_str())) {
-    return false;
-  }
-
-  // Skip the equal heads.
-  const typename StringType::value_type nul = 0;
-  const size_t i1 = s1.find(nul), i2 = s2.find(nul);
-
-  // Are we at the end of either s1 or s2?
-  if (i1 == StringType::npos || i2 == StringType::npos) {
-    return i1 == i2;
-  }
-
-  // Are the tails equal?
-  return CaseInsensitiveStringEquals(s1.substr(i1 + 1), s2.substr(i2 + 1));
-}
-
-// String matchers.
-
-// Implements equality-based string matchers like StrEq, StrCaseNe, etc.
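-// As a brief sketch of how these matchers read in practice (assuming the
-// public StrCaseEq/StrNe factory functions declared later in this file):
-//
-//   EXPECT_THAT(std::string("Hello"), ::testing::StrCaseEq("hello"));  // OK.
-//   EXPECT_THAT(std::string("world"), ::testing::StrNe("hello"));      // OK.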
-template <typename StringType> -class StrEqualityMatcher { - public: - StrEqualityMatcher(StringType str, bool expect_eq, bool case_sensitive) - : string_(std::move(str)), - expect_eq_(expect_eq), - case_sensitive_(case_sensitive) {} - -#if GTEST_INTERNAL_HAS_STRING_VIEW - bool MatchAndExplain(const internal::StringView& s, - MatchResultListener* listener) const { - // This should fail to compile if StringView is used with wide - // strings. - const StringType& str = std::string(s); - return MatchAndExplain(str, listener); - } -#endif // GTEST_INTERNAL_HAS_STRING_VIEW - - // Accepts pointer types, particularly: - // const char* - // char* - // const wchar_t* - // wchar_t* - template <typename CharType> - bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { - if (s == nullptr) { - return !expect_eq_; - } - return MatchAndExplain(StringType(s), listener); - } - - // Matches anything that can convert to StringType. - // - // This is a template, not just a plain function with const StringType&, - // because StringView has some interfering non-explicit constructors. - template <typename MatcheeStringType> - bool MatchAndExplain(const MatcheeStringType& s, - MatchResultListener* /* listener */) const { - const StringType s2(s); - const bool eq = case_sensitive_ ? s2 == string_ - : CaseInsensitiveStringEquals(s2, string_); - return expect_eq_ == eq; - } - - void DescribeTo(::std::ostream* os) const { - DescribeToHelper(expect_eq_, os); - } - - void DescribeNegationTo(::std::ostream* os) const { - DescribeToHelper(!expect_eq_, os); - } - - private: - void DescribeToHelper(bool expect_eq, ::std::ostream* os) const { - *os << (expect_eq ? "is " : "isn't "); - *os << "equal to "; - if (!case_sensitive_) { - *os << "(ignoring case) "; - } - UniversalPrint(string_, os); - } - - const StringType string_; - const bool expect_eq_; - const bool case_sensitive_; -}; - -// Implements the polymorphic HasSubstr(substring) matcher, which -// can be used as a Matcher<T> as long as T can be converted to a -// string. -template <typename StringType> -class HasSubstrMatcher { - public: - explicit HasSubstrMatcher(const StringType& substring) - : substring_(substring) {} - -#if GTEST_INTERNAL_HAS_STRING_VIEW - bool MatchAndExplain(const internal::StringView& s, - MatchResultListener* listener) const { - // This should fail to compile if StringView is used with wide - // strings. - const StringType& str = std::string(s); - return MatchAndExplain(str, listener); - } -#endif // GTEST_INTERNAL_HAS_STRING_VIEW - - // Accepts pointer types, particularly: - // const char* - // char* - // const wchar_t* - // wchar_t* - template <typename CharType> - bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { - return s != nullptr && MatchAndExplain(StringType(s), listener); - } - - // Matches anything that can convert to StringType. - // - // This is a template, not just a plain function with const StringType&, - // because StringView has some interfering non-explicit constructors. - template <typename MatcheeStringType> - bool MatchAndExplain(const MatcheeStringType& s, - MatchResultListener* /* listener */) const { - return StringType(s).find(substring_) != StringType::npos; - } - - // Describes what this matcher matches. 
- void DescribeTo(::std::ostream* os) const { - *os << "has substring "; - UniversalPrint(substring_, os); - } - - void DescribeNegationTo(::std::ostream* os) const { - *os << "has no substring "; - UniversalPrint(substring_, os); - } - - private: - const StringType substring_; -}; - -// Implements the polymorphic StartsWith(substring) matcher, which -// can be used as a Matcher<T> as long as T can be converted to a -// string. -template <typename StringType> -class StartsWithMatcher { - public: - explicit StartsWithMatcher(const StringType& prefix) : prefix_(prefix) {} - -#if GTEST_INTERNAL_HAS_STRING_VIEW - bool MatchAndExplain(const internal::StringView& s, - MatchResultListener* listener) const { - // This should fail to compile if StringView is used with wide - // strings. - const StringType& str = std::string(s); - return MatchAndExplain(str, listener); - } -#endif // GTEST_INTERNAL_HAS_STRING_VIEW - - // Accepts pointer types, particularly: - // const char* - // char* - // const wchar_t* - // wchar_t* - template <typename CharType> - bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { - return s != nullptr && MatchAndExplain(StringType(s), listener); - } - - // Matches anything that can convert to StringType. - // - // This is a template, not just a plain function with const StringType&, - // because StringView has some interfering non-explicit constructors. - template <typename MatcheeStringType> - bool MatchAndExplain(const MatcheeStringType& s, - MatchResultListener* /* listener */) const { - const StringType& s2(s); - return s2.length() >= prefix_.length() && - s2.substr(0, prefix_.length()) == prefix_; - } - - void DescribeTo(::std::ostream* os) const { - *os << "starts with "; - UniversalPrint(prefix_, os); - } - - void DescribeNegationTo(::std::ostream* os) const { - *os << "doesn't start with "; - UniversalPrint(prefix_, os); - } - - private: - const StringType prefix_; -}; - -// Implements the polymorphic EndsWith(substring) matcher, which -// can be used as a Matcher<T> as long as T can be converted to a -// string. -template <typename StringType> -class EndsWithMatcher { - public: - explicit EndsWithMatcher(const StringType& suffix) : suffix_(suffix) {} - -#if GTEST_INTERNAL_HAS_STRING_VIEW - bool MatchAndExplain(const internal::StringView& s, - MatchResultListener* listener) const { - // This should fail to compile if StringView is used with wide - // strings. - const StringType& str = std::string(s); - return MatchAndExplain(str, listener); - } -#endif // GTEST_INTERNAL_HAS_STRING_VIEW - - // Accepts pointer types, particularly: - // const char* - // char* - // const wchar_t* - // wchar_t* - template <typename CharType> - bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { - return s != nullptr && MatchAndExplain(StringType(s), listener); - } - - // Matches anything that can convert to StringType. - // - // This is a template, not just a plain function with const StringType&, - // because StringView has some interfering non-explicit constructors. 
- template <typename MatcheeStringType> - bool MatchAndExplain(const MatcheeStringType& s, - MatchResultListener* /* listener */) const { - const StringType& s2(s); - return s2.length() >= suffix_.length() && - s2.substr(s2.length() - suffix_.length()) == suffix_; - } - - void DescribeTo(::std::ostream* os) const { - *os << "ends with "; - UniversalPrint(suffix_, os); - } - - void DescribeNegationTo(::std::ostream* os) const { - *os << "doesn't end with "; - UniversalPrint(suffix_, os); - } - - private: - const StringType suffix_; -}; - -// Implements the polymorphic WhenBase64Unescaped(matcher) matcher, which can be -// used as a Matcher<T> as long as T can be converted to a string. -class WhenBase64UnescapedMatcher { - public: - using is_gtest_matcher = void; - - explicit WhenBase64UnescapedMatcher( - const Matcher<const std::string&>& internal_matcher) - : internal_matcher_(internal_matcher) {} - - // Matches anything that can convert to std::string. - template <typename MatcheeStringType> - bool MatchAndExplain(const MatcheeStringType& s, - MatchResultListener* listener) const { - const std::string s2(s); // NOLINT (needed for working with string_view). - std::string unescaped; - if (!internal::Base64Unescape(s2, &unescaped)) { - if (listener != nullptr) { - *listener << "is not a valid base64 escaped string"; - } - return false; - } - return MatchPrintAndExplain(unescaped, internal_matcher_, listener); - } - - void DescribeTo(::std::ostream* os) const { - *os << "matches after Base64Unescape "; - internal_matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const { - *os << "does not match after Base64Unescape "; - internal_matcher_.DescribeTo(os); - } - - private: - const Matcher<const std::string&> internal_matcher_; -}; - -// Implements a matcher that compares the two fields of a 2-tuple -// using one of the ==, <=, <, etc, operators. The two fields being -// compared don't have to have the same type. -// -// The matcher defined here is polymorphic (for example, Eq() can be -// used to match a std::tuple<int, short>, a std::tuple<const long&, double>, -// etc). Therefore we use a template type conversion operator in the -// implementation. 
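-// As a brief sketch (assuming the no-argument Lt() factory declared later in
-// this file):
-//
-//   ::std::tuple<int, int> t(5, 7);
-//   EXPECT_THAT(t, ::testing::Lt());  // Checks std::get<0>(t) < std::get<1>(t).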
-template <typename D, typename Op> -class PairMatchBase { - public: - template <typename T1, typename T2> - operator Matcher<::std::tuple<T1, T2>>() const { - return Matcher<::std::tuple<T1, T2>>(new Impl<const ::std::tuple<T1, T2>&>); - } - template <typename T1, typename T2> - operator Matcher<const ::std::tuple<T1, T2>&>() const { - return MakeMatcher(new Impl<const ::std::tuple<T1, T2>&>); - } - - private: - static ::std::ostream& GetDesc(::std::ostream& os) { // NOLINT - return os << D::Desc(); - } - - template <typename Tuple> - class Impl : public MatcherInterface<Tuple> { - public: - bool MatchAndExplain(Tuple args, - MatchResultListener* /* listener */) const override { - return Op()(::std::get<0>(args), ::std::get<1>(args)); - } - void DescribeTo(::std::ostream* os) const override { - *os << "are " << GetDesc; - } - void DescribeNegationTo(::std::ostream* os) const override { - *os << "aren't " << GetDesc; - } - }; -}; - -class Eq2Matcher : public PairMatchBase<Eq2Matcher, AnyEq> { - public: - static const char* Desc() { return "an equal pair"; } -}; -class Ne2Matcher : public PairMatchBase<Ne2Matcher, AnyNe> { - public: - static const char* Desc() { return "an unequal pair"; } -}; -class Lt2Matcher : public PairMatchBase<Lt2Matcher, AnyLt> { - public: - static const char* Desc() { return "a pair where the first < the second"; } -}; -class Gt2Matcher : public PairMatchBase<Gt2Matcher, AnyGt> { - public: - static const char* Desc() { return "a pair where the first > the second"; } -}; -class Le2Matcher : public PairMatchBase<Le2Matcher, AnyLe> { - public: - static const char* Desc() { return "a pair where the first <= the second"; } -}; -class Ge2Matcher : public PairMatchBase<Ge2Matcher, AnyGe> { - public: - static const char* Desc() { return "a pair where the first >= the second"; } -}; - -// Implements the Not(...) matcher for a particular argument type T. -// We do not nest it inside the NotMatcher class template, as that -// will prevent different instantiations of NotMatcher from sharing -// the same NotMatcherImpl<T> class. -template <typename T> -class NotMatcherImpl : public MatcherInterface<const T&> { - public: - explicit NotMatcherImpl(const Matcher<T>& matcher) : matcher_(matcher) {} - - bool MatchAndExplain(const T& x, - MatchResultListener* listener) const override { - return !matcher_.MatchAndExplain(x, listener); - } - - void DescribeTo(::std::ostream* os) const override { - matcher_.DescribeNegationTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - matcher_.DescribeTo(os); - } - - private: - const Matcher<T> matcher_; -}; - -// Implements the Not(m) matcher, which matches a value that doesn't -// match matcher m. -template <typename InnerMatcher> -class NotMatcher { - public: - explicit NotMatcher(InnerMatcher matcher) : matcher_(matcher) {} - - // This template type conversion operator allows Not(m) to be used - // to match any type m can match. - template <typename T> - operator Matcher<T>() const { - return Matcher<T>(new NotMatcherImpl<T>(SafeMatcherCast<T>(matcher_))); - } - - private: - InnerMatcher matcher_; -}; - -// Implements the AllOf(m1, m2) matcher for a particular argument type -// T. We do not nest it inside the BothOfMatcher class template, as -// that will prevent different instantiations of BothOfMatcher from -// sharing the same BothOfMatcherImpl<T> class. 
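-// As a brief sketch, AllOf(Gt(5), Lt(10)) matches a value strictly between 5
-// and 10; on a mismatch, MatchAndExplain below explains only the first
-// sub-matcher that fails.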
-template <typename T>
-class AllOfMatcherImpl : public MatcherInterface<const T&> {
- public:
-  explicit AllOfMatcherImpl(std::vector<Matcher<T>> matchers)
-      : matchers_(std::move(matchers)) {}
-
-  void DescribeTo(::std::ostream* os) const override {
-    *os << "(";
-    for (size_t i = 0; i < matchers_.size(); ++i) {
-      if (i != 0) *os << ") and (";
-      matchers_[i].DescribeTo(os);
-    }
-    *os << ")";
-  }
-
-  void DescribeNegationTo(::std::ostream* os) const override {
-    *os << "(";
-    for (size_t i = 0; i < matchers_.size(); ++i) {
-      if (i != 0) *os << ") or (";
-      matchers_[i].DescribeNegationTo(os);
-    }
-    *os << ")";
-  }
-
-  bool MatchAndExplain(const T& x,
-                       MatchResultListener* listener) const override {
-    // If any of the matchers doesn't match x, we only need to
-    // explain why that one fails.
-    std::string all_match_result;
-
-    for (size_t i = 0; i < matchers_.size(); ++i) {
-      StringMatchResultListener slistener;
-      if (matchers_[i].MatchAndExplain(x, &slistener)) {
-        if (all_match_result.empty()) {
-          all_match_result = slistener.str();
-        } else {
-          std::string result = slistener.str();
-          if (!result.empty()) {
-            all_match_result += ", and ";
-            all_match_result += result;
-          }
-        }
-      } else {
-        *listener << slistener.str();
-        return false;
-      }
-    }
-
-    // Otherwise we need to explain why *all* of them match.
-    *listener << all_match_result;
-    return true;
-  }
-
- private:
-  const std::vector<Matcher<T>> matchers_;
-};
-
-// VariadicMatcher is used for the variadic implementation of
-// AllOf(m_1, m_2, ...) and AnyOf(m_1, m_2, ...).
-// CombiningMatcher<T> is used to recursively combine the provided matchers
-// (of type Args...).
-template <template <typename T> class CombiningMatcher, typename... Args>
-class VariadicMatcher {
- public:
-  VariadicMatcher(const Args&... matchers)  // NOLINT
-      : matchers_(matchers...) {
-    static_assert(sizeof...(Args) > 0, "Must have at least one matcher.");
-  }
-
-  VariadicMatcher(const VariadicMatcher&) = default;
-  VariadicMatcher& operator=(const VariadicMatcher&) = delete;
-
-  // This template type conversion operator allows a
-  // VariadicMatcher<Matcher1, Matcher2...> object to match any type that
-  // all of the provided matchers (Matcher1, Matcher2, ...) can match.
-  template <typename T>
-  operator Matcher<T>() const {
-    std::vector<Matcher<T>> values;
-    CreateVariadicMatcher<T>(&values, std::integral_constant<size_t, 0>());
-    return Matcher<T>(new CombiningMatcher<T>(std::move(values)));
-  }
-
- private:
-  template <typename T, size_t I>
-  void CreateVariadicMatcher(std::vector<Matcher<T>>* values,
-                             std::integral_constant<size_t, I>) const {
-    values->push_back(SafeMatcherCast<T>(std::get<I>(matchers_)));
-    CreateVariadicMatcher<T>(values, std::integral_constant<size_t, I + 1>());
-  }
-
-  template <typename T>
-  void CreateVariadicMatcher(
-      std::vector<Matcher<T>>*,
-      std::integral_constant<size_t, sizeof...(Args)>) const {}
-
-  std::tuple<Args...> matchers_;
-};
-
-template <typename... Args>
-using AllOfMatcher = VariadicMatcher<AllOfMatcherImpl, Args...>;
-
-// Implements the AnyOf(m1, m2) matcher for a particular argument type
-// T.  We do not nest it inside the AnyOfMatcher class template, as
-// that will prevent different instantiations of AnyOfMatcher from
-// sharing the same AnyOfMatcherImpl<T> class.
-template <typename T>
-class AnyOfMatcherImpl : public MatcherInterface<const T&> {
- public:
-  explicit AnyOfMatcherImpl(std::vector<Matcher<T>> matchers)
-      : matchers_(std::move(matchers)) {}
-
-  void DescribeTo(::std::ostream* os) const override {
-    *os << "(";
-    for (size_t i = 0; i < matchers_.size(); ++i) {
-      if (i != 0) *os << ") or (";
-      matchers_[i].DescribeTo(os);
-    }
-    *os << ")";
-  }
-
-  void DescribeNegationTo(::std::ostream* os) const override {
-    *os << "(";
-    for (size_t i = 0; i < matchers_.size(); ++i) {
-      if (i != 0) *os << ") and (";
-      matchers_[i].DescribeNegationTo(os);
-    }
-    *os << ")";
-  }
-
-  bool MatchAndExplain(const T& x,
-                       MatchResultListener* listener) const override {
-    std::string no_match_result;
-
-    // If any of the matchers matches x, we just need to
-    // explain why *one* of them matches.
-    for (size_t i = 0; i < matchers_.size(); ++i) {
-      StringMatchResultListener slistener;
-      if (matchers_[i].MatchAndExplain(x, &slistener)) {
-        *listener << slistener.str();
-        return true;
-      } else {
-        if (no_match_result.empty()) {
-          no_match_result = slistener.str();
-        } else {
-          std::string result = slistener.str();
-          if (!result.empty()) {
-            no_match_result += ", and ";
-            no_match_result += result;
-          }
-        }
-      }
-    }
-
-    // Otherwise we need to explain why *all* of them fail.
-    *listener << no_match_result;
-    return false;
-  }
-
- private:
-  const std::vector<Matcher<T>> matchers_;
-};
-
-// AnyOfMatcher is used for the variadic implementation of AnyOf(m_1, m_2, ...).
-template <typename... Args>
-using AnyOfMatcher = VariadicMatcher<AnyOfMatcherImpl, Args...>;
-
-// ConditionalMatcher is the implementation of Conditional(cond, m1, m2).
-template <typename MatcherTrue, typename MatcherFalse>
-class ConditionalMatcher {
- public:
-  ConditionalMatcher(bool condition, MatcherTrue matcher_true,
-                     MatcherFalse matcher_false)
-      : condition_(condition),
-        matcher_true_(std::move(matcher_true)),
-        matcher_false_(std::move(matcher_false)) {}
-
-  template <typename T>
-  operator Matcher<T>() const {  // NOLINT(runtime/explicit)
-    return condition_ ? SafeMatcherCast<T>(matcher_true_)
-                      : SafeMatcherCast<T>(matcher_false_);
-  }
-
- private:
-  bool condition_;
-  MatcherTrue matcher_true_;
-  MatcherFalse matcher_false_;
-};
-
-// Wrapper for implementation of Any/AllOfArray().
-template <template <class> class MatcherImpl, typename T>
-class SomeOfArrayMatcher {
- public:
-  // Constructs the matcher from a sequence of element values or
-  // element matchers.
-  template <typename Iter>
-  SomeOfArrayMatcher(Iter first, Iter last) : matchers_(first, last) {}
-
-  template <typename U>
-  operator Matcher<U>() const {  // NOLINT
-    using RawU = typename std::decay<U>::type;
-    std::vector<Matcher<RawU>> matchers;
-    for (const auto& matcher : matchers_) {
-      matchers.push_back(MatcherCast<RawU>(matcher));
-    }
-    return Matcher<U>(new MatcherImpl<RawU>(std::move(matchers)));
-  }
-
- private:
-  const ::std::vector<T> matchers_;
-};
-
-template <typename T>
-using AllOfArrayMatcher = SomeOfArrayMatcher<AllOfMatcherImpl, T>;
-
-template <typename T>
-using AnyOfArrayMatcher = SomeOfArrayMatcher<AnyOfMatcherImpl, T>;
-
-// Used for implementing Truly(pred), which turns a predicate into a
-// matcher.
-template <typename Predicate>
-class TrulyMatcher {
- public:
-  explicit TrulyMatcher(Predicate pred) : predicate_(pred) {}
-
-  // This method template allows Truly(pred) to be used as a matcher
-  // for type T where T is the argument type of predicate 'pred'. 
The - // argument is passed by reference as the predicate may be - // interested in the address of the argument. - template <typename T> - bool MatchAndExplain(T& x, // NOLINT - MatchResultListener* listener) const { - // Without the if-statement, MSVC sometimes warns about converting - // a value to bool (warning 4800). - // - // We cannot write 'return !!predicate_(x);' as that doesn't work - // when predicate_(x) returns a class convertible to bool but - // having no operator!(). - if (predicate_(x)) return true; - *listener << "didn't satisfy the given predicate"; - return false; - } - - void DescribeTo(::std::ostream* os) const { - *os << "satisfies the given predicate"; - } - - void DescribeNegationTo(::std::ostream* os) const { - *os << "doesn't satisfy the given predicate"; - } - - private: - Predicate predicate_; -}; - -// Used for implementing Matches(matcher), which turns a matcher into -// a predicate. -template <typename M> -class MatcherAsPredicate { - public: - explicit MatcherAsPredicate(M matcher) : matcher_(matcher) {} - - // This template operator() allows Matches(m) to be used as a - // predicate on type T where m is a matcher on type T. - // - // The argument x is passed by reference instead of by value, as - // some matcher may be interested in its address (e.g. as in - // Matches(Ref(n))(x)). - template <typename T> - bool operator()(const T& x) const { - // We let matcher_ commit to a particular type here instead of - // when the MatcherAsPredicate object was constructed. This - // allows us to write Matches(m) where m is a polymorphic matcher - // (e.g. Eq(5)). - // - // If we write Matcher<T>(matcher_).Matches(x) here, it won't - // compile when matcher_ has type Matcher<const T&>; if we write - // Matcher<const T&>(matcher_).Matches(x) here, it won't compile - // when matcher_ has type Matcher<T>; if we just write - // matcher_.Matches(x), it won't compile when matcher_ is - // polymorphic, e.g. Eq(5). - // - // MatcherCast<const T&>() is necessary for making the code work - // in all of the above situations. - return MatcherCast<const T&>(matcher_).Matches(x); - } - - private: - M matcher_; -}; - -// For implementing ASSERT_THAT() and EXPECT_THAT(). The template -// argument M must be a type that can be converted to a matcher. -template <typename M> -class PredicateFormatterFromMatcher { - public: - explicit PredicateFormatterFromMatcher(M m) : matcher_(std::move(m)) {} - - // This template () operator allows a PredicateFormatterFromMatcher - // object to act as a predicate-formatter suitable for using with - // Google Test's EXPECT_PRED_FORMAT1() macro. - template <typename T> - AssertionResult operator()(const char* value_text, const T& x) const { - // We convert matcher_ to a Matcher<const T&> *now* instead of - // when the PredicateFormatterFromMatcher object was constructed, - // as matcher_ may be polymorphic (e.g. NotNull()) and we won't - // know which type to instantiate it to until we actually see the - // type of x here. - // - // We write SafeMatcherCast<const T&>(matcher_) instead of - // Matcher<const T&>(matcher_), as the latter won't compile when - // matcher_ has type Matcher<T> (e.g. An<int>()). - // We don't write MatcherCast<const T&> either, as that allows - // potentially unsafe downcasting of the matcher argument. - const Matcher<const T&> matcher = SafeMatcherCast<const T&>(matcher_); - - // The expected path here is that the matcher should match (i.e. that most - // tests pass) so optimize for this case. 
-    if (matcher.Matches(x)) {
-      return AssertionSuccess();
-    }
-
-    ::std::stringstream ss;
-    ss << "Value of: " << value_text << "\n"
-       << "Expected: ";
-    matcher.DescribeTo(&ss);
-
-    // Rerun the matcher to "PrintAndExplain" the failure.
-    StringMatchResultListener listener;
-    if (MatchPrintAndExplain(x, matcher, &listener)) {
-      ss << "\n  The matcher failed on the initial attempt; but passed when "
-            "rerun to generate the explanation.";
-    }
-    ss << "\n  Actual: " << listener.str();
-    return AssertionFailure() << ss.str();
-  }
-
- private:
-  const M matcher_;
-};
-
-// A helper function for converting a matcher to a predicate-formatter
-// without the user needing to explicitly write the type. This is
-// used for implementing ASSERT_THAT() and EXPECT_THAT().
-// Implementation detail: 'matcher' is received by-value to force decaying.
-template <typename M>
-inline PredicateFormatterFromMatcher<M> MakePredicateFormatterFromMatcher(
-    M matcher) {
-  return PredicateFormatterFromMatcher<M>(std::move(matcher));
-}
-
-// Implements the polymorphic IsNan() matcher, which matches any
-// floating-point value that is NaN.
-class IsNanMatcher {
- public:
-  template <typename FloatType>
-  bool MatchAndExplain(const FloatType& f,
-                       MatchResultListener* /* listener */) const {
-    return (::std::isnan)(f);
-  }
-
-  void DescribeTo(::std::ostream* os) const { *os << "is NaN"; }
-  void DescribeNegationTo(::std::ostream* os) const { *os << "isn't NaN"; }
-};
-
-// Implements the polymorphic floating point equality matcher, which matches
-// two float values using ULP-based approximation or, optionally, a
-// user-specified epsilon. The template is meant to be instantiated with
-// FloatType being either float or double.
-template <typename FloatType>
-class FloatingEqMatcher {
- public:
-  // Constructor for FloatingEqMatcher.
-  // The matcher's input will be compared with expected. The matcher treats
-  // two NaNs as equal if nan_eq_nan is true. Otherwise, under IEEE standards,
-  // equality comparisons between NaNs will always return false. We specify a
-  // negative max_abs_error_ term to indicate that ULP-based approximation will
-  // be used for comparison.
-  FloatingEqMatcher(FloatType expected, bool nan_eq_nan)
-      : expected_(expected), nan_eq_nan_(nan_eq_nan), max_abs_error_(-1) {}
-
-  // Constructor that supports a user-specified max_abs_error that will be
-  // used for comparison instead of ULP-based approximation. The max absolute
-  // error should be non-negative.
-  FloatingEqMatcher(FloatType expected, bool nan_eq_nan,
-                    FloatType max_abs_error)
-      : expected_(expected),
-        nan_eq_nan_(nan_eq_nan),
-        max_abs_error_(max_abs_error) {
-    GTEST_CHECK_(max_abs_error >= 0)
-        << ", where max_abs_error is " << max_abs_error;
-  }
-
-  // Implements floating point equality matcher as a Matcher<T>.
-  template <typename T>
-  class Impl : public MatcherInterface<T> {
-   public:
-    Impl(FloatType expected, bool nan_eq_nan, FloatType max_abs_error)
-        : expected_(expected),
-          nan_eq_nan_(nan_eq_nan),
-          max_abs_error_(max_abs_error) {}
-
-    bool MatchAndExplain(T value,
-                         MatchResultListener* listener) const override {
-      const FloatingPoint<FloatType> actual(value), expected(expected_);
-
-      // Compares NaNs first, if nan_eq_nan_ is true.
-      if (actual.is_nan() || expected.is_nan()) {
-        if (actual.is_nan() && expected.is_nan()) {
-          return nan_eq_nan_;
-        }
-        // One is NaN; the other is not.
- return false; - } - if (HasMaxAbsError()) { - // We perform an equality check so that inf will match inf, regardless - // of error bounds. If the result of value - expected_ would result in - // overflow or if either value is inf, the default result is infinity, - // which should only match if max_abs_error_ is also infinity. - if (value == expected_) { - return true; - } - - const FloatType diff = value - expected_; - if (::std::fabs(diff) <= max_abs_error_) { - return true; - } - - if (listener->IsInterested()) { - *listener << "which is " << diff << " from " << expected_; - } - return false; - } else { - return actual.AlmostEquals(expected); - } - } - - void DescribeTo(::std::ostream* os) const override { - // os->precision() returns the previously set precision, which we - // store to restore the ostream to its original configuration - // after outputting. - const ::std::streamsize old_precision = - os->precision(::std::numeric_limits<FloatType>::digits10 + 2); - if (FloatingPoint<FloatType>(expected_).is_nan()) { - if (nan_eq_nan_) { - *os << "is NaN"; - } else { - *os << "never matches"; - } - } else { - *os << "is approximately " << expected_; - if (HasMaxAbsError()) { - *os << " (absolute error <= " << max_abs_error_ << ")"; - } - } - os->precision(old_precision); - } - - void DescribeNegationTo(::std::ostream* os) const override { - // As before, get original precision. - const ::std::streamsize old_precision = - os->precision(::std::numeric_limits<FloatType>::digits10 + 2); - if (FloatingPoint<FloatType>(expected_).is_nan()) { - if (nan_eq_nan_) { - *os << "isn't NaN"; - } else { - *os << "is anything"; - } - } else { - *os << "isn't approximately " << expected_; - if (HasMaxAbsError()) { - *os << " (absolute error > " << max_abs_error_ << ")"; - } - } - // Restore original precision. - os->precision(old_precision); - } - - private: - bool HasMaxAbsError() const { return max_abs_error_ >= 0; } - - const FloatType expected_; - const bool nan_eq_nan_; - // max_abs_error will be used for value comparison when >= 0. - const FloatType max_abs_error_; - }; - - // The following 3 type conversion operators allow FloatEq(expected) and - // NanSensitiveFloatEq(expected) to be used as a Matcher<float>, a - // Matcher<const float&>, or a Matcher<float&>, but nothing else. - operator Matcher<FloatType>() const { - return MakeMatcher( - new Impl<FloatType>(expected_, nan_eq_nan_, max_abs_error_)); - } - - operator Matcher<const FloatType&>() const { - return MakeMatcher( - new Impl<const FloatType&>(expected_, nan_eq_nan_, max_abs_error_)); - } - - operator Matcher<FloatType&>() const { - return MakeMatcher( - new Impl<FloatType&>(expected_, nan_eq_nan_, max_abs_error_)); - } - - private: - const FloatType expected_; - const bool nan_eq_nan_; - // max_abs_error will be used for value comparison when >= 0. - const FloatType max_abs_error_; -}; - -// A 2-tuple ("binary") wrapper around FloatingEqMatcher: -// FloatingEq2Matcher() matches (x, y) by matching FloatingEqMatcher(x, false) -// against y, and FloatingEq2Matcher(e) matches FloatingEqMatcher(x, false, e) -// against y. The former implements "Eq", the latter "Near". At present, there -// is no version that compares NaNs as equal. 
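// An illustrative sketch of the three comparison modes implemented by
// FloatingEqMatcher above (ULP-based, explicit absolute error, and
// NaN-sensitive); the test name and values are made up:
#include <cmath>
#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(FloatingEqDemo, UlpAndAbsoluteError) {
  using ::testing::DoubleNear;
  using ::testing::FloatEq;
  using ::testing::NanSensitiveFloatEq;
  // FloatEq uses AlmostEquals(), i.e. the ULP-based branch
  // (max_abs_error_ is the -1 sentinel).
  EXPECT_THAT(1.0f, FloatEq(1.0f));
  // DoubleNear takes the explicit max_abs_error_ branch.
  EXPECT_THAT(3.14159, DoubleNear(3.1416, 1e-4));
  // Only the NaN-sensitive variants treat NaN as equal to NaN.
  EXPECT_THAT(std::nanf(""), NanSensitiveFloatEq(std::nanf("")));
}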
-template <typename FloatType> -class FloatingEq2Matcher { - public: - FloatingEq2Matcher() { Init(-1, false); } - - explicit FloatingEq2Matcher(bool nan_eq_nan) { Init(-1, nan_eq_nan); } - - explicit FloatingEq2Matcher(FloatType max_abs_error) { - Init(max_abs_error, false); - } - - FloatingEq2Matcher(FloatType max_abs_error, bool nan_eq_nan) { - Init(max_abs_error, nan_eq_nan); - } - - template <typename T1, typename T2> - operator Matcher<::std::tuple<T1, T2>>() const { - return MakeMatcher( - new Impl<::std::tuple<T1, T2>>(max_abs_error_, nan_eq_nan_)); - } - template <typename T1, typename T2> - operator Matcher<const ::std::tuple<T1, T2>&>() const { - return MakeMatcher( - new Impl<const ::std::tuple<T1, T2>&>(max_abs_error_, nan_eq_nan_)); - } - - private: - static ::std::ostream& GetDesc(::std::ostream& os) { // NOLINT - return os << "an almost-equal pair"; - } - - template <typename Tuple> - class Impl : public MatcherInterface<Tuple> { - public: - Impl(FloatType max_abs_error, bool nan_eq_nan) - : max_abs_error_(max_abs_error), nan_eq_nan_(nan_eq_nan) {} - - bool MatchAndExplain(Tuple args, - MatchResultListener* listener) const override { - if (max_abs_error_ == -1) { - FloatingEqMatcher<FloatType> fm(::std::get<0>(args), nan_eq_nan_); - return static_cast<Matcher<FloatType>>(fm).MatchAndExplain( - ::std::get<1>(args), listener); - } else { - FloatingEqMatcher<FloatType> fm(::std::get<0>(args), nan_eq_nan_, - max_abs_error_); - return static_cast<Matcher<FloatType>>(fm).MatchAndExplain( - ::std::get<1>(args), listener); - } - } - void DescribeTo(::std::ostream* os) const override { - *os << "are " << GetDesc; - } - void DescribeNegationTo(::std::ostream* os) const override { - *os << "aren't " << GetDesc; - } - - private: - FloatType max_abs_error_; - const bool nan_eq_nan_; - }; - - void Init(FloatType max_abs_error_val, bool nan_eq_nan_val) { - max_abs_error_ = max_abs_error_val; - nan_eq_nan_ = nan_eq_nan_val; - } - FloatType max_abs_error_; - bool nan_eq_nan_; -}; - -// Implements the Pointee(m) matcher for matching a pointer whose -// pointee matches matcher m. The pointer can be either raw or smart. -template <typename InnerMatcher> -class PointeeMatcher { - public: - explicit PointeeMatcher(const InnerMatcher& matcher) : matcher_(matcher) {} - - // This type conversion operator template allows Pointee(m) to be - // used as a matcher for any pointer type whose pointee type is - // compatible with the inner matcher, where type Pointer can be - // either a raw pointer or a smart pointer. - // - // The reason we do this instead of relying on - // MakePolymorphicMatcher() is that the latter is not flexible - // enough for implementing the DescribeTo() method of Pointee(). - template <typename Pointer> - operator Matcher<Pointer>() const { - return Matcher<Pointer>(new Impl<const Pointer&>(matcher_)); - } - - private: - // The monomorphic implementation that works for a particular pointer type. 
-  template <typename Pointer>
-  class Impl : public MatcherInterface<Pointer> {
-   public:
-    using Pointee =
-        typename std::pointer_traits<GTEST_REMOVE_REFERENCE_AND_CONST_(
-            Pointer)>::element_type;
-
-    explicit Impl(const InnerMatcher& matcher)
-        : matcher_(MatcherCast<const Pointee&>(matcher)) {}
-
-    void DescribeTo(::std::ostream* os) const override {
-      *os << "points to a value that ";
-      matcher_.DescribeTo(os);
-    }
-
-    void DescribeNegationTo(::std::ostream* os) const override {
-      *os << "does not point to a value that ";
-      matcher_.DescribeTo(os);
-    }
-
-    bool MatchAndExplain(Pointer pointer,
-                         MatchResultListener* listener) const override {
-      if (GetRawPointer(pointer) == nullptr) return false;
-
-      *listener << "which points to ";
-      return MatchPrintAndExplain(*pointer, matcher_, listener);
-    }
-
-   private:
-    const Matcher<const Pointee&> matcher_;
-  };
-
-  const InnerMatcher matcher_;
-};
-
-// Implements the Pointer(m) matcher for matching a pointer that matches
-// matcher m. The pointer can be either raw or smart, and will match `m`
-// against the raw pointer.
-template <typename InnerMatcher>
-class PointerMatcher {
- public:
-  explicit PointerMatcher(const InnerMatcher& matcher) : matcher_(matcher) {}
-
-  // This type conversion operator template allows Pointer(m) to be
-  // used as a matcher for any pointer type whose raw pointer type is
-  // compatible with the inner matcher, where type PointerType can be
-  // either a raw pointer or a smart pointer.
-  //
-  // The reason we do this instead of relying on
-  // MakePolymorphicMatcher() is that the latter is not flexible
-  // enough for implementing the DescribeTo() method of Pointer().
-  template <typename PointerType>
-  operator Matcher<PointerType>() const {  // NOLINT
-    return Matcher<PointerType>(new Impl<const PointerType&>(matcher_));
-  }
-
- private:
-  // The monomorphic implementation that works for a particular pointer type.
-  template <typename PointerType>
-  class Impl : public MatcherInterface<PointerType> {
-   public:
-    using Pointer =
-        const typename std::pointer_traits<GTEST_REMOVE_REFERENCE_AND_CONST_(
-            PointerType)>::element_type*;
-
-    explicit Impl(const InnerMatcher& matcher)
-        : matcher_(MatcherCast<Pointer>(matcher)) {}
-
-    void DescribeTo(::std::ostream* os) const override {
-      *os << "is a pointer that ";
-      matcher_.DescribeTo(os);
-    }
-
-    void DescribeNegationTo(::std::ostream* os) const override {
-      *os << "is not a pointer that ";
-      matcher_.DescribeTo(os);
-    }
-
-    bool MatchAndExplain(PointerType pointer,
-                         MatchResultListener* listener) const override {
-      *listener << "which is a pointer that ";
-      Pointer p = GetRawPointer(pointer);
-      return MatchPrintAndExplain(p, matcher_, listener);
-    }
-
-   private:
-    Matcher<Pointer> matcher_;
-  };
-
-  const InnerMatcher matcher_;
-};
-
-#if GTEST_HAS_RTTI
-// Implements the WhenDynamicCastTo<T>(m) matcher that matches a pointer or
-// reference that matches inner_matcher when dynamic_cast<T> is applied.
-// The result of dynamic_cast<To> is forwarded to the inner matcher.
-// If To is a pointer and the cast fails, the inner matcher will receive NULL.
-// If To is a reference and the cast fails, this matcher returns false
-// immediately.
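// An illustrative sketch of Pointee() and Pointer() as implemented above,
// with both raw and smart pointers (the test name and values are made up):
#include <memory>
#include "gmock/gmock.h"
#include "gtest/gtest.h"

TEST(PointerMatcherDemo, PointeeAndPointer) {
  using ::testing::Eq;
  using ::testing::Pointee;
  using ::testing::Pointer;
  auto p = std::make_unique<int>(42);
  // Pointee(m) dereferences the (raw or smart) pointer and matches *p;
  // a null pointer never matches.
  EXPECT_THAT(p, Pointee(Eq(42)));
  // Pointer(m) extracts the raw pointer and matches it directly.
  EXPECT_THAT(p, Pointer(Eq(p.get())));
}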
-template <typename To> -class WhenDynamicCastToMatcherBase { - public: - explicit WhenDynamicCastToMatcherBase(const Matcher<To>& matcher) - : matcher_(matcher) {} - - void DescribeTo(::std::ostream* os) const { - GetCastTypeDescription(os); - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const { - GetCastTypeDescription(os); - matcher_.DescribeNegationTo(os); - } - - protected: - const Matcher<To> matcher_; - - static std::string GetToName() { return GetTypeName<To>(); } - - private: - static void GetCastTypeDescription(::std::ostream* os) { - *os << "when dynamic_cast to " << GetToName() << ", "; - } -}; - -// Primary template. -// To is a pointer. Cast and forward the result. -template <typename To> -class WhenDynamicCastToMatcher : public WhenDynamicCastToMatcherBase<To> { - public: - explicit WhenDynamicCastToMatcher(const Matcher<To>& matcher) - : WhenDynamicCastToMatcherBase<To>(matcher) {} - - template <typename From> - bool MatchAndExplain(From from, MatchResultListener* listener) const { - To to = dynamic_cast<To>(from); - return MatchPrintAndExplain(to, this->matcher_, listener); - } -}; - -// Specialize for references. -// In this case we return false if the dynamic_cast fails. -template <typename To> -class WhenDynamicCastToMatcher<To&> : public WhenDynamicCastToMatcherBase<To&> { - public: - explicit WhenDynamicCastToMatcher(const Matcher<To&>& matcher) - : WhenDynamicCastToMatcherBase<To&>(matcher) {} - - template <typename From> - bool MatchAndExplain(From& from, MatchResultListener* listener) const { - // We don't want an std::bad_cast here, so do the cast with pointers. - To* to = dynamic_cast<To*>(&from); - if (to == nullptr) { - *listener << "which cannot be dynamic_cast to " << this->GetToName(); - return false; - } - return MatchPrintAndExplain(*to, this->matcher_, listener); - } -}; -#endif // GTEST_HAS_RTTI - -// Implements the Field() matcher for matching a field (i.e. member -// variable) of an object. -template <typename Class, typename FieldType> -class FieldMatcher { - public: - FieldMatcher(FieldType Class::*field, - const Matcher<const FieldType&>& matcher) - : field_(field), matcher_(matcher), whose_field_("whose given field ") {} - - FieldMatcher(const std::string& field_name, FieldType Class::*field, - const Matcher<const FieldType&>& matcher) - : field_(field), - matcher_(matcher), - whose_field_("whose field `" + field_name + "` ") {} - - void DescribeTo(::std::ostream* os) const { - *os << "is an object " << whose_field_; - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const { - *os << "is an object " << whose_field_; - matcher_.DescribeNegationTo(os); - } - - template <typename T> - bool MatchAndExplain(const T& value, MatchResultListener* listener) const { - // FIXME: The dispatch on std::is_pointer was introduced as a workaround for - // a compiler bug, and can now be removed. 
- return MatchAndExplainImpl( - typename std::is_pointer<typename std::remove_const<T>::type>::type(), - value, listener); - } - - private: - bool MatchAndExplainImpl(std::false_type /* is_not_pointer */, - const Class& obj, - MatchResultListener* listener) const { - *listener << whose_field_ << "is "; - return MatchPrintAndExplain(obj.*field_, matcher_, listener); - } - - bool MatchAndExplainImpl(std::true_type /* is_pointer */, const Class* p, - MatchResultListener* listener) const { - if (p == nullptr) return false; - - *listener << "which points to an object "; - // Since *p has a field, it must be a class/struct/union type and - // thus cannot be a pointer. Therefore we pass false_type() as - // the first argument. - return MatchAndExplainImpl(std::false_type(), *p, listener); - } - - const FieldType Class::*field_; - const Matcher<const FieldType&> matcher_; - - // Contains either "whose given field " if the name of the field is unknown - // or "whose field `name_of_field` " if the name is known. - const std::string whose_field_; -}; - -// Implements the Property() matcher for matching a property -// (i.e. return value of a getter method) of an object. -// -// Property is a const-qualified member function of Class returning -// PropertyType. -template <typename Class, typename PropertyType, typename Property> -class PropertyMatcher { - public: - typedef const PropertyType& RefToConstProperty; - - PropertyMatcher(Property property, const Matcher<RefToConstProperty>& matcher) - : property_(property), - matcher_(matcher), - whose_property_("whose given property ") {} - - PropertyMatcher(const std::string& property_name, Property property, - const Matcher<RefToConstProperty>& matcher) - : property_(property), - matcher_(matcher), - whose_property_("whose property `" + property_name + "` ") {} - - void DescribeTo(::std::ostream* os) const { - *os << "is an object " << whose_property_; - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const { - *os << "is an object " << whose_property_; - matcher_.DescribeNegationTo(os); - } - - template <typename T> - bool MatchAndExplain(const T& value, MatchResultListener* listener) const { - return MatchAndExplainImpl( - typename std::is_pointer<typename std::remove_const<T>::type>::type(), - value, listener); - } - - private: - bool MatchAndExplainImpl(std::false_type /* is_not_pointer */, - const Class& obj, - MatchResultListener* listener) const { - *listener << whose_property_ << "is "; - // Cannot pass the return value (for example, int) to MatchPrintAndExplain, - // which takes a non-const reference as argument. - RefToConstProperty result = (obj.*property_)(); - return MatchPrintAndExplain(result, matcher_, listener); - } - - bool MatchAndExplainImpl(std::true_type /* is_pointer */, const Class* p, - MatchResultListener* listener) const { - if (p == nullptr) return false; - - *listener << "which points to an object "; - // Since *p has a property method, it must be a class/struct/union - // type and thus cannot be a pointer. Therefore we pass - // false_type() as the first argument. - return MatchAndExplainImpl(std::false_type(), *p, listener); - } - - Property property_; - const Matcher<RefToConstProperty> matcher_; - - // Contains either "whose given property " if the name of the property is - // unknown or "whose property `name_of_property` " if the name is known. - const std::string whose_property_; -}; - -// Type traits specifying various features of different functors for ResultOf. 
-// The default template specifies features for functor objects. -template <typename Functor> -struct CallableTraits { - typedef Functor StorageType; - - static void CheckIsValid(Functor /* functor */) {} - - template <typename T> - static auto Invoke(Functor f, const T& arg) -> decltype(f(arg)) { - return f(arg); - } -}; - -// Specialization for function pointers. -template <typename ArgType, typename ResType> -struct CallableTraits<ResType (*)(ArgType)> { - typedef ResType ResultType; - typedef ResType (*StorageType)(ArgType); - - static void CheckIsValid(ResType (*f)(ArgType)) { - GTEST_CHECK_(f != nullptr) - << "NULL function pointer is passed into ResultOf()."; - } - template <typename T> - static ResType Invoke(ResType (*f)(ArgType), T arg) { - return (*f)(arg); - } -}; - -// Implements the ResultOf() matcher for matching a return value of a -// unary function of an object. -template <typename Callable, typename InnerMatcher> -class ResultOfMatcher { - public: - ResultOfMatcher(Callable callable, InnerMatcher matcher) - : ResultOfMatcher(/*result_description=*/"", std::move(callable), - std::move(matcher)) {} - - ResultOfMatcher(const std::string& result_description, Callable callable, - InnerMatcher matcher) - : result_description_(result_description), - callable_(std::move(callable)), - matcher_(std::move(matcher)) { - CallableTraits<Callable>::CheckIsValid(callable_); - } - - template <typename T> - operator Matcher<T>() const { - return Matcher<T>( - new Impl<const T&>(result_description_, callable_, matcher_)); - } - - private: - typedef typename CallableTraits<Callable>::StorageType CallableStorageType; - - template <typename T> - class Impl : public MatcherInterface<T> { - using ResultType = decltype(CallableTraits<Callable>::template Invoke<T>( - std::declval<CallableStorageType>(), std::declval<T>())); - - public: - template <typename M> - Impl(const std::string& result_description, - const CallableStorageType& callable, const M& matcher) - : result_description_(result_description), - callable_(callable), - matcher_(MatcherCast<ResultType>(matcher)) {} - - void DescribeTo(::std::ostream* os) const override { - if (result_description_.empty()) { - *os << "is mapped by the given callable to a value that "; - } else { - *os << "whose " << result_description_ << " "; - } - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - if (result_description_.empty()) { - *os << "is mapped by the given callable to a value that "; - } else { - *os << "whose " << result_description_ << " "; - } - matcher_.DescribeNegationTo(os); - } - - bool MatchAndExplain(T obj, MatchResultListener* listener) const override { - if (result_description_.empty()) { - *listener << "which is mapped by the given callable to "; - } else { - *listener << "whose " << result_description_ << " is "; - } - // Cannot pass the return value directly to MatchPrintAndExplain, which - // takes a non-const reference as argument. - // Also, specifying template argument explicitly is needed because T could - // be a non-const reference (e.g. Matcher<Uncopyable&>). - ResultType result = - CallableTraits<Callable>::template Invoke<T>(callable_, obj); - return MatchPrintAndExplain(result, matcher_, listener); - } - - private: - const std::string result_description_; - // Functors often define operator() as non-const method even though - // they are actually stateless. But we need to use them even when - // 'this' is a const pointer. 
It's the user's responsibility not to - // use stateful callables with ResultOf(), which doesn't guarantee - // how many times the callable will be invoked. - mutable CallableStorageType callable_; - const Matcher<ResultType> matcher_; - }; // class Impl - - const std::string result_description_; - const CallableStorageType callable_; - const InnerMatcher matcher_; -}; - -// Implements a matcher that checks the size of an STL-style container. -template <typename SizeMatcher> -class SizeIsMatcher { - public: - explicit SizeIsMatcher(const SizeMatcher& size_matcher) - : size_matcher_(size_matcher) {} - - template <typename Container> - operator Matcher<Container>() const { - return Matcher<Container>(new Impl<const Container&>(size_matcher_)); - } - - template <typename Container> - class Impl : public MatcherInterface<Container> { - public: - using SizeType = decltype(std::declval<Container>().size()); - explicit Impl(const SizeMatcher& size_matcher) - : size_matcher_(MatcherCast<SizeType>(size_matcher)) {} - - void DescribeTo(::std::ostream* os) const override { - *os << "size "; - size_matcher_.DescribeTo(os); - } - void DescribeNegationTo(::std::ostream* os) const override { - *os << "size "; - size_matcher_.DescribeNegationTo(os); - } - - bool MatchAndExplain(Container container, - MatchResultListener* listener) const override { - SizeType size = container.size(); - StringMatchResultListener size_listener; - const bool result = size_matcher_.MatchAndExplain(size, &size_listener); - *listener << "whose size " << size - << (result ? " matches" : " doesn't match"); - PrintIfNotEmpty(size_listener.str(), listener->stream()); - return result; - } - - private: - const Matcher<SizeType> size_matcher_; - }; - - private: - const SizeMatcher size_matcher_; -}; - -// Implements a matcher that checks the begin()..end() distance of an STL-style -// container. -template <typename DistanceMatcher> -class BeginEndDistanceIsMatcher { - public: - explicit BeginEndDistanceIsMatcher(const DistanceMatcher& distance_matcher) - : distance_matcher_(distance_matcher) {} - - template <typename Container> - operator Matcher<Container>() const { - return Matcher<Container>(new Impl<const Container&>(distance_matcher_)); - } - - template <typename Container> - class Impl : public MatcherInterface<Container> { - public: - typedef internal::StlContainerView<GTEST_REMOVE_REFERENCE_AND_CONST_( - Container)> - ContainerView; - typedef typename std::iterator_traits< - typename ContainerView::type::const_iterator>::difference_type - DistanceType; - explicit Impl(const DistanceMatcher& distance_matcher) - : distance_matcher_(MatcherCast<DistanceType>(distance_matcher)) {} - - void DescribeTo(::std::ostream* os) const override { - *os << "distance between begin() and end() "; - distance_matcher_.DescribeTo(os); - } - void DescribeNegationTo(::std::ostream* os) const override { - *os << "distance between begin() and end() "; - distance_matcher_.DescribeNegationTo(os); - } - - bool MatchAndExplain(Container container, - MatchResultListener* listener) const override { - using std::begin; - using std::end; - DistanceType distance = std::distance(begin(container), end(container)); - StringMatchResultListener distance_listener; - const bool result = - distance_matcher_.MatchAndExplain(distance, &distance_listener); - *listener << "whose distance between begin() and end() " << distance - << (result ? 
" matches" : " doesn't match"); - PrintIfNotEmpty(distance_listener.str(), listener->stream()); - return result; - } - - private: - const Matcher<DistanceType> distance_matcher_; - }; - - private: - const DistanceMatcher distance_matcher_; -}; - -// Implements an equality matcher for any STL-style container whose elements -// support ==. This matcher is like Eq(), but its failure explanations provide -// more detailed information that is useful when the container is used as a set. -// The failure message reports elements that are in one of the operands but not -// the other. The failure messages do not report duplicate or out-of-order -// elements in the containers (which don't properly matter to sets, but can -// occur if the containers are vectors or lists, for example). -// -// Uses the container's const_iterator, value_type, operator ==, -// begin(), and end(). -template <typename Container> -class ContainerEqMatcher { - public: - typedef internal::StlContainerView<Container> View; - typedef typename View::type StlContainer; - typedef typename View::const_reference StlContainerReference; - - static_assert(!std::is_const<Container>::value, - "Container type must not be const"); - static_assert(!std::is_reference<Container>::value, - "Container type must not be a reference"); - - // We make a copy of expected in case the elements in it are modified - // after this matcher is created. - explicit ContainerEqMatcher(const Container& expected) - : expected_(View::Copy(expected)) {} - - void DescribeTo(::std::ostream* os) const { - *os << "equals "; - UniversalPrint(expected_, os); - } - void DescribeNegationTo(::std::ostream* os) const { - *os << "does not equal "; - UniversalPrint(expected_, os); - } - - template <typename LhsContainer> - bool MatchAndExplain(const LhsContainer& lhs, - MatchResultListener* listener) const { - typedef internal::StlContainerView< - typename std::remove_const<LhsContainer>::type> - LhsView; - StlContainerReference lhs_stl_container = LhsView::ConstReference(lhs); - if (lhs_stl_container == expected_) return true; - - ::std::ostream* const os = listener->stream(); - if (os != nullptr) { - // Something is different. Check for extra values first. - bool printed_header = false; - for (auto it = lhs_stl_container.begin(); it != lhs_stl_container.end(); - ++it) { - if (internal::ArrayAwareFind(expected_.begin(), expected_.end(), *it) == - expected_.end()) { - if (printed_header) { - *os << ", "; - } else { - *os << "which has these unexpected elements: "; - printed_header = true; - } - UniversalPrint(*it, os); - } - } - - // Now check for missing values. - bool printed_header2 = false; - for (auto it = expected_.begin(); it != expected_.end(); ++it) { - if (internal::ArrayAwareFind(lhs_stl_container.begin(), - lhs_stl_container.end(), - *it) == lhs_stl_container.end()) { - if (printed_header2) { - *os << ", "; - } else { - *os << (printed_header ? ",\nand" : "which") - << " doesn't have these expected elements: "; - printed_header2 = true; - } - UniversalPrint(*it, os); - } - } - } - - return false; - } - - private: - const StlContainer expected_; -}; - -// A comparator functor that uses the < operator to compare two values. -struct LessComparator { - template <typename T, typename U> - bool operator()(const T& lhs, const U& rhs) const { - return lhs < rhs; - } -}; - -// Implements WhenSortedBy(comparator, container_matcher). 
-template <typename Comparator, typename ContainerMatcher> -class WhenSortedByMatcher { - public: - WhenSortedByMatcher(const Comparator& comparator, - const ContainerMatcher& matcher) - : comparator_(comparator), matcher_(matcher) {} - - template <typename LhsContainer> - operator Matcher<LhsContainer>() const { - return MakeMatcher(new Impl<LhsContainer>(comparator_, matcher_)); - } - - template <typename LhsContainer> - class Impl : public MatcherInterface<LhsContainer> { - public: - typedef internal::StlContainerView<GTEST_REMOVE_REFERENCE_AND_CONST_( - LhsContainer)> - LhsView; - typedef typename LhsView::type LhsStlContainer; - typedef typename LhsView::const_reference LhsStlContainerReference; - // Transforms std::pair<const Key, Value> into std::pair<Key, Value> - // so that we can match associative containers. - typedef - typename RemoveConstFromKey<typename LhsStlContainer::value_type>::type - LhsValue; - - Impl(const Comparator& comparator, const ContainerMatcher& matcher) - : comparator_(comparator), matcher_(matcher) {} - - void DescribeTo(::std::ostream* os) const override { - *os << "(when sorted) "; - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - *os << "(when sorted) "; - matcher_.DescribeNegationTo(os); - } - - bool MatchAndExplain(LhsContainer lhs, - MatchResultListener* listener) const override { - LhsStlContainerReference lhs_stl_container = LhsView::ConstReference(lhs); - ::std::vector<LhsValue> sorted_container(lhs_stl_container.begin(), - lhs_stl_container.end()); - ::std::sort(sorted_container.begin(), sorted_container.end(), - comparator_); - - if (!listener->IsInterested()) { - // If the listener is not interested, we do not need to - // construct the inner explanation. - return matcher_.Matches(sorted_container); - } - - *listener << "which is "; - UniversalPrint(sorted_container, listener->stream()); - *listener << " when sorted"; - - StringMatchResultListener inner_listener; - const bool match = - matcher_.MatchAndExplain(sorted_container, &inner_listener); - PrintIfNotEmpty(inner_listener.str(), listener->stream()); - return match; - } - - private: - const Comparator comparator_; - const Matcher<const ::std::vector<LhsValue>&> matcher_; - - Impl(const Impl&) = delete; - Impl& operator=(const Impl&) = delete; - }; - - private: - const Comparator comparator_; - const ContainerMatcher matcher_; -}; - -// Implements Pointwise(tuple_matcher, rhs_container). tuple_matcher -// must be able to be safely cast to Matcher<std::tuple<const T1&, const -// T2&> >, where T1 and T2 are the types of elements in the LHS -// container and the RHS container respectively. -template <typename TupleMatcher, typename RhsContainer> -class PointwiseMatcher { - static_assert( - !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(RhsContainer)>::value, - "use UnorderedPointwise with hash tables"); - - public: - typedef internal::StlContainerView<RhsContainer> RhsView; - typedef typename RhsView::type RhsStlContainer; - typedef typename RhsStlContainer::value_type RhsValue; - - static_assert(!std::is_const<RhsContainer>::value, - "RhsContainer type must not be const"); - static_assert(!std::is_reference<RhsContainer>::value, - "RhsContainer type must not be a reference"); - - // Like ContainerEq, we make a copy of rhs in case the elements in - // it are modified after this matcher is created. 
-  PointwiseMatcher(const TupleMatcher& tuple_matcher, const RhsContainer& rhs)
-      : tuple_matcher_(tuple_matcher), rhs_(RhsView::Copy(rhs)) {}
-
-  template <typename LhsContainer>
-  operator Matcher<LhsContainer>() const {
-    static_assert(
-        !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)>::value,
-        "use UnorderedPointwise with hash tables");
-
-    return Matcher<LhsContainer>(
-        new Impl<const LhsContainer&>(tuple_matcher_, rhs_));
-  }
-
-  template <typename LhsContainer>
-  class Impl : public MatcherInterface<LhsContainer> {
-   public:
-    typedef internal::StlContainerView<GTEST_REMOVE_REFERENCE_AND_CONST_(
-        LhsContainer)>
-        LhsView;
-    typedef typename LhsView::type LhsStlContainer;
-    typedef typename LhsView::const_reference LhsStlContainerReference;
-    typedef typename LhsStlContainer::value_type LhsValue;
-    // We pass the LHS value and the RHS value to the inner matcher by
-    // reference, as they may be expensive to copy. We must use tuple
-    // instead of pair here, as a pair cannot hold references (C++ 98,
-    // 20.2.2 [lib.pairs]).
-    typedef ::std::tuple<const LhsValue&, const RhsValue&> InnerMatcherArg;
-
-    Impl(const TupleMatcher& tuple_matcher, const RhsStlContainer& rhs)
-        // mono_tuple_matcher_ holds a monomorphic version of the tuple
-        // matcher.
-        : mono_tuple_matcher_(SafeMatcherCast<InnerMatcherArg>(tuple_matcher)),
-          rhs_(rhs) {}
-
-    void DescribeTo(::std::ostream* os) const override {
-      *os << "contains " << rhs_.size()
-          << " values, where each value and its corresponding value in ";
-      UniversalPrinter<RhsStlContainer>::Print(rhs_, os);
-      *os << " ";
-      mono_tuple_matcher_.DescribeTo(os);
-    }
-    void DescribeNegationTo(::std::ostream* os) const override {
-      *os << "doesn't contain exactly " << rhs_.size()
-          << " values, or contains a value x at some index i"
-          << " where x and the i-th value of ";
-      UniversalPrint(rhs_, os);
-      *os << " ";
-      mono_tuple_matcher_.DescribeNegationTo(os);
-    }
-
-    bool MatchAndExplain(LhsContainer lhs,
-                         MatchResultListener* listener) const override {
-      LhsStlContainerReference lhs_stl_container = LhsView::ConstReference(lhs);
-      const size_t actual_size = lhs_stl_container.size();
-      if (actual_size != rhs_.size()) {
-        *listener << "which contains " << actual_size << " values";
-        return false;
-      }
-
-      auto left = lhs_stl_container.begin();
-      auto right = rhs_.begin();
-      for (size_t i = 0; i != actual_size; ++i, ++left, ++right) {
-        if (listener->IsInterested()) {
-          StringMatchResultListener inner_listener;
-          // Create InnerMatcherArg as a temporary object so that it does not
-          // outlive *left and *right. Dereferencing, or the conversion to
-          // `const T&`, may return temporary objects, e.g. for vector<bool>.
- if (!mono_tuple_matcher_.MatchAndExplain( - InnerMatcherArg(ImplicitCast_<const LhsValue&>(*left), - ImplicitCast_<const RhsValue&>(*right)), - &inner_listener)) { - *listener << "where the value pair ("; - UniversalPrint(*left, listener->stream()); - *listener << ", "; - UniversalPrint(*right, listener->stream()); - *listener << ") at index #" << i << " don't match"; - PrintIfNotEmpty(inner_listener.str(), listener->stream()); - return false; - } - } else { - if (!mono_tuple_matcher_.Matches( - InnerMatcherArg(ImplicitCast_<const LhsValue&>(*left), - ImplicitCast_<const RhsValue&>(*right)))) - return false; - } - } - - return true; - } - - private: - const Matcher<InnerMatcherArg> mono_tuple_matcher_; - const RhsStlContainer rhs_; - }; - - private: - const TupleMatcher tuple_matcher_; - const RhsStlContainer rhs_; -}; - -// Holds the logic common to ContainsMatcherImpl and EachMatcherImpl. -template <typename Container> -class QuantifierMatcherImpl : public MatcherInterface<Container> { - public: - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; - typedef StlContainerView<RawContainer> View; - typedef typename View::type StlContainer; - typedef typename View::const_reference StlContainerReference; - typedef typename StlContainer::value_type Element; - - template <typename InnerMatcher> - explicit QuantifierMatcherImpl(InnerMatcher inner_matcher) - : inner_matcher_( - testing::SafeMatcherCast<const Element&>(inner_matcher)) {} - - // Checks whether: - // * All elements in the container match, if all_elements_should_match. - // * Any element in the container matches, if !all_elements_should_match. - bool MatchAndExplainImpl(bool all_elements_should_match, Container container, - MatchResultListener* listener) const { - StlContainerReference stl_container = View::ConstReference(container); - size_t i = 0; - for (auto it = stl_container.begin(); it != stl_container.end(); - ++it, ++i) { - StringMatchResultListener inner_listener; - const bool matches = inner_matcher_.MatchAndExplain(*it, &inner_listener); - - if (matches != all_elements_should_match) { - *listener << "whose element #" << i - << (matches ? 
" matches" : " doesn't match"); - PrintIfNotEmpty(inner_listener.str(), listener->stream()); - return !all_elements_should_match; - } - } - return all_elements_should_match; - } - - bool MatchAndExplainImpl(const Matcher<size_t>& count_matcher, - Container container, - MatchResultListener* listener) const { - StlContainerReference stl_container = View::ConstReference(container); - size_t i = 0; - std::vector<size_t> match_elements; - for (auto it = stl_container.begin(); it != stl_container.end(); - ++it, ++i) { - StringMatchResultListener inner_listener; - const bool matches = inner_matcher_.MatchAndExplain(*it, &inner_listener); - if (matches) { - match_elements.push_back(i); - } - } - if (listener->IsInterested()) { - if (match_elements.empty()) { - *listener << "has no element that matches"; - } else if (match_elements.size() == 1) { - *listener << "whose element #" << match_elements[0] << " matches"; - } else { - *listener << "whose elements ("; - std::string sep = ""; - for (size_t e : match_elements) { - *listener << sep << e; - sep = ", "; - } - *listener << ") match"; - } - } - StringMatchResultListener count_listener; - if (count_matcher.MatchAndExplain(match_elements.size(), &count_listener)) { - *listener << " and whose match quantity of " << match_elements.size() - << " matches"; - PrintIfNotEmpty(count_listener.str(), listener->stream()); - return true; - } else { - if (match_elements.empty()) { - *listener << " and"; - } else { - *listener << " but"; - } - *listener << " whose match quantity of " << match_elements.size() - << " does not match"; - PrintIfNotEmpty(count_listener.str(), listener->stream()); - return false; - } - } - - protected: - const Matcher<const Element&> inner_matcher_; -}; - -// Implements Contains(element_matcher) for the given argument type Container. -// Symmetric to EachMatcherImpl. -template <typename Container> -class ContainsMatcherImpl : public QuantifierMatcherImpl<Container> { - public: - template <typename InnerMatcher> - explicit ContainsMatcherImpl(InnerMatcher inner_matcher) - : QuantifierMatcherImpl<Container>(inner_matcher) {} - - // Describes what this matcher does. - void DescribeTo(::std::ostream* os) const override { - *os << "contains at least one element that "; - this->inner_matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - *os << "doesn't contain any element that "; - this->inner_matcher_.DescribeTo(os); - } - - bool MatchAndExplain(Container container, - MatchResultListener* listener) const override { - return this->MatchAndExplainImpl(false, container, listener); - } -}; - -// Implements Each(element_matcher) for the given argument type Container. -// Symmetric to ContainsMatcherImpl. -template <typename Container> -class EachMatcherImpl : public QuantifierMatcherImpl<Container> { - public: - template <typename InnerMatcher> - explicit EachMatcherImpl(InnerMatcher inner_matcher) - : QuantifierMatcherImpl<Container>(inner_matcher) {} - - // Describes what this matcher does. 
- void DescribeTo(::std::ostream* os) const override { - *os << "only contains elements that "; - this->inner_matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - *os << "contains some element that "; - this->inner_matcher_.DescribeNegationTo(os); - } - - bool MatchAndExplain(Container container, - MatchResultListener* listener) const override { - return this->MatchAndExplainImpl(true, container, listener); - } -}; - -// Implements Contains(element_matcher).Times(n) for the given argument type -// Container. -template <typename Container> -class ContainsTimesMatcherImpl : public QuantifierMatcherImpl<Container> { - public: - template <typename InnerMatcher> - explicit ContainsTimesMatcherImpl(InnerMatcher inner_matcher, - Matcher<size_t> count_matcher) - : QuantifierMatcherImpl<Container>(inner_matcher), - count_matcher_(std::move(count_matcher)) {} - - void DescribeTo(::std::ostream* os) const override { - *os << "quantity of elements that match "; - this->inner_matcher_.DescribeTo(os); - *os << " "; - count_matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - *os << "quantity of elements that match "; - this->inner_matcher_.DescribeTo(os); - *os << " "; - count_matcher_.DescribeNegationTo(os); - } - - bool MatchAndExplain(Container container, - MatchResultListener* listener) const override { - return this->MatchAndExplainImpl(count_matcher_, container, listener); - } - - private: - const Matcher<size_t> count_matcher_; -}; - -// Implements polymorphic Contains(element_matcher).Times(n). -template <typename M> -class ContainsTimesMatcher { - public: - explicit ContainsTimesMatcher(M m, Matcher<size_t> count_matcher) - : inner_matcher_(m), count_matcher_(std::move(count_matcher)) {} - - template <typename Container> - operator Matcher<Container>() const { // NOLINT - return Matcher<Container>(new ContainsTimesMatcherImpl<const Container&>( - inner_matcher_, count_matcher_)); - } - - private: - const M inner_matcher_; - const Matcher<size_t> count_matcher_; -}; - -// Implements polymorphic Contains(element_matcher). -template <typename M> -class ContainsMatcher { - public: - explicit ContainsMatcher(M m) : inner_matcher_(m) {} - - template <typename Container> - operator Matcher<Container>() const { // NOLINT - return Matcher<Container>( - new ContainsMatcherImpl<const Container&>(inner_matcher_)); - } - - ContainsTimesMatcher<M> Times(Matcher<size_t> count_matcher) const { - return ContainsTimesMatcher<M>(inner_matcher_, std::move(count_matcher)); - } - - private: - const M inner_matcher_; -}; - -// Implements polymorphic Each(element_matcher). 
-template <typename M> -class EachMatcher { - public: - explicit EachMatcher(M m) : inner_matcher_(m) {} - - template <typename Container> - operator Matcher<Container>() const { // NOLINT - return Matcher<Container>( - new EachMatcherImpl<const Container&>(inner_matcher_)); - } - - private: - const M inner_matcher_; -}; - -struct Rank1 {}; -struct Rank0 : Rank1 {}; - -namespace pair_getters { -using std::get; -template <typename T> -auto First(T& x, Rank1) -> decltype(get<0>(x)) { // NOLINT - return get<0>(x); -} -template <typename T> -auto First(T& x, Rank0) -> decltype((x.first)) { // NOLINT - return x.first; -} - -template <typename T> -auto Second(T& x, Rank1) -> decltype(get<1>(x)) { // NOLINT - return get<1>(x); -} -template <typename T> -auto Second(T& x, Rank0) -> decltype((x.second)) { // NOLINT - return x.second; -} -} // namespace pair_getters - -// Implements Key(inner_matcher) for the given argument pair type. -// Key(inner_matcher) matches an std::pair whose 'first' field matches -// inner_matcher. For example, Contains(Key(Ge(5))) can be used to match an -// std::map that contains at least one element whose key is >= 5. -template <typename PairType> -class KeyMatcherImpl : public MatcherInterface<PairType> { - public: - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(PairType) RawPairType; - typedef typename RawPairType::first_type KeyType; - - template <typename InnerMatcher> - explicit KeyMatcherImpl(InnerMatcher inner_matcher) - : inner_matcher_( - testing::SafeMatcherCast<const KeyType&>(inner_matcher)) {} - - // Returns true if and only if 'key_value.first' (the key) matches the inner - // matcher. - bool MatchAndExplain(PairType key_value, - MatchResultListener* listener) const override { - StringMatchResultListener inner_listener; - const bool match = inner_matcher_.MatchAndExplain( - pair_getters::First(key_value, Rank0()), &inner_listener); - const std::string explanation = inner_listener.str(); - if (explanation != "") { - *listener << "whose first field is a value " << explanation; - } - return match; - } - - // Describes what this matcher does. - void DescribeTo(::std::ostream* os) const override { - *os << "has a key that "; - inner_matcher_.DescribeTo(os); - } - - // Describes what the negation of this matcher does. - void DescribeNegationTo(::std::ostream* os) const override { - *os << "doesn't have a key that "; - inner_matcher_.DescribeTo(os); - } - - private: - const Matcher<const KeyType&> inner_matcher_; -}; - -// Implements polymorphic Key(matcher_for_key). -template <typename M> -class KeyMatcher { - public: - explicit KeyMatcher(M m) : matcher_for_key_(m) {} - - template <typename PairType> - operator Matcher<PairType>() const { - return Matcher<PairType>( - new KeyMatcherImpl<const PairType&>(matcher_for_key_)); - } - - private: - const M matcher_for_key_; -}; - -// Implements polymorphic Address(matcher_for_address). -template <typename InnerMatcher> -class AddressMatcher { - public: - explicit AddressMatcher(InnerMatcher m) : matcher_(m) {} - - template <typename Type> - operator Matcher<Type>() const { // NOLINT - return Matcher<Type>(new Impl<const Type&>(matcher_)); - } - - private: - // The monomorphic implementation that works for a particular object type. 
- template <typename Type> - class Impl : public MatcherInterface<Type> { - public: - using Address = const GTEST_REMOVE_REFERENCE_AND_CONST_(Type) *; - explicit Impl(const InnerMatcher& matcher) - : matcher_(MatcherCast<Address>(matcher)) {} - - void DescribeTo(::std::ostream* os) const override { - *os << "has address that "; - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - *os << "does not have address that "; - matcher_.DescribeTo(os); - } - - bool MatchAndExplain(Type object, - MatchResultListener* listener) const override { - *listener << "which has address "; - Address address = std::addressof(object); - return MatchPrintAndExplain(address, matcher_, listener); - } - - private: - const Matcher<Address> matcher_; - }; - const InnerMatcher matcher_; -}; - -// Implements Pair(first_matcher, second_matcher) for the given argument pair -// type with its two matchers. See Pair() function below. -template <typename PairType> -class PairMatcherImpl : public MatcherInterface<PairType> { - public: - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(PairType) RawPairType; - typedef typename RawPairType::first_type FirstType; - typedef typename RawPairType::second_type SecondType; - - template <typename FirstMatcher, typename SecondMatcher> - PairMatcherImpl(FirstMatcher first_matcher, SecondMatcher second_matcher) - : first_matcher_( - testing::SafeMatcherCast<const FirstType&>(first_matcher)), - second_matcher_( - testing::SafeMatcherCast<const SecondType&>(second_matcher)) {} - - // Describes what this matcher does. - void DescribeTo(::std::ostream* os) const override { - *os << "has a first field that "; - first_matcher_.DescribeTo(os); - *os << ", and has a second field that "; - second_matcher_.DescribeTo(os); - } - - // Describes what the negation of this matcher does. - void DescribeNegationTo(::std::ostream* os) const override { - *os << "has a first field that "; - first_matcher_.DescribeNegationTo(os); - *os << ", or has a second field that "; - second_matcher_.DescribeNegationTo(os); - } - - // Returns true if and only if 'a_pair.first' matches first_matcher and - // 'a_pair.second' matches second_matcher. - bool MatchAndExplain(PairType a_pair, - MatchResultListener* listener) const override { - if (!listener->IsInterested()) { - // If the listener is not interested, we don't need to construct the - // explanation. 
- return first_matcher_.Matches(pair_getters::First(a_pair, Rank0())) && - second_matcher_.Matches(pair_getters::Second(a_pair, Rank0())); - } - StringMatchResultListener first_inner_listener; - if (!first_matcher_.MatchAndExplain(pair_getters::First(a_pair, Rank0()), - &first_inner_listener)) { - *listener << "whose first field does not match"; - PrintIfNotEmpty(first_inner_listener.str(), listener->stream()); - return false; - } - StringMatchResultListener second_inner_listener; - if (!second_matcher_.MatchAndExplain(pair_getters::Second(a_pair, Rank0()), - &second_inner_listener)) { - *listener << "whose second field does not match"; - PrintIfNotEmpty(second_inner_listener.str(), listener->stream()); - return false; - } - ExplainSuccess(first_inner_listener.str(), second_inner_listener.str(), - listener); - return true; - } - - private: - void ExplainSuccess(const std::string& first_explanation, - const std::string& second_explanation, - MatchResultListener* listener) const { - *listener << "whose both fields match"; - if (first_explanation != "") { - *listener << ", where the first field is a value " << first_explanation; - } - if (second_explanation != "") { - *listener << ", "; - if (first_explanation != "") { - *listener << "and "; - } else { - *listener << "where "; - } - *listener << "the second field is a value " << second_explanation; - } - } - - const Matcher<const FirstType&> first_matcher_; - const Matcher<const SecondType&> second_matcher_; -}; - -// Implements polymorphic Pair(first_matcher, second_matcher). -template <typename FirstMatcher, typename SecondMatcher> -class PairMatcher { - public: - PairMatcher(FirstMatcher first_matcher, SecondMatcher second_matcher) - : first_matcher_(first_matcher), second_matcher_(second_matcher) {} - - template <typename PairType> - operator Matcher<PairType>() const { - return Matcher<PairType>( - new PairMatcherImpl<const PairType&>(first_matcher_, second_matcher_)); - } - - private: - const FirstMatcher first_matcher_; - const SecondMatcher second_matcher_; -}; - -template <typename T, size_t... 
I> -auto UnpackStructImpl(const T& t, IndexSequence<I...>, int) - -> decltype(std::tie(get<I>(t)...)) { - static_assert(std::tuple_size<T>::value == sizeof...(I), - "Number of arguments doesn't match the number of fields."); - return std::tie(get<I>(t)...); -} - -#if defined(__cpp_structured_bindings) && __cpp_structured_bindings >= 201606 -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<1>, char) { - const auto& [a] = t; - return std::tie(a); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<2>, char) { - const auto& [a, b] = t; - return std::tie(a, b); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<3>, char) { - const auto& [a, b, c] = t; - return std::tie(a, b, c); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<4>, char) { - const auto& [a, b, c, d] = t; - return std::tie(a, b, c, d); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<5>, char) { - const auto& [a, b, c, d, e] = t; - return std::tie(a, b, c, d, e); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<6>, char) { - const auto& [a, b, c, d, e, f] = t; - return std::tie(a, b, c, d, e, f); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<7>, char) { - const auto& [a, b, c, d, e, f, g] = t; - return std::tie(a, b, c, d, e, f, g); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<8>, char) { - const auto& [a, b, c, d, e, f, g, h] = t; - return std::tie(a, b, c, d, e, f, g, h); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<9>, char) { - const auto& [a, b, c, d, e, f, g, h, i] = t; - return std::tie(a, b, c, d, e, f, g, h, i); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<10>, char) { - const auto& [a, b, c, d, e, f, g, h, i, j] = t; - return std::tie(a, b, c, d, e, f, g, h, i, j); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<11>, char) { - const auto& [a, b, c, d, e, f, g, h, i, j, k] = t; - return std::tie(a, b, c, d, e, f, g, h, i, j, k); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<12>, char) { - const auto& [a, b, c, d, e, f, g, h, i, j, k, l] = t; - return std::tie(a, b, c, d, e, f, g, h, i, j, k, l); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<13>, char) { - const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m] = t; - return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<14>, char) { - const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m, n] = t; - return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m, n); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<15>, char) { - const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o] = t; - return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o); -} -template <typename T> -auto UnpackStructImpl(const T& t, MakeIndexSequence<16>, char) { - const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p] = t; - return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p); -} -#endif // defined(__cpp_structured_bindings) - -template <size_t I, typename T> -auto UnpackStruct(const T& t) - -> decltype((UnpackStructImpl)(t, MakeIndexSequence<I>{}, 0)) { - return (UnpackStructImpl)(t, MakeIndexSequence<I>{}, 0); -} - -// Helper function to do comma folding in C++11. 
-// The array ensures left-to-right order of evaluation. -// Usage: VariadicExpand({expr...}); -template <typename T, size_t N> -void VariadicExpand(const T (&)[N]) {} - -template <typename Struct, typename StructSize> -class FieldsAreMatcherImpl; - -template <typename Struct, size_t... I> -class FieldsAreMatcherImpl<Struct, IndexSequence<I...>> - : public MatcherInterface<Struct> { - using UnpackedType = - decltype(UnpackStruct<sizeof...(I)>(std::declval<const Struct&>())); - using MatchersType = std::tuple< - Matcher<const typename std::tuple_element<I, UnpackedType>::type&>...>; - - public: - template <typename Inner> - explicit FieldsAreMatcherImpl(const Inner& matchers) - : matchers_(testing::SafeMatcherCast< - const typename std::tuple_element<I, UnpackedType>::type&>( - std::get<I>(matchers))...) {} - - void DescribeTo(::std::ostream* os) const override { - const char* separator = ""; - VariadicExpand( - {(*os << separator << "has field #" << I << " that ", - std::get<I>(matchers_).DescribeTo(os), separator = ", and ")...}); - } - - void DescribeNegationTo(::std::ostream* os) const override { - const char* separator = ""; - VariadicExpand({(*os << separator << "has field #" << I << " that ", - std::get<I>(matchers_).DescribeNegationTo(os), - separator = ", or ")...}); - } - - bool MatchAndExplain(Struct t, MatchResultListener* listener) const override { - return MatchInternal((UnpackStruct<sizeof...(I)>)(t), listener); - } - - private: - bool MatchInternal(UnpackedType tuple, MatchResultListener* listener) const { - if (!listener->IsInterested()) { - // If the listener is not interested, we don't need to construct the - // explanation. - bool good = true; - VariadicExpand({good = good && std::get<I>(matchers_).Matches( - std::get<I>(tuple))...}); - return good; - } - - size_t failed_pos = ~size_t{}; - - std::vector<StringMatchResultListener> inner_listener(sizeof...(I)); - - VariadicExpand( - {failed_pos == ~size_t{} && !std::get<I>(matchers_).MatchAndExplain( - std::get<I>(tuple), &inner_listener[I]) - ? failed_pos = I - : 0 ...}); - if (failed_pos != ~size_t{}) { - *listener << "whose field #" << failed_pos << " does not match"; - PrintIfNotEmpty(inner_listener[failed_pos].str(), listener->stream()); - return false; - } - - *listener << "whose all elements match"; - const char* separator = ", where"; - for (size_t index = 0; index < sizeof...(I); ++index) { - const std::string str = inner_listener[index].str(); - if (!str.empty()) { - *listener << separator << " field #" << index << " is a value " << str; - separator = ", and"; - } - } - - return true; - } - - MatchersType matchers_; -}; - -template <typename... Inner> -class FieldsAreMatcher { - public: - explicit FieldsAreMatcher(Inner... inner) : matchers_(std::move(inner)...) {} - - template <typename Struct> - operator Matcher<Struct>() const { // NOLINT - return Matcher<Struct>( - new FieldsAreMatcherImpl<const Struct&, IndexSequenceFor<Inner...>>( - matchers_)); - } - - private: - std::tuple<Inner...> matchers_; -}; - -// Implements ElementsAre() and ElementsAreArray(). 
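For orientation, here is a minimal usage sketch of the FieldsAre machinery above and the ElementsAre implementation that follows. The test name, the Widget struct, and all values are illustrative only, and the aggregate case assumes C++17 structured bindings:

  #include <string>
  #include <vector>

  #include "gmock/gmock.h"
  #include "gtest/gtest.h"

  TEST(MatcherSketch, ElementsAreAndFieldsAre) {
    using ::testing::_;
    using ::testing::ElementsAre;
    using ::testing::FieldsAre;
    using ::testing::Gt;

    // ElementsAre: one matcher per element, in order; plain values mean Eq.
    std::vector<int> v = {1, 5, 3};
    EXPECT_THAT(v, ElementsAre(1, Gt(4), _));

    // FieldsAre: matches aggregate fields via structured bindings.
    struct Widget { int id; std::string name; };
    Widget w{7, "seven"};
    EXPECT_THAT(w, FieldsAre(Gt(0), "seven"));
  }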
-template <typename Container> -class ElementsAreMatcherImpl : public MatcherInterface<Container> { - public: - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; - typedef internal::StlContainerView<RawContainer> View; - typedef typename View::type StlContainer; - typedef typename View::const_reference StlContainerReference; - typedef typename StlContainer::value_type Element; - - // Constructs the matcher from a sequence of element values or - // element matchers. - template <typename InputIter> - ElementsAreMatcherImpl(InputIter first, InputIter last) { - while (first != last) { - matchers_.push_back(MatcherCast<const Element&>(*first++)); - } - } - - // Describes what this matcher does. - void DescribeTo(::std::ostream* os) const override { - if (count() == 0) { - *os << "is empty"; - } else if (count() == 1) { - *os << "has 1 element that "; - matchers_[0].DescribeTo(os); - } else { - *os << "has " << Elements(count()) << " where\n"; - for (size_t i = 0; i != count(); ++i) { - *os << "element #" << i << " "; - matchers_[i].DescribeTo(os); - if (i + 1 < count()) { - *os << ",\n"; - } - } - } - } - - // Describes what the negation of this matcher does. - void DescribeNegationTo(::std::ostream* os) const override { - if (count() == 0) { - *os << "isn't empty"; - return; - } - - *os << "doesn't have " << Elements(count()) << ", or\n"; - for (size_t i = 0; i != count(); ++i) { - *os << "element #" << i << " "; - matchers_[i].DescribeNegationTo(os); - if (i + 1 < count()) { - *os << ", or\n"; - } - } - } - - bool MatchAndExplain(Container container, - MatchResultListener* listener) const override { - // To work with stream-like "containers", we must only walk - // through the elements in one pass. - - const bool listener_interested = listener->IsInterested(); - - // explanations[i] is the explanation of the element at index i. - ::std::vector<std::string> explanations(count()); - StlContainerReference stl_container = View::ConstReference(container); - auto it = stl_container.begin(); - size_t exam_pos = 0; - bool mismatch_found = false; // Have we found a mismatched element yet? - - // Go through the elements and matchers in pairs, until we reach - // the end of either the elements or the matchers, or until we find a - // mismatch. - for (; it != stl_container.end() && exam_pos != count(); ++it, ++exam_pos) { - bool match; // Does the current element match the current matcher? - if (listener_interested) { - StringMatchResultListener s; - match = matchers_[exam_pos].MatchAndExplain(*it, &s); - explanations[exam_pos] = s.str(); - } else { - match = matchers_[exam_pos].Matches(*it); - } - - if (!match) { - mismatch_found = true; - break; - } - } - // If mismatch_found is true, 'exam_pos' is the index of the mismatch. - - // Find how many elements the actual container has. We avoid - // calling size() s.t. this code works for stream-like "containers" - // that don't define size(). - size_t actual_count = exam_pos; - for (; it != stl_container.end(); ++it) { - ++actual_count; - } - - if (actual_count != count()) { - // The element count doesn't match. If the container is empty, - // there's no need to explain anything as Google Mock already - // prints the empty container. Otherwise we just need to show - // how many elements there actually are. - if (listener_interested && (actual_count != 0)) { - *listener << "which has " << Elements(actual_count); - } - return false; - } - - if (mismatch_found) { - // The element count matches, but the exam_pos-th element doesn't match. 
- if (listener_interested) { - *listener << "whose element #" << exam_pos << " doesn't match"; - PrintIfNotEmpty(explanations[exam_pos], listener->stream()); - } - return false; - } - - // Every element matches its expectation. We need to explain why - // (the obvious ones can be skipped). - if (listener_interested) { - bool reason_printed = false; - for (size_t i = 0; i != count(); ++i) { - const std::string& s = explanations[i]; - if (!s.empty()) { - if (reason_printed) { - *listener << ",\nand "; - } - *listener << "whose element #" << i << " matches, " << s; - reason_printed = true; - } - } - } - return true; - } - - private: - static Message Elements(size_t count) { - return Message() << count << (count == 1 ? " element" : " elements"); - } - - size_t count() const { return matchers_.size(); } - - ::std::vector<Matcher<const Element&>> matchers_; -}; - -// Connectivity matrix of (elements X matchers), in element-major order. -// Initially, there are no edges. -// Use NextGraph() to iterate over all possible edge configurations. -// Use Randomize() to generate a random edge configuration. -class GTEST_API_ MatchMatrix { - public: - MatchMatrix(size_t num_elements, size_t num_matchers) - : num_elements_(num_elements), - num_matchers_(num_matchers), - matched_(num_elements_ * num_matchers_, 0) {} - - size_t LhsSize() const { return num_elements_; } - size_t RhsSize() const { return num_matchers_; } - bool HasEdge(size_t ilhs, size_t irhs) const { - return matched_[SpaceIndex(ilhs, irhs)] == 1; - } - void SetEdge(size_t ilhs, size_t irhs, bool b) { - matched_[SpaceIndex(ilhs, irhs)] = b ? 1 : 0; - } - - // Treating the connectivity matrix as a (LhsSize()*RhsSize())-bit number, - // adds 1 to that number; returns false if incrementing the graph left it - // empty. - bool NextGraph(); - - void Randomize(); - - std::string DebugString() const; - - private: - size_t SpaceIndex(size_t ilhs, size_t irhs) const { - return ilhs * num_matchers_ + irhs; - } - - size_t num_elements_; - size_t num_matchers_; - - // Each element is a char interpreted as bool. They are stored as a - // flattened array in lhs-major order, use 'SpaceIndex()' to translate - // a (ilhs, irhs) matrix coordinate into an offset. - ::std::vector<char> matched_; -}; - -typedef ::std::pair<size_t, size_t> ElementMatcherPair; -typedef ::std::vector<ElementMatcherPair> ElementMatcherPairs; - -// Returns a maximum bipartite matching for the specified graph 'g'. -// The matching is represented as a vector of {element, matcher} pairs. -GTEST_API_ ElementMatcherPairs FindMaxBipartiteMatching(const MatchMatrix& g); - -struct UnorderedMatcherRequire { - enum Flags { - Superset = 1 << 0, - Subset = 1 << 1, - ExactMatch = Superset | Subset, - }; -}; - -// Untyped base class for implementing UnorderedElementsAre. By -// putting logic that's not specific to the element type here, we -// reduce binary bloat and increase compilation speed. -class GTEST_API_ UnorderedElementsAreMatcherImplBase { - protected: - explicit UnorderedElementsAreMatcherImplBase( - UnorderedMatcherRequire::Flags matcher_flags) - : match_flags_(matcher_flags) {} - - // A vector of matcher describers, one for each element matcher. - // Does not own the describers (and thus can be used only when the - // element matchers are alive). - typedef ::std::vector<const MatcherDescriberInterface*> MatcherDescriberVec; - - // Describes this UnorderedElementsAre matcher. 
- void DescribeToImpl(::std::ostream* os) const; - - // Describes the negation of this UnorderedElementsAre matcher. - void DescribeNegationToImpl(::std::ostream* os) const; - - bool VerifyMatchMatrix(const ::std::vector<std::string>& element_printouts, - const MatchMatrix& matrix, - MatchResultListener* listener) const; - - bool FindPairing(const MatchMatrix& matrix, - MatchResultListener* listener) const; - - MatcherDescriberVec& matcher_describers() { return matcher_describers_; } - - static Message Elements(size_t n) { - return Message() << n << " element" << (n == 1 ? "" : "s"); - } - - UnorderedMatcherRequire::Flags match_flags() const { return match_flags_; } - - private: - UnorderedMatcherRequire::Flags match_flags_; - MatcherDescriberVec matcher_describers_; -}; - -// Implements UnorderedElementsAre, UnorderedElementsAreArray, IsSubsetOf, and -// IsSupersetOf. -template <typename Container> -class UnorderedElementsAreMatcherImpl - : public MatcherInterface<Container>, - public UnorderedElementsAreMatcherImplBase { - public: - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; - typedef internal::StlContainerView<RawContainer> View; - typedef typename View::type StlContainer; - typedef typename View::const_reference StlContainerReference; - typedef typename StlContainer::value_type Element; - - template <typename InputIter> - UnorderedElementsAreMatcherImpl(UnorderedMatcherRequire::Flags matcher_flags, - InputIter first, InputIter last) - : UnorderedElementsAreMatcherImplBase(matcher_flags) { - for (; first != last; ++first) { - matchers_.push_back(MatcherCast<const Element&>(*first)); - } - for (const auto& m : matchers_) { - matcher_describers().push_back(m.GetDescriber()); - } - } - - // Describes what this matcher does. - void DescribeTo(::std::ostream* os) const override { - return UnorderedElementsAreMatcherImplBase::DescribeToImpl(os); - } - - // Describes what the negation of this matcher does. - void DescribeNegationTo(::std::ostream* os) const override { - return UnorderedElementsAreMatcherImplBase::DescribeNegationToImpl(os); - } - - bool MatchAndExplain(Container container, - MatchResultListener* listener) const override { - StlContainerReference stl_container = View::ConstReference(container); - ::std::vector<std::string> element_printouts; - MatchMatrix matrix = - AnalyzeElements(stl_container.begin(), stl_container.end(), - &element_printouts, listener); - - if (matrix.LhsSize() == 0 && matrix.RhsSize() == 0) { - return true; - } - - if (match_flags() == UnorderedMatcherRequire::ExactMatch) { - if (matrix.LhsSize() != matrix.RhsSize()) { - // The element count doesn't match. If the container is empty, - // there's no need to explain anything as Google Mock already - // prints the empty container. Otherwise we just need to show - // how many elements there actually are. 
- if (matrix.LhsSize() != 0 && listener->IsInterested()) { - *listener << "which has " << Elements(matrix.LhsSize()); - } - return false; - } - } - - return VerifyMatchMatrix(element_printouts, matrix, listener) && - FindPairing(matrix, listener); - } - - private: - template <typename ElementIter> - MatchMatrix AnalyzeElements(ElementIter elem_first, ElementIter elem_last, - ::std::vector<std::string>* element_printouts, - MatchResultListener* listener) const { - element_printouts->clear(); - ::std::vector<char> did_match; - size_t num_elements = 0; - DummyMatchResultListener dummy; - for (; elem_first != elem_last; ++num_elements, ++elem_first) { - if (listener->IsInterested()) { - element_printouts->push_back(PrintToString(*elem_first)); - } - for (size_t irhs = 0; irhs != matchers_.size(); ++irhs) { - did_match.push_back( - matchers_[irhs].MatchAndExplain(*elem_first, &dummy)); - } - } - - MatchMatrix matrix(num_elements, matchers_.size()); - ::std::vector<char>::const_iterator did_match_iter = did_match.begin(); - for (size_t ilhs = 0; ilhs != num_elements; ++ilhs) { - for (size_t irhs = 0; irhs != matchers_.size(); ++irhs) { - matrix.SetEdge(ilhs, irhs, *did_match_iter++ != 0); - } - } - return matrix; - } - - ::std::vector<Matcher<const Element&>> matchers_; -}; - -// Functor for use in TransformTuple. -// Performs MatcherCast<Target> on an input argument of any type. -template <typename Target> -struct CastAndAppendTransform { - template <typename Arg> - Matcher<Target> operator()(const Arg& a) const { - return MatcherCast<Target>(a); - } -}; - -// Implements UnorderedElementsAre. -template <typename MatcherTuple> -class UnorderedElementsAreMatcher { - public: - explicit UnorderedElementsAreMatcher(const MatcherTuple& args) - : matchers_(args) {} - - template <typename Container> - operator Matcher<Container>() const { - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; - typedef typename internal::StlContainerView<RawContainer>::type View; - typedef typename View::value_type Element; - typedef ::std::vector<Matcher<const Element&>> MatcherVec; - MatcherVec matchers; - matchers.reserve(::std::tuple_size<MatcherTuple>::value); - TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_, - ::std::back_inserter(matchers)); - return Matcher<Container>( - new UnorderedElementsAreMatcherImpl<const Container&>( - UnorderedMatcherRequire::ExactMatch, matchers.begin(), - matchers.end())); - } - - private: - const MatcherTuple matchers_; -}; - -// Implements ElementsAre. 
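The front end below statically rejects ordered multi-element expectations on hash tables (their iteration order is unspecified) and steers users to the unordered variant implemented above. A minimal sketch of the distinction, with illustrative container contents and the same headers as the earlier sketch plus <unordered_set>:

  TEST(MatcherSketch, OrderedVsUnordered) {
    using ::testing::ElementsAre;
    using ::testing::UnorderedElementsAre;

    std::vector<int> ordered = {1, 2, 3};
    EXPECT_THAT(ordered, ElementsAre(1, 2, 3));  // Order matters.

    // A multi-element ElementsAre(...) on a hash table would trip the
    // static_assert below; the unordered form instead looks for a
    // bijection between elements and matchers.
    std::unordered_set<int> hashed = {3, 1, 2};
    EXPECT_THAT(hashed, UnorderedElementsAre(1, 2, 3));
  }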
-template <typename MatcherTuple> -class ElementsAreMatcher { - public: - explicit ElementsAreMatcher(const MatcherTuple& args) : matchers_(args) {} - - template <typename Container> - operator Matcher<Container>() const { - static_assert( - !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>::value || - ::std::tuple_size<MatcherTuple>::value < 2, - "use UnorderedElementsAre with hash tables"); - - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; - typedef typename internal::StlContainerView<RawContainer>::type View; - typedef typename View::value_type Element; - typedef ::std::vector<Matcher<const Element&>> MatcherVec; - MatcherVec matchers; - matchers.reserve(::std::tuple_size<MatcherTuple>::value); - TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_, - ::std::back_inserter(matchers)); - return Matcher<Container>(new ElementsAreMatcherImpl<const Container&>( - matchers.begin(), matchers.end())); - } - - private: - const MatcherTuple matchers_; -}; - -// Implements UnorderedElementsAreArray(), IsSubsetOf(), and IsSupersetOf(). -template <typename T> -class UnorderedElementsAreArrayMatcher { - public: - template <typename Iter> - UnorderedElementsAreArrayMatcher(UnorderedMatcherRequire::Flags match_flags, - Iter first, Iter last) - : match_flags_(match_flags), matchers_(first, last) {} - - template <typename Container> - operator Matcher<Container>() const { - return Matcher<Container>( - new UnorderedElementsAreMatcherImpl<const Container&>( - match_flags_, matchers_.begin(), matchers_.end())); - } - - private: - UnorderedMatcherRequire::Flags match_flags_; - ::std::vector<T> matchers_; -}; - -// Implements ElementsAreArray(). -template <typename T> -class ElementsAreArrayMatcher { - public: - template <typename Iter> - ElementsAreArrayMatcher(Iter first, Iter last) : matchers_(first, last) {} - - template <typename Container> - operator Matcher<Container>() const { - static_assert( - !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>::value, - "use UnorderedElementsAreArray with hash tables"); - - return Matcher<Container>(new ElementsAreMatcherImpl<const Container&>( - matchers_.begin(), matchers_.end())); - } - - private: - const ::std::vector<T> matchers_; -}; - -// Given a 2-tuple matcher tm of type Tuple2Matcher and a value second -// of type Second, BoundSecondMatcher<Tuple2Matcher, Second>(tm, -// second) is a polymorphic matcher that matches a value x if and only if -// tm matches tuple (x, second). Useful for implementing -// UnorderedPointwise() in terms of UnorderedElementsAreArray(). -// -// BoundSecondMatcher is copyable and assignable, as we need to put -// instances of this class in a vector when implementing -// UnorderedPointwise(). -template <typename Tuple2Matcher, typename Second> -class BoundSecondMatcher { - public: - BoundSecondMatcher(const Tuple2Matcher& tm, const Second& second) - : tuple2_matcher_(tm), second_value_(second) {} - - BoundSecondMatcher(const BoundSecondMatcher& other) = default; - - template <typename T> - operator Matcher<T>() const { - return MakeMatcher(new Impl<T>(tuple2_matcher_, second_value_)); - } - - // We have to define this for UnorderedPointwise() to compile in - // C++98 mode, as it puts BoundSecondMatcher instances in a vector, - // which requires the elements to be assignable in C++98. The - // compiler cannot generate the operator= for us, as Tuple2Matcher - // and Second may not be assignable. 
- // - // However, this should never be called, so the implementation just - // need to assert. - void operator=(const BoundSecondMatcher& /*rhs*/) { - GTEST_LOG_(FATAL) << "BoundSecondMatcher should never be assigned."; - } - - private: - template <typename T> - class Impl : public MatcherInterface<T> { - public: - typedef ::std::tuple<T, Second> ArgTuple; - - Impl(const Tuple2Matcher& tm, const Second& second) - : mono_tuple2_matcher_(SafeMatcherCast<const ArgTuple&>(tm)), - second_value_(second) {} - - void DescribeTo(::std::ostream* os) const override { - *os << "and "; - UniversalPrint(second_value_, os); - *os << " "; - mono_tuple2_matcher_.DescribeTo(os); - } - - bool MatchAndExplain(T x, MatchResultListener* listener) const override { - return mono_tuple2_matcher_.MatchAndExplain(ArgTuple(x, second_value_), - listener); - } - - private: - const Matcher<const ArgTuple&> mono_tuple2_matcher_; - const Second second_value_; - }; - - const Tuple2Matcher tuple2_matcher_; - const Second second_value_; -}; - -// Given a 2-tuple matcher tm and a value second, -// MatcherBindSecond(tm, second) returns a matcher that matches a -// value x if and only if tm matches tuple (x, second). Useful for -// implementing UnorderedPointwise() in terms of UnorderedElementsAreArray(). -template <typename Tuple2Matcher, typename Second> -BoundSecondMatcher<Tuple2Matcher, Second> MatcherBindSecond( - const Tuple2Matcher& tm, const Second& second) { - return BoundSecondMatcher<Tuple2Matcher, Second>(tm, second); -} - -// Returns the description for a matcher defined using the MATCHER*() -// macro where the user-supplied description string is "", if -// 'negation' is false; otherwise returns the description of the -// negation of the matcher. 'param_values' contains a list of strings -// that are the print-out of the matcher's parameters. -GTEST_API_ std::string FormatMatcherDescription( - bool negation, const char* matcher_name, - const std::vector<const char*>& param_names, const Strings& param_values); - -// Implements a matcher that checks the value of a optional<> type variable. -template <typename ValueMatcher> -class OptionalMatcher { - public: - explicit OptionalMatcher(const ValueMatcher& value_matcher) - : value_matcher_(value_matcher) {} - - template <typename Optional> - operator Matcher<Optional>() const { - return Matcher<Optional>(new Impl<const Optional&>(value_matcher_)); - } - - template <typename Optional> - class Impl : public MatcherInterface<Optional> { - public: - typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Optional) OptionalView; - typedef typename OptionalView::value_type ValueType; - explicit Impl(const ValueMatcher& value_matcher) - : value_matcher_(MatcherCast<ValueType>(value_matcher)) {} - - void DescribeTo(::std::ostream* os) const override { - *os << "value "; - value_matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - *os << "value "; - value_matcher_.DescribeNegationTo(os); - } - - bool MatchAndExplain(Optional optional, - MatchResultListener* listener) const override { - if (!optional) { - *listener << "which is not engaged"; - return false; - } - const ValueType& value = *optional; - StringMatchResultListener value_listener; - const bool match = value_matcher_.MatchAndExplain(value, &value_listener); - *listener << "whose value " << PrintToString(value) - << (match ? 
" matches" : " doesn't match"); - PrintIfNotEmpty(value_listener.str(), listener->stream()); - return match; - } - - private: - const Matcher<ValueType> value_matcher_; - }; - - private: - const ValueMatcher value_matcher_; -}; - -namespace variant_matcher { -// Overloads to allow VariantMatcher to do proper ADL lookup. -template <typename T> -void holds_alternative() {} -template <typename T> -void get() {} - -// Implements a matcher that checks the value of a variant<> type variable. -template <typename T> -class VariantMatcher { - public: - explicit VariantMatcher(::testing::Matcher<const T&> matcher) - : matcher_(std::move(matcher)) {} - - template <typename Variant> - bool MatchAndExplain(const Variant& value, - ::testing::MatchResultListener* listener) const { - using std::get; - if (!listener->IsInterested()) { - return holds_alternative<T>(value) && matcher_.Matches(get<T>(value)); - } - - if (!holds_alternative<T>(value)) { - *listener << "whose value is not of type '" << GetTypeName() << "'"; - return false; - } - - const T& elem = get<T>(value); - StringMatchResultListener elem_listener; - const bool match = matcher_.MatchAndExplain(elem, &elem_listener); - *listener << "whose value " << PrintToString(elem) - << (match ? " matches" : " doesn't match"); - PrintIfNotEmpty(elem_listener.str(), listener->stream()); - return match; - } - - void DescribeTo(std::ostream* os) const { - *os << "is a variant<> with value of type '" << GetTypeName() - << "' and the value "; - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(std::ostream* os) const { - *os << "is a variant<> with value of type other than '" << GetTypeName() - << "' or the value "; - matcher_.DescribeNegationTo(os); - } - - private: - static std::string GetTypeName() { -#if GTEST_HAS_RTTI - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_( - return internal::GetTypeName<T>()); -#endif - return "the element type"; - } - - const ::testing::Matcher<const T&> matcher_; -}; - -} // namespace variant_matcher - -namespace any_cast_matcher { - -// Overloads to allow AnyCastMatcher to do proper ADL lookup. -template <typename T> -void any_cast() {} - -// Implements a matcher that any_casts the value. -template <typename T> -class AnyCastMatcher { - public: - explicit AnyCastMatcher(const ::testing::Matcher<const T&>& matcher) - : matcher_(matcher) {} - - template <typename AnyType> - bool MatchAndExplain(const AnyType& value, - ::testing::MatchResultListener* listener) const { - if (!listener->IsInterested()) { - const T* ptr = any_cast<T>(&value); - return ptr != nullptr && matcher_.Matches(*ptr); - } - - const T* elem = any_cast<T>(&value); - if (elem == nullptr) { - *listener << "whose value is not of type '" << GetTypeName() << "'"; - return false; - } - - StringMatchResultListener elem_listener; - const bool match = matcher_.MatchAndExplain(*elem, &elem_listener); - *listener << "whose value " << PrintToString(*elem) - << (match ? 
" matches" : " doesn't match"); - PrintIfNotEmpty(elem_listener.str(), listener->stream()); - return match; - } - - void DescribeTo(std::ostream* os) const { - *os << "is an 'any' type with value of type '" << GetTypeName() - << "' and the value "; - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(std::ostream* os) const { - *os << "is an 'any' type with value of type other than '" << GetTypeName() - << "' or the value "; - matcher_.DescribeNegationTo(os); - } - - private: - static std::string GetTypeName() { -#if GTEST_HAS_RTTI - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_( - return internal::GetTypeName<T>()); -#endif - return "the element type"; - } - - const ::testing::Matcher<const T&> matcher_; -}; - -} // namespace any_cast_matcher - -// Implements the Args() matcher. -template <class ArgsTuple, size_t... k> -class ArgsMatcherImpl : public MatcherInterface<ArgsTuple> { - public: - using RawArgsTuple = typename std::decay<ArgsTuple>::type; - using SelectedArgs = - std::tuple<typename std::tuple_element<k, RawArgsTuple>::type...>; - using MonomorphicInnerMatcher = Matcher<const SelectedArgs&>; - - template <typename InnerMatcher> - explicit ArgsMatcherImpl(const InnerMatcher& inner_matcher) - : inner_matcher_(SafeMatcherCast<const SelectedArgs&>(inner_matcher)) {} - - bool MatchAndExplain(ArgsTuple args, - MatchResultListener* listener) const override { - // Workaround spurious C4100 on MSVC<=15.7 when k is empty. - (void)args; - const SelectedArgs& selected_args = - std::forward_as_tuple(std::get<k>(args)...); - if (!listener->IsInterested()) return inner_matcher_.Matches(selected_args); - - PrintIndices(listener->stream()); - *listener << "are " << PrintToString(selected_args); - - StringMatchResultListener inner_listener; - const bool match = - inner_matcher_.MatchAndExplain(selected_args, &inner_listener); - PrintIfNotEmpty(inner_listener.str(), listener->stream()); - return match; - } - - void DescribeTo(::std::ostream* os) const override { - *os << "are a tuple "; - PrintIndices(os); - inner_matcher_.DescribeTo(os); - } - - void DescribeNegationTo(::std::ostream* os) const override { - *os << "are a tuple "; - PrintIndices(os); - inner_matcher_.DescribeNegationTo(os); - } - - private: - // Prints the indices of the selected fields. - static void PrintIndices(::std::ostream* os) { - *os << "whose fields ("; - const char* sep = ""; - // Workaround spurious C4189 on MSVC<=15.7 when k is empty. - (void)sep; - const char* dummy[] = {"", (*os << sep << "#" << k, sep = ", ")...}; - (void)dummy; - *os << ") "; - } - - MonomorphicInnerMatcher inner_matcher_; -}; - -template <class InnerMatcher, size_t... k> -class ArgsMatcher { - public: - explicit ArgsMatcher(InnerMatcher inner_matcher) - : inner_matcher_(std::move(inner_matcher)) {} - - template <typename ArgsTuple> - operator Matcher<ArgsTuple>() const { // NOLINT - return MakeMatcher(new ArgsMatcherImpl<ArgsTuple, k...>(inner_matcher_)); - } - - private: - InnerMatcher inner_matcher_; -}; - -} // namespace internal - -// ElementsAreArray(iterator_first, iterator_last) -// ElementsAreArray(pointer, count) -// ElementsAreArray(array) -// ElementsAreArray(container) -// ElementsAreArray({ e1, e2, ..., en }) -// -// The ElementsAreArray() functions are like ElementsAre(...), except -// that they are given a homogeneous sequence rather than taking each -// element as a function argument. The sequence can be specified as an -// array, a pointer and count, a vector, an initializer list, or an -// STL iterator range. 
In each of these cases, the underlying sequence -// can be either a sequence of values or a sequence of matchers. -// -// All forms of ElementsAreArray() make a copy of the input matcher sequence. - -template <typename Iter> -inline internal::ElementsAreArrayMatcher< - typename ::std::iterator_traits<Iter>::value_type> -ElementsAreArray(Iter first, Iter last) { - typedef typename ::std::iterator_traits<Iter>::value_type T; - return internal::ElementsAreArrayMatcher<T>(first, last); -} - -template <typename T> -inline auto ElementsAreArray(const T* pointer, size_t count) - -> decltype(ElementsAreArray(pointer, pointer + count)) { - return ElementsAreArray(pointer, pointer + count); -} - -template <typename T, size_t N> -inline auto ElementsAreArray(const T (&array)[N]) - -> decltype(ElementsAreArray(array, N)) { - return ElementsAreArray(array, N); -} - -template <typename Container> -inline auto ElementsAreArray(const Container& container) - -> decltype(ElementsAreArray(container.begin(), container.end())) { - return ElementsAreArray(container.begin(), container.end()); -} - -template <typename T> -inline auto ElementsAreArray(::std::initializer_list<T> xs) - -> decltype(ElementsAreArray(xs.begin(), xs.end())) { - return ElementsAreArray(xs.begin(), xs.end()); -} - -// UnorderedElementsAreArray(iterator_first, iterator_last) -// UnorderedElementsAreArray(pointer, count) -// UnorderedElementsAreArray(array) -// UnorderedElementsAreArray(container) -// UnorderedElementsAreArray({ e1, e2, ..., en }) -// -// UnorderedElementsAreArray() verifies that a bijective mapping onto a -// collection of matchers exists. -// -// The matchers can be specified as an array, a pointer and count, a container, -// an initializer list, or an STL iterator range. In each of these cases, the -// underlying matchers can be either values or matchers. - -template <typename Iter> -inline internal::UnorderedElementsAreArrayMatcher< - typename ::std::iterator_traits<Iter>::value_type> -UnorderedElementsAreArray(Iter first, Iter last) { - typedef typename ::std::iterator_traits<Iter>::value_type T; - return internal::UnorderedElementsAreArrayMatcher<T>( - internal::UnorderedMatcherRequire::ExactMatch, first, last); -} - -template <typename T> -inline internal::UnorderedElementsAreArrayMatcher<T> UnorderedElementsAreArray( - const T* pointer, size_t count) { - return UnorderedElementsAreArray(pointer, pointer + count); -} - -template <typename T, size_t N> -inline internal::UnorderedElementsAreArrayMatcher<T> UnorderedElementsAreArray( - const T (&array)[N]) { - return UnorderedElementsAreArray(array, N); -} - -template <typename Container> -inline internal::UnorderedElementsAreArrayMatcher< - typename Container::value_type> -UnorderedElementsAreArray(const Container& container) { - return UnorderedElementsAreArray(container.begin(), container.end()); -} - -template <typename T> -inline internal::UnorderedElementsAreArrayMatcher<T> UnorderedElementsAreArray( - ::std::initializer_list<T> xs) { - return UnorderedElementsAreArray(xs.begin(), xs.end()); -} - -// _ is a matcher that matches anything of any type. -// -// This definition is fine as: -// -// 1. The C++ standard permits using the name _ in a namespace that -// is not the global namespace or ::std. -// 2. The AnythingMatcher class has no data member or constructor, -// so it's OK to create global variables of this type. -// 3. c-style has approved of using _ in this case. 
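In practice the wildcard defined next is used together with its typed cousins A<T>() and An<T>(); a short sketch (values illustrative, same headers as the earlier sketches):

  TEST(MatcherSketch, Wildcards) {
    using ::testing::_;
    using ::testing::A;
    using ::testing::An;

    EXPECT_THAT(42, _);         // Matches anything of any type.
    EXPECT_THAT(42, A<int>());  // Matches any value of type int.
    const char* s = "hi";
    EXPECT_THAT(s, An<const char*>());
  }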
-const internal::AnythingMatcher _ = {}; -// Creates a matcher that matches any value of the given type T. -template <typename T> -inline Matcher<T> A() { - return _; -} - -// Creates a matcher that matches any value of the given type T. -template <typename T> -inline Matcher<T> An() { - return _; -} - -template <typename T, typename M> -Matcher<T> internal::MatcherCastImpl<T, M>::CastImpl( - const M& value, std::false_type /* convertible_to_matcher */, - std::false_type /* convertible_to_T */) { - return Eq(value); -} - -// Creates a polymorphic matcher that matches any NULL pointer. -inline PolymorphicMatcher<internal::IsNullMatcher> IsNull() { - return MakePolymorphicMatcher(internal::IsNullMatcher()); -} - -// Creates a polymorphic matcher that matches any non-NULL pointer. -// This is convenient as Not(NULL) doesn't compile (the compiler -// thinks that that expression is comparing a pointer with an integer). -inline PolymorphicMatcher<internal::NotNullMatcher> NotNull() { - return MakePolymorphicMatcher(internal::NotNullMatcher()); -} - -// Creates a polymorphic matcher that matches any argument that -// references variable x. -template <typename T> -inline internal::RefMatcher<T&> Ref(T& x) { // NOLINT - return internal::RefMatcher<T&>(x); -} - -// Creates a polymorphic matcher that matches any NaN floating point. -inline PolymorphicMatcher<internal::IsNanMatcher> IsNan() { - return MakePolymorphicMatcher(internal::IsNanMatcher()); -} - -// Creates a matcher that matches any double argument approximately -// equal to rhs, where two NANs are considered unequal. -inline internal::FloatingEqMatcher<double> DoubleEq(double rhs) { - return internal::FloatingEqMatcher<double>(rhs, false); -} - -// Creates a matcher that matches any double argument approximately -// equal to rhs, including NaN values when rhs is NaN. -inline internal::FloatingEqMatcher<double> NanSensitiveDoubleEq(double rhs) { - return internal::FloatingEqMatcher<double>(rhs, true); -} - -// Creates a matcher that matches any double argument approximately equal to -// rhs, up to the specified max absolute error bound, where two NANs are -// considered unequal. The max absolute error bound must be non-negative. -inline internal::FloatingEqMatcher<double> DoubleNear(double rhs, - double max_abs_error) { - return internal::FloatingEqMatcher<double>(rhs, false, max_abs_error); -} - -// Creates a matcher that matches any double argument approximately equal to -// rhs, up to the specified max absolute error bound, including NaN values when -// rhs is NaN. The max absolute error bound must be non-negative. -inline internal::FloatingEqMatcher<double> NanSensitiveDoubleNear( - double rhs, double max_abs_error) { - return internal::FloatingEqMatcher<double>(rhs, true, max_abs_error); -} - -// Creates a matcher that matches any float argument approximately -// equal to rhs, where two NANs are considered unequal. -inline internal::FloatingEqMatcher<float> FloatEq(float rhs) { - return internal::FloatingEqMatcher<float>(rhs, false); -} - -// Creates a matcher that matches any float argument approximately -// equal to rhs, including NaN values when rhs is NaN. -inline internal::FloatingEqMatcher<float> NanSensitiveFloatEq(float rhs) { - return internal::FloatingEqMatcher<float>(rhs, true); -} - -// Creates a matcher that matches any float argument approximately equal to -// rhs, up to the specified max absolute error bound, where two NANs are -// considered unequal. The max absolute error bound must be non-negative. 
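To make the ULP, error-bound, and NaN semantics above concrete, a small sketch (values illustrative; assumes <limits> in addition to the earlier headers):

  TEST(MatcherSketch, FloatingPointMatchers) {
    using ::testing::DoubleEq;
    using ::testing::FloatNear;
    using ::testing::NanSensitiveDoubleEq;

    EXPECT_THAT(0.1 + 0.2, DoubleEq(0.3));             // ULP-based comparison.
    EXPECT_THAT(3.14159f, FloatNear(3.1416f, 1e-4f));  // Explicit error bound.

    const double nan = std::numeric_limits<double>::quiet_NaN();
    EXPECT_THAT(nan, NanSensitiveDoubleEq(nan));  // Two NaNs compare equal here.
  }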
-inline internal::FloatingEqMatcher<float> FloatNear(float rhs, - float max_abs_error) { - return internal::FloatingEqMatcher<float>(rhs, false, max_abs_error); -} - -// Creates a matcher that matches any float argument approximately equal to -// rhs, up to the specified max absolute error bound, including NaN values when -// rhs is NaN. The max absolute error bound must be non-negative. -inline internal::FloatingEqMatcher<float> NanSensitiveFloatNear( - float rhs, float max_abs_error) { - return internal::FloatingEqMatcher<float>(rhs, true, max_abs_error); -} - -// Creates a matcher that matches a pointer (raw or smart) that points -// to a value that matches inner_matcher. -template <typename InnerMatcher> -inline internal::PointeeMatcher<InnerMatcher> Pointee( - const InnerMatcher& inner_matcher) { - return internal::PointeeMatcher<InnerMatcher>(inner_matcher); -} - -#if GTEST_HAS_RTTI -// Creates a matcher that matches a pointer or reference that matches -// inner_matcher when dynamic_cast<To> is applied. -// The result of dynamic_cast<To> is forwarded to the inner matcher. -// If To is a pointer and the cast fails, the inner matcher will receive NULL. -// If To is a reference and the cast fails, this matcher returns false -// immediately. -template <typename To> -inline PolymorphicMatcher<internal::WhenDynamicCastToMatcher<To>> -WhenDynamicCastTo(const Matcher<To>& inner_matcher) { - return MakePolymorphicMatcher( - internal::WhenDynamicCastToMatcher<To>(inner_matcher)); -} -#endif // GTEST_HAS_RTTI - -// Creates a matcher that matches an object whose given field matches -// 'matcher'. For example, -// Field(&Foo::number, Ge(5)) -// matches a Foo object x if and only if x.number >= 5. -template <typename Class, typename FieldType, typename FieldMatcher> -inline PolymorphicMatcher<internal::FieldMatcher<Class, FieldType>> Field( - FieldType Class::*field, const FieldMatcher& matcher) { - return MakePolymorphicMatcher(internal::FieldMatcher<Class, FieldType>( - field, MatcherCast<const FieldType&>(matcher))); - // The call to MatcherCast() is required for supporting inner - // matchers of compatible types. For example, it allows - // Field(&Foo::bar, m) - // to compile where bar is an int32 and m is a matcher for int64. -} - -// Same as Field() but also takes the name of the field to provide better error -// messages. -template <typename Class, typename FieldType, typename FieldMatcher> -inline PolymorphicMatcher<internal::FieldMatcher<Class, FieldType>> Field( - const std::string& field_name, FieldType Class::*field, - const FieldMatcher& matcher) { - return MakePolymorphicMatcher(internal::FieldMatcher<Class, FieldType>( - field_name, field, MatcherCast<const FieldType&>(matcher))); -} - -// Creates a matcher that matches an object whose given property -// matches 'matcher'. For example, -// Property(&Foo::str, StartsWith("hi")) -// matches a Foo object x if and only if x.str() starts with "hi". -template <typename Class, typename PropertyType, typename PropertyMatcher> -inline PolymorphicMatcher<internal::PropertyMatcher< - Class, PropertyType, PropertyType (Class::*)() const>> -Property(PropertyType (Class::*property)() const, - const PropertyMatcher& matcher) { - return MakePolymorphicMatcher( - internal::PropertyMatcher<Class, PropertyType, - PropertyType (Class::*)() const>( - property, MatcherCast<const PropertyType&>(matcher))); - // The call to MatcherCast() is required for supporting inner - // matchers of compatible types. 
For example, it allows - // Property(&Foo::bar, m) - // to compile where bar() returns an int32 and m is a matcher for int64. -} - -// Same as Property() above, but also takes the name of the property to provide -// better error messages. -template <typename Class, typename PropertyType, typename PropertyMatcher> -inline PolymorphicMatcher<internal::PropertyMatcher< - Class, PropertyType, PropertyType (Class::*)() const>> -Property(const std::string& property_name, - PropertyType (Class::*property)() const, - const PropertyMatcher& matcher) { - return MakePolymorphicMatcher( - internal::PropertyMatcher<Class, PropertyType, - PropertyType (Class::*)() const>( - property_name, property, MatcherCast<const PropertyType&>(matcher))); -} - -// The same as above but for reference-qualified member functions. -template <typename Class, typename PropertyType, typename PropertyMatcher> -inline PolymorphicMatcher<internal::PropertyMatcher< - Class, PropertyType, PropertyType (Class::*)() const&>> -Property(PropertyType (Class::*property)() const&, - const PropertyMatcher& matcher) { - return MakePolymorphicMatcher( - internal::PropertyMatcher<Class, PropertyType, - PropertyType (Class::*)() const&>( - property, MatcherCast<const PropertyType&>(matcher))); -} - -// Three-argument form for reference-qualified member functions. -template <typename Class, typename PropertyType, typename PropertyMatcher> -inline PolymorphicMatcher<internal::PropertyMatcher< - Class, PropertyType, PropertyType (Class::*)() const&>> -Property(const std::string& property_name, - PropertyType (Class::*property)() const&, - const PropertyMatcher& matcher) { - return MakePolymorphicMatcher( - internal::PropertyMatcher<Class, PropertyType, - PropertyType (Class::*)() const&>( - property_name, property, MatcherCast<const PropertyType&>(matcher))); -} - -// Creates a matcher that matches an object if and only if the result of -// applying a callable to x matches 'matcher'. For example, -// ResultOf(f, StartsWith("hi")) -// matches a Foo object x if and only if f(x) starts with "hi". -// `callable` parameter can be a function, function pointer, or a functor. It is -// required to keep no state affecting the results of the calls on it and make -// no assumptions about how many calls will be made. Any state it keeps must be -// protected from the concurrent access. -template <typename Callable, typename InnerMatcher> -internal::ResultOfMatcher<Callable, InnerMatcher> ResultOf( - Callable callable, InnerMatcher matcher) { - return internal::ResultOfMatcher<Callable, InnerMatcher>(std::move(callable), - std::move(matcher)); -} - -// Same as ResultOf() above, but also takes a description of the `callable` -// result to provide better error messages. -template <typename Callable, typename InnerMatcher> -internal::ResultOfMatcher<Callable, InnerMatcher> ResultOf( - const std::string& result_description, Callable callable, - InnerMatcher matcher) { - return internal::ResultOfMatcher<Callable, InnerMatcher>( - result_description, std::move(callable), std::move(matcher)); -} - -// String matchers. - -// Matches a string equal to str. -template <typename T = std::string> -PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrEq( - const internal::StringLike<T>& str) { - return MakePolymorphicMatcher( - internal::StrEqualityMatcher<std::string>(std::string(str), true, true)); -} - -// Matches a string not equal to str. 
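A compact sketch of the string matcher family defined in this section (literals illustrative):

  TEST(MatcherSketch, StringMatchers) {
    using ::testing::EndsWith;
    using ::testing::HasSubstr;
    using ::testing::StartsWith;
    using ::testing::StrCaseEq;
    using ::testing::StrEq;

    const std::string greeting = "Hello, World";
    EXPECT_THAT(greeting, StrEq("Hello, World"));      // Exact, case-sensitive.
    EXPECT_THAT(greeting, StrCaseEq("hello, world"));  // Case-insensitive.
    EXPECT_THAT(greeting, StartsWith("Hello"));
    EXPECT_THAT(greeting, EndsWith("World"));
    EXPECT_THAT(greeting, HasSubstr(", "));
  }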
-template <typename T = std::string> -PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrNe( - const internal::StringLike<T>& str) { - return MakePolymorphicMatcher( - internal::StrEqualityMatcher<std::string>(std::string(str), false, true)); -} - -// Matches a string equal to str, ignoring case. -template <typename T = std::string> -PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrCaseEq( - const internal::StringLike<T>& str) { - return MakePolymorphicMatcher( - internal::StrEqualityMatcher<std::string>(std::string(str), true, false)); -} - -// Matches a string not equal to str, ignoring case. -template <typename T = std::string> -PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrCaseNe( - const internal::StringLike<T>& str) { - return MakePolymorphicMatcher(internal::StrEqualityMatcher<std::string>( - std::string(str), false, false)); -} - -// Creates a matcher that matches any string, std::string, or C string -// that contains the given substring. -template <typename T = std::string> -PolymorphicMatcher<internal::HasSubstrMatcher<std::string>> HasSubstr( - const internal::StringLike<T>& substring) { - return MakePolymorphicMatcher( - internal::HasSubstrMatcher<std::string>(std::string(substring))); -} - -// Matches a string that starts with 'prefix' (case-sensitive). -template <typename T = std::string> -PolymorphicMatcher<internal::StartsWithMatcher<std::string>> StartsWith( - const internal::StringLike<T>& prefix) { - return MakePolymorphicMatcher( - internal::StartsWithMatcher<std::string>(std::string(prefix))); -} - -// Matches a string that ends with 'suffix' (case-sensitive). -template <typename T = std::string> -PolymorphicMatcher<internal::EndsWithMatcher<std::string>> EndsWith( - const internal::StringLike<T>& suffix) { - return MakePolymorphicMatcher( - internal::EndsWithMatcher<std::string>(std::string(suffix))); -} - -#if GTEST_HAS_STD_WSTRING -// Wide string matchers. - -// Matches a string equal to str. -inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrEq( - const std::wstring& str) { - return MakePolymorphicMatcher( - internal::StrEqualityMatcher<std::wstring>(str, true, true)); -} - -// Matches a string not equal to str. -inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrNe( - const std::wstring& str) { - return MakePolymorphicMatcher( - internal::StrEqualityMatcher<std::wstring>(str, false, true)); -} - -// Matches a string equal to str, ignoring case. -inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrCaseEq( - const std::wstring& str) { - return MakePolymorphicMatcher( - internal::StrEqualityMatcher<std::wstring>(str, true, false)); -} - -// Matches a string not equal to str, ignoring case. -inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrCaseNe( - const std::wstring& str) { - return MakePolymorphicMatcher( - internal::StrEqualityMatcher<std::wstring>(str, false, false)); -} - -// Creates a matcher that matches any ::wstring, std::wstring, or C wide string -// that contains the given substring. -inline PolymorphicMatcher<internal::HasSubstrMatcher<std::wstring>> HasSubstr( - const std::wstring& substring) { - return MakePolymorphicMatcher( - internal::HasSubstrMatcher<std::wstring>(substring)); -} - -// Matches a string that starts with 'prefix' (case-sensitive). 
-inline PolymorphicMatcher<internal::StartsWithMatcher<std::wstring>> StartsWith( - const std::wstring& prefix) { - return MakePolymorphicMatcher( - internal::StartsWithMatcher<std::wstring>(prefix)); -} - -// Matches a string that ends with 'suffix' (case-sensitive). -inline PolymorphicMatcher<internal::EndsWithMatcher<std::wstring>> EndsWith( - const std::wstring& suffix) { - return MakePolymorphicMatcher( - internal::EndsWithMatcher<std::wstring>(suffix)); -} - -#endif // GTEST_HAS_STD_WSTRING - -// Creates a polymorphic matcher that matches a 2-tuple where the -// first field == the second field. -inline internal::Eq2Matcher Eq() { return internal::Eq2Matcher(); } - -// Creates a polymorphic matcher that matches a 2-tuple where the -// first field >= the second field. -inline internal::Ge2Matcher Ge() { return internal::Ge2Matcher(); } - -// Creates a polymorphic matcher that matches a 2-tuple where the -// first field > the second field. -inline internal::Gt2Matcher Gt() { return internal::Gt2Matcher(); } - -// Creates a polymorphic matcher that matches a 2-tuple where the -// first field <= the second field. -inline internal::Le2Matcher Le() { return internal::Le2Matcher(); } - -// Creates a polymorphic matcher that matches a 2-tuple where the -// first field < the second field. -inline internal::Lt2Matcher Lt() { return internal::Lt2Matcher(); } - -// Creates a polymorphic matcher that matches a 2-tuple where the -// first field != the second field. -inline internal::Ne2Matcher Ne() { return internal::Ne2Matcher(); } - -// Creates a polymorphic matcher that matches a 2-tuple where -// FloatEq(first field) matches the second field. -inline internal::FloatingEq2Matcher<float> FloatEq() { - return internal::FloatingEq2Matcher<float>(); -} - -// Creates a polymorphic matcher that matches a 2-tuple where -// DoubleEq(first field) matches the second field. -inline internal::FloatingEq2Matcher<double> DoubleEq() { - return internal::FloatingEq2Matcher<double>(); -} - -// Creates a polymorphic matcher that matches a 2-tuple where -// FloatEq(first field) matches the second field with NaN equality. -inline internal::FloatingEq2Matcher<float> NanSensitiveFloatEq() { - return internal::FloatingEq2Matcher<float>(true); -} - -// Creates a polymorphic matcher that matches a 2-tuple where -// DoubleEq(first field) matches the second field with NaN equality. -inline internal::FloatingEq2Matcher<double> NanSensitiveDoubleEq() { - return internal::FloatingEq2Matcher<double>(true); -} - -// Creates a polymorphic matcher that matches a 2-tuple where -// FloatNear(first field, max_abs_error) matches the second field. -inline internal::FloatingEq2Matcher<float> FloatNear(float max_abs_error) { - return internal::FloatingEq2Matcher<float>(max_abs_error); -} - -// Creates a polymorphic matcher that matches a 2-tuple where -// DoubleNear(first field, max_abs_error) matches the second field. -inline internal::FloatingEq2Matcher<double> DoubleNear(double max_abs_error) { - return internal::FloatingEq2Matcher<double>(max_abs_error); -} - -// Creates a polymorphic matcher that matches a 2-tuple where -// FloatNear(first field, max_abs_error) matches the second field with NaN -// equality. -inline internal::FloatingEq2Matcher<float> NanSensitiveFloatNear( - float max_abs_error) { - return internal::FloatingEq2Matcher<float>(max_abs_error, true); -} - -// Creates a polymorphic matcher that matches a 2-tuple where -// DoubleNear(first field, max_abs_error) matches the second field with NaN -// equality. 
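These nullary 2-tuple matchers exist mainly to be fed to Pointwise() and UnorderedPointwise(), which pair each container element with the corresponding reference element. A minimal sketch (values illustrative):

  TEST(MatcherSketch, TupleMatchersWithPointwise) {
    using ::testing::DoubleNear;
    using ::testing::Lt;
    using ::testing::Pointwise;

    std::vector<double> actual = {1.0, 2.0, 3.0};
    std::vector<double> expected = {1.0 + 1e-12, 2.0, 3.0};
    // Each (actual[i], expected[i]) tuple must satisfy DoubleNear(1e-9).
    EXPECT_THAT(actual, Pointwise(DoubleNear(1e-9), expected));

    std::vector<int> smaller = {0, 1, 2};
    std::vector<int> larger = {1, 2, 3};
    // Lt() is the 2-tuple form: first field < second field, elementwise.
    EXPECT_THAT(smaller, Pointwise(Lt(), larger));
  }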
-inline internal::FloatingEq2Matcher<double> NanSensitiveDoubleNear(
-    double max_abs_error) {
-  return internal::FloatingEq2Matcher<double>(max_abs_error, true);
-}
-
-// Creates a matcher that matches any value of type T that m doesn't
-// match.
-template <typename InnerMatcher>
-inline internal::NotMatcher<InnerMatcher> Not(InnerMatcher m) {
-  return internal::NotMatcher<InnerMatcher>(m);
-}
-
-// Returns a matcher that matches anything that satisfies the given
-// predicate. The predicate can be any unary function or functor
-// whose return type can be implicitly converted to bool.
-template <typename Predicate>
-inline PolymorphicMatcher<internal::TrulyMatcher<Predicate>> Truly(
-    Predicate pred) {
-  return MakePolymorphicMatcher(internal::TrulyMatcher<Predicate>(pred));
-}
-
-// Returns a matcher that matches the container size. The container must
-// support both size() and size_type, which all STL-like containers provide.
-// Note that the parameter 'size' can be a value of type size_type as well as a
-// matcher. For instance:
-//   EXPECT_THAT(container, SizeIs(2));      // Checks container has 2 elements.
-//   EXPECT_THAT(container, SizeIs(Le(2)));  // Checks container has at most 2.
-template <typename SizeMatcher>
-inline internal::SizeIsMatcher<SizeMatcher> SizeIs(
-    const SizeMatcher& size_matcher) {
-  return internal::SizeIsMatcher<SizeMatcher>(size_matcher);
-}
-
-// Returns a matcher that matches the distance between the container's begin()
-// iterator and its end() iterator, i.e. the size of the container. This matcher
-// can be used instead of SizeIs with containers such as std::forward_list which
-// do not implement size(). The container must provide const_iterator (with
-// valid iterator_traits), begin() and end().
-template <typename DistanceMatcher>
-inline internal::BeginEndDistanceIsMatcher<DistanceMatcher> BeginEndDistanceIs(
-    const DistanceMatcher& distance_matcher) {
-  return internal::BeginEndDistanceIsMatcher<DistanceMatcher>(distance_matcher);
-}
-
-// Returns a matcher that matches an equal container.
-// This matcher behaves like Eq(), but in the event of mismatch lists the
-// values that are included in one container but not the other. (Duplicate
-// values and order differences are not explained.)
-template <typename Container>
-inline PolymorphicMatcher<
-    internal::ContainerEqMatcher<typename std::remove_const<Container>::type>>
-ContainerEq(const Container& rhs) {
-  return MakePolymorphicMatcher(internal::ContainerEqMatcher<Container>(rhs));
-}
-
-// Returns a matcher that matches a container that, when sorted using
-// the given comparator, matches container_matcher.
-template <typename Comparator, typename ContainerMatcher>
-inline internal::WhenSortedByMatcher<Comparator, ContainerMatcher> WhenSortedBy(
-    const Comparator& comparator, const ContainerMatcher& container_matcher) {
-  return internal::WhenSortedByMatcher<Comparator, ContainerMatcher>(
-      comparator, container_matcher);
-}
-
-// Returns a matcher that matches a container that, when sorted using
-// the < operator, matches container_matcher.
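Both sorted-view forms are typically used as in this sketch (values illustrative; assumes <functional> for std::greater):

  TEST(MatcherSketch, SortedViews) {
    using ::testing::ElementsAre;
    using ::testing::WhenSorted;
    using ::testing::WhenSortedBy;

    std::vector<int> v = {3, 1, 2};
    // A sorted copy of v is handed to the inner container matcher.
    EXPECT_THAT(v, WhenSorted(ElementsAre(1, 2, 3)));
    EXPECT_THAT(v, WhenSortedBy(std::greater<int>(), ElementsAre(3, 2, 1)));
  }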
-template <typename ContainerMatcher> -inline internal::WhenSortedByMatcher<internal::LessComparator, ContainerMatcher> -WhenSorted(const ContainerMatcher& container_matcher) { - return internal::WhenSortedByMatcher<internal::LessComparator, - ContainerMatcher>( - internal::LessComparator(), container_matcher); -} - -// Matches an STL-style container or a native array that contains the -// same number of elements as in rhs, where its i-th element and rhs's -// i-th element (as a pair) satisfy the given pair matcher, for all i. -// TupleMatcher must be able to be safely cast to Matcher<std::tuple<const -// T1&, const T2&> >, where T1 and T2 are the types of elements in the -// LHS container and the RHS container respectively. -template <typename TupleMatcher, typename Container> -inline internal::PointwiseMatcher<TupleMatcher, - typename std::remove_const<Container>::type> -Pointwise(const TupleMatcher& tuple_matcher, const Container& rhs) { - return internal::PointwiseMatcher<TupleMatcher, Container>(tuple_matcher, - rhs); -} - -// Supports the Pointwise(m, {a, b, c}) syntax. -template <typename TupleMatcher, typename T> -inline internal::PointwiseMatcher<TupleMatcher, std::vector<T>> Pointwise( - const TupleMatcher& tuple_matcher, std::initializer_list<T> rhs) { - return Pointwise(tuple_matcher, std::vector<T>(rhs)); -} - -// UnorderedPointwise(pair_matcher, rhs) matches an STL-style -// container or a native array that contains the same number of -// elements as in rhs, where in some permutation of the container, its -// i-th element and rhs's i-th element (as a pair) satisfy the given -// pair matcher, for all i. Tuple2Matcher must be able to be safely -// cast to Matcher<std::tuple<const T1&, const T2&> >, where T1 and T2 are -// the types of elements in the LHS container and the RHS container -// respectively. -// -// This is like Pointwise(pair_matcher, rhs), except that the element -// order doesn't matter. -template <typename Tuple2Matcher, typename RhsContainer> -inline internal::UnorderedElementsAreArrayMatcher< - typename internal::BoundSecondMatcher< - Tuple2Matcher, - typename internal::StlContainerView< - typename std::remove_const<RhsContainer>::type>::type::value_type>> -UnorderedPointwise(const Tuple2Matcher& tuple2_matcher, - const RhsContainer& rhs_container) { - // RhsView allows the same code to handle RhsContainer being a - // STL-style container and it being a native C-style array. - typedef typename internal::StlContainerView<RhsContainer> RhsView; - typedef typename RhsView::type RhsStlContainer; - typedef typename RhsStlContainer::value_type Second; - const RhsStlContainer& rhs_stl_container = - RhsView::ConstReference(rhs_container); - - // Create a matcher for each element in rhs_container. - ::std::vector<internal::BoundSecondMatcher<Tuple2Matcher, Second>> matchers; - for (auto it = rhs_stl_container.begin(); it != rhs_stl_container.end(); - ++it) { - matchers.push_back(internal::MatcherBindSecond(tuple2_matcher, *it)); - } - - // Delegate the work to UnorderedElementsAreArray(). - return UnorderedElementsAreArray(matchers); -} - -// Supports the UnorderedPointwise(m, {a, b, c}) syntax. 
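A minimal sketch of the order-insensitive pairing described above (values illustrative):

  TEST(MatcherSketch, UnorderedPointwiseSketch) {
    using ::testing::DoubleNear;
    using ::testing::UnorderedPointwise;

    std::vector<double> actual = {0.3, 0.1, 0.2};
    std::vector<double> expected = {0.1, 0.2, 0.3};
    // Some permutation of 'actual' must pair off with 'expected' so that
    // every (lhs, rhs) tuple satisfies DoubleNear(1e-9).
    EXPECT_THAT(actual, UnorderedPointwise(DoubleNear(1e-9), expected));
  }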
-template <typename Tuple2Matcher, typename T>
-inline internal::UnorderedElementsAreArrayMatcher<
-    typename internal::BoundSecondMatcher<Tuple2Matcher, T>>
-UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
-                   std::initializer_list<T> rhs) {
-  return UnorderedPointwise(tuple2_matcher, std::vector<T>(rhs));
-}
-
-// Matches an STL-style container or a native array that contains at
-// least one element matching the given value or matcher.
-//
-// Examples:
-//   ::std::set<int> page_ids;
-//   page_ids.insert(3);
-//   page_ids.insert(1);
-//   EXPECT_THAT(page_ids, Contains(1));
-//   EXPECT_THAT(page_ids, Contains(Gt(2)));
-//   EXPECT_THAT(page_ids, Not(Contains(4)));  // See below for Times(0)
-//
-//   ::std::map<int, size_t> page_lengths;
-//   page_lengths[1] = 100;
-//   EXPECT_THAT(page_lengths,
-//               Contains(::std::pair<const int, size_t>(1, 100)));
-//
-//   const char* user_ids[] = { "joe", "mike", "tom" };
-//   EXPECT_THAT(user_ids, Contains(Eq(::std::string("tom"))));
-//
-// The matcher supports a modifier `Times` that allows checking for arbitrary
-// occurrences, including testing for absence with Times(0).
-//
-// Examples:
-//   ::std::vector<int> ids;
-//   ids.push_back(1);
-//   ids.push_back(1);
-//   ids.push_back(3);
-//   EXPECT_THAT(ids, Contains(1).Times(2));      // 1 occurs 2 times
-//   EXPECT_THAT(ids, Contains(2).Times(0));      // 2 is not present
-//   EXPECT_THAT(ids, Contains(3).Times(Ge(1)));  // 3 occurs at least once
-
-template <typename M>
-inline internal::ContainsMatcher<M> Contains(M matcher) {
-  return internal::ContainsMatcher<M>(matcher);
-}
-
-// IsSupersetOf(iterator_first, iterator_last)
-// IsSupersetOf(pointer, count)
-// IsSupersetOf(array)
-// IsSupersetOf(container)
-// IsSupersetOf({e1, e2, ..., en})
-//
-// IsSupersetOf() verifies that a surjective partial mapping onto a collection
-// of matchers exists. In other words, a container matches
-// IsSupersetOf({e1, ..., en}) if and only if there is a permutation
-// {y1, ..., yn} of some of the container's elements where y1 matches e1,
-// ..., and yn matches en. Obviously, the size of the container must be >= n
-// in order to have a match. Examples:
-//
-// - {1, 2, 3} matches IsSupersetOf({Ge(3), Ne(0)}), as 3 matches Ge(3) and
-//   1 matches Ne(0).
-// - {1, 2} doesn't match IsSupersetOf({Eq(1), Lt(2)}), even though 1 matches
-//   both Eq(1) and Lt(2). The reason is that different matchers must be used
-//   for elements in different slots of the container.
-// - {1, 1, 2} matches IsSupersetOf({Eq(1), Lt(2)}), as (the first) 1 matches
-//   Eq(1) and (the second) 1 matches Lt(2).
-// - {1, 2, 3} matches IsSupersetOf({Gt(1), Gt(1)}), as 2 matches (the first)
-//   Gt(1) and 3 matches (the second) Gt(1).
-//
-// The matchers can be specified as an array, a pointer and count, a container,
-// an initializer list, or an STL iterator range. In each of these cases, the
-// underlying matchers can be either values or matchers.
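Note that Ge(3) and Ne(0) are distinct matcher types, so passing them through the initializer-list overload below would defeat template argument deduction; a vector of testing::Matcher<int> sidesteps that. A minimal sketch (values illustrative):

  TEST(MatcherSketch, SupersetSketch) {
    using ::testing::Ge;
    using ::testing::IsSupersetOf;
    using ::testing::Matcher;
    using ::testing::Ne;

    std::vector<int> v = {1, 2, 3};
    // Distinct elements of v must be found for Ge(3) and Ne(0) respectively.
    std::vector<Matcher<int>> required = {Ge(3), Ne(0)};
    EXPECT_THAT(v, IsSupersetOf(required));
  }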
- -template <typename Iter> -inline internal::UnorderedElementsAreArrayMatcher< - typename ::std::iterator_traits<Iter>::value_type> -IsSupersetOf(Iter first, Iter last) { - typedef typename ::std::iterator_traits<Iter>::value_type T; - return internal::UnorderedElementsAreArrayMatcher<T>( - internal::UnorderedMatcherRequire::Superset, first, last); -} - -template <typename T> -inline internal::UnorderedElementsAreArrayMatcher<T> IsSupersetOf( - const T* pointer, size_t count) { - return IsSupersetOf(pointer, pointer + count); -} - -template <typename T, size_t N> -inline internal::UnorderedElementsAreArrayMatcher<T> IsSupersetOf( - const T (&array)[N]) { - return IsSupersetOf(array, N); -} - -template <typename Container> -inline internal::UnorderedElementsAreArrayMatcher< - typename Container::value_type> -IsSupersetOf(const Container& container) { - return IsSupersetOf(container.begin(), container.end()); -} - -template <typename T> -inline internal::UnorderedElementsAreArrayMatcher<T> IsSupersetOf( - ::std::initializer_list<T> xs) { - return IsSupersetOf(xs.begin(), xs.end()); -} - -// IsSubsetOf(iterator_first, iterator_last) -// IsSubsetOf(pointer, count) -// IsSubsetOf(array) -// IsSubsetOf(container) -// IsSubsetOf({e1, e2, ..., en}) -// -// IsSubsetOf() verifies that an injective mapping onto a collection of matchers -// exists. In other words, a container matches IsSubsetOf({e1, ..., en}) if and -// only if there is a subset of matchers {m1, ..., mk} which would match the -// container using UnorderedElementsAre. Obviously, the size of the container -// must be <= n in order to have a match. Examples: -// -// - {1} matches IsSubsetOf({Gt(0), Lt(0)}), as 1 matches Gt(0). -// - {1, -1} matches IsSubsetOf({Lt(0), Gt(0)}), as 1 matches Gt(0) and -1 -// matches Lt(0). -// - {1, 2} doesn't matches IsSubsetOf({Gt(0), Lt(0)}), even though 1 and 2 both -// match Gt(0). The reason is that different matchers must be used for -// elements in different slots of the container. -// -// The matchers can be specified as an array, a pointer and count, a container, -// an initializer list, or an STL iterator range. In each of these cases, the -// underlying matchers can be either values or matchers. - -template <typename Iter> -inline internal::UnorderedElementsAreArrayMatcher< - typename ::std::iterator_traits<Iter>::value_type> -IsSubsetOf(Iter first, Iter last) { - typedef typename ::std::iterator_traits<Iter>::value_type T; - return internal::UnorderedElementsAreArrayMatcher<T>( - internal::UnorderedMatcherRequire::Subset, first, last); -} - -template <typename T> -inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf( - const T* pointer, size_t count) { - return IsSubsetOf(pointer, pointer + count); -} - -template <typename T, size_t N> -inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf( - const T (&array)[N]) { - return IsSubsetOf(array, N); -} - -template <typename Container> -inline internal::UnorderedElementsAreArrayMatcher< - typename Container::value_type> -IsSubsetOf(const Container& container) { - return IsSubsetOf(container.begin(), container.end()); -} - -template <typename T> -inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf( - ::std::initializer_list<T> xs) { - return IsSubsetOf(xs.begin(), xs.end()); -} - -// Matches an STL-style container or a native array that contains only -// elements matching the given value or matcher. -// -// Each(m) is semantically equivalent to `Not(Contains(Not(m)))`. Only -// the messages are different. 
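A corresponding sketch for IsSubsetOf() and Each(), with illustrative data (the Each() examples from the original header follow below):

#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ::testing::Each;
using ::testing::Gt;
using ::testing::IsSubsetOf;

TEST(ContainerMatcherSketch, SubsetAndEach) {
  std::vector<int> v = {1, 3};
  // Every element must match a distinct matcher from the list: here
  // 3 matches Gt(2) and 1 matches Gt(0).
  EXPECT_THAT(v, IsSubsetOf({Gt(2), Gt(0), Gt(10)}));
  // Every element must match the single given matcher.
  EXPECT_THAT(v, Each(Gt(0)));
}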
-// -// Examples: -// ::std::set<int> page_ids; -// // Each(m) matches an empty container, regardless of what m is. -// EXPECT_THAT(page_ids, Each(Eq(1))); -// EXPECT_THAT(page_ids, Each(Eq(77))); -// -// page_ids.insert(3); -// EXPECT_THAT(page_ids, Each(Gt(0))); -// EXPECT_THAT(page_ids, Not(Each(Gt(4)))); -// page_ids.insert(1); -// EXPECT_THAT(page_ids, Not(Each(Lt(2)))); -// -// ::std::map<int, size_t> page_lengths; -// page_lengths[1] = 100; -// page_lengths[2] = 200; -// page_lengths[3] = 300; -// EXPECT_THAT(page_lengths, Not(Each(Pair(1, 100)))); -// EXPECT_THAT(page_lengths, Each(Key(Le(3)))); -// -// const char* user_ids[] = { "joe", "mike", "tom" }; -// EXPECT_THAT(user_ids, Not(Each(Eq(::std::string("tom"))))); -template <typename M> -inline internal::EachMatcher<M> Each(M matcher) { - return internal::EachMatcher<M>(matcher); -} - -// Key(inner_matcher) matches an std::pair whose 'first' field matches -// inner_matcher. For example, Contains(Key(Ge(5))) can be used to match an -// std::map that contains at least one element whose key is >= 5. -template <typename M> -inline internal::KeyMatcher<M> Key(M inner_matcher) { - return internal::KeyMatcher<M>(inner_matcher); -} - -// Pair(first_matcher, second_matcher) matches a std::pair whose 'first' field -// matches first_matcher and whose 'second' field matches second_matcher. For -// example, EXPECT_THAT(map_type, ElementsAre(Pair(Ge(5), "foo"))) can be used -// to match a std::map<int, string> that contains exactly one element whose key -// is >= 5 and whose value equals "foo". -template <typename FirstMatcher, typename SecondMatcher> -inline internal::PairMatcher<FirstMatcher, SecondMatcher> Pair( - FirstMatcher first_matcher, SecondMatcher second_matcher) { - return internal::PairMatcher<FirstMatcher, SecondMatcher>(first_matcher, - second_matcher); -} - -namespace no_adl { -// Conditional() creates a matcher that conditionally uses either the first or -// second matcher provided. For example, we could create an `equal if, and only -// if' matcher using the Conditional wrapper as follows: -// -// EXPECT_THAT(result, Conditional(condition, Eq(expected), Ne(expected))); -template <typename MatcherTrue, typename MatcherFalse> -internal::ConditionalMatcher<MatcherTrue, MatcherFalse> Conditional( - bool condition, MatcherTrue matcher_true, MatcherFalse matcher_false) { - return internal::ConditionalMatcher<MatcherTrue, MatcherFalse>( - condition, std::move(matcher_true), std::move(matcher_false)); -} - -// FieldsAre(matchers...) matches piecewise the fields of compatible structs. -// These include those that support `get<I>(obj)`, and when structured bindings -// are enabled any class that supports them. -// In particular, `std::tuple`, `std::pair`, `std::array` and aggregate types. -template <typename... M> -internal::FieldsAreMatcher<typename std::decay<M>::type...> FieldsAre( - M&&... matchers) { - return internal::FieldsAreMatcher<typename std::decay<M>::type...>( - std::forward<M>(matchers)...); -} - -// Creates a matcher that matches a pointer (raw or smart) that matches -// inner_matcher. -template <typename InnerMatcher> -inline internal::PointerMatcher<InnerMatcher> Pointer( - const InnerMatcher& inner_matcher) { - return internal::PointerMatcher<InnerMatcher>(inner_matcher); -} - -// Creates a matcher that matches an object that has an address that matches -// inner_matcher. 
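The Key(), Pair(), Conditional(), and FieldsAre() matchers above, in a short sketch (map contents and the condition flag are illustrative):

#include <map>
#include <string>
#include <tuple>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ::testing::Conditional;
using ::testing::Contains;
using ::testing::ElementsAre;
using ::testing::FieldsAre;
using ::testing::Ge;
using ::testing::Gt;
using ::testing::Key;
using ::testing::Lt;
using ::testing::Pair;

TEST(PairwiseMatcherSketch, KeyPairFieldsAre) {
  std::map<int, std::string> m = {{5, "foo"}};
  EXPECT_THAT(m, ElementsAre(Pair(Ge(5), "foo")));  // key >= 5, value "foo"
  EXPECT_THAT(m, Contains(Key(5)));                 // some key equals 5
  std::tuple<int, std::string> t(5, "foo");
  EXPECT_THAT(t, FieldsAre(Ge(5), "foo"));          // piecewise field match
  const bool positive = true;
  EXPECT_THAT(7, Conditional(positive, Gt(0), Lt(0)));
}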
-template <typename InnerMatcher> -inline internal::AddressMatcher<InnerMatcher> Address( - const InnerMatcher& inner_matcher) { - return internal::AddressMatcher<InnerMatcher>(inner_matcher); -} - -// Matches a base64 escaped string, when the unescaped string matches the -// internal matcher. -template <typename MatcherType> -internal::WhenBase64UnescapedMatcher WhenBase64Unescaped( - const MatcherType& internal_matcher) { - return internal::WhenBase64UnescapedMatcher(internal_matcher); -} -} // namespace no_adl - -// Returns a predicate that is satisfied by anything that matches the -// given matcher. -template <typename M> -inline internal::MatcherAsPredicate<M> Matches(M matcher) { - return internal::MatcherAsPredicate<M>(matcher); -} - -// Returns true if and only if the value matches the matcher. -template <typename T, typename M> -inline bool Value(const T& value, M matcher) { - return testing::Matches(matcher)(value); -} - -// Matches the value against the given matcher and explains the match -// result to listener. -template <typename T, typename M> -inline bool ExplainMatchResult(M matcher, const T& value, - MatchResultListener* listener) { - return SafeMatcherCast<const T&>(matcher).MatchAndExplain(value, listener); -} - -// Returns a string representation of the given matcher. Useful for description -// strings of matchers defined using MATCHER_P* macros that accept matchers as -// their arguments. For example: -// -// MATCHER_P(XAndYThat, matcher, -// "X that " + DescribeMatcher<int>(matcher, negation) + -// (negation ? " or" : " and") + " Y that " + -// DescribeMatcher<double>(matcher, negation)) { -// return ExplainMatchResult(matcher, arg.x(), result_listener) && -// ExplainMatchResult(matcher, arg.y(), result_listener); -// } -template <typename T, typename M> -std::string DescribeMatcher(const M& matcher, bool negation = false) { - ::std::stringstream ss; - Matcher<T> monomorphic_matcher = SafeMatcherCast<T>(matcher); - if (negation) { - monomorphic_matcher.DescribeNegationTo(&ss); - } else { - monomorphic_matcher.DescribeTo(&ss); - } - return ss.str(); -} - -template <typename... Args> -internal::ElementsAreMatcher< - std::tuple<typename std::decay<const Args&>::type...>> -ElementsAre(const Args&... matchers) { - return internal::ElementsAreMatcher< - std::tuple<typename std::decay<const Args&>::type...>>( - std::make_tuple(matchers...)); -} - -template <typename... Args> -internal::UnorderedElementsAreMatcher< - std::tuple<typename std::decay<const Args&>::type...>> -UnorderedElementsAre(const Args&... matchers) { - return internal::UnorderedElementsAreMatcher< - std::tuple<typename std::decay<const Args&>::type...>>( - std::make_tuple(matchers...)); -} - -// Define variadic matcher versions. -template <typename... Args> -internal::AllOfMatcher<typename std::decay<const Args&>::type...> AllOf( - const Args&... matchers) { - return internal::AllOfMatcher<typename std::decay<const Args&>::type...>( - matchers...); -} - -template <typename... Args> -internal::AnyOfMatcher<typename std::decay<const Args&>::type...> AnyOf( - const Args&... matchers) { - return internal::AnyOfMatcher<typename std::decay<const Args&>::type...>( - matchers...); -} - -// AnyOfArray(array) -// AnyOfArray(pointer, count) -// AnyOfArray(container) -// AnyOfArray({ e1, e2, ..., en }) -// AnyOfArray(iterator_first, iterator_last) -// -// AnyOfArray() verifies whether a given value matches any member of a -// collection of matchers. 
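Matches(), Value(), DescribeMatcher(), and the variadic AllOf()/AnyOf() shown above, sketched with illustrative values (the description string is typical output, not guaranteed verbatim):

#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ::testing::AllOf;
using ::testing::AnyOf;
using ::testing::DescribeMatcher;
using ::testing::Gt;
using ::testing::Lt;
using ::testing::Matches;
using ::testing::Value;

TEST(PredicateSketch, MatchesValueDescribe) {
  EXPECT_THAT(5, AllOf(Gt(0), Lt(10)));  // all matchers must match
  EXPECT_THAT(5, AnyOf(Lt(0), Gt(4)));   // at least one must match
  EXPECT_TRUE(Matches(Gt(0))(5));        // matcher adapted to a predicate
  EXPECT_TRUE(Value(5, Gt(0)));          // one-shot boolean check
  const std::string desc = DescribeMatcher<int>(Gt(0));  // e.g. "is > 0"
  EXPECT_FALSE(desc.empty());
}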
-// -// AllOfArray(array) -// AllOfArray(pointer, count) -// AllOfArray(container) -// AllOfArray({ e1, e2, ..., en }) -// AllOfArray(iterator_first, iterator_last) -// -// AllOfArray() verifies whether a given value matches all members of a -// collection of matchers. -// -// The matchers can be specified as an array, a pointer and count, a container, -// an initializer list, or an STL iterator range. In each of these cases, the -// underlying matchers can be either values or matchers. - -template <typename Iter> -inline internal::AnyOfArrayMatcher< - typename ::std::iterator_traits<Iter>::value_type> -AnyOfArray(Iter first, Iter last) { - return internal::AnyOfArrayMatcher< - typename ::std::iterator_traits<Iter>::value_type>(first, last); -} - -template <typename Iter> -inline internal::AllOfArrayMatcher< - typename ::std::iterator_traits<Iter>::value_type> -AllOfArray(Iter first, Iter last) { - return internal::AllOfArrayMatcher< - typename ::std::iterator_traits<Iter>::value_type>(first, last); -} - -template <typename T> -inline internal::AnyOfArrayMatcher<T> AnyOfArray(const T* ptr, size_t count) { - return AnyOfArray(ptr, ptr + count); -} - -template <typename T> -inline internal::AllOfArrayMatcher<T> AllOfArray(const T* ptr, size_t count) { - return AllOfArray(ptr, ptr + count); -} - -template <typename T, size_t N> -inline internal::AnyOfArrayMatcher<T> AnyOfArray(const T (&array)[N]) { - return AnyOfArray(array, N); -} - -template <typename T, size_t N> -inline internal::AllOfArrayMatcher<T> AllOfArray(const T (&array)[N]) { - return AllOfArray(array, N); -} - -template <typename Container> -inline internal::AnyOfArrayMatcher<typename Container::value_type> AnyOfArray( - const Container& container) { - return AnyOfArray(container.begin(), container.end()); -} - -template <typename Container> -inline internal::AllOfArrayMatcher<typename Container::value_type> AllOfArray( - const Container& container) { - return AllOfArray(container.begin(), container.end()); -} - -template <typename T> -inline internal::AnyOfArrayMatcher<T> AnyOfArray( - ::std::initializer_list<T> xs) { - return AnyOfArray(xs.begin(), xs.end()); -} - -template <typename T> -inline internal::AllOfArrayMatcher<T> AllOfArray( - ::std::initializer_list<T> xs) { - return AllOfArray(xs.begin(), xs.end()); -} - -// Args<N1, N2, ..., Nk>(a_matcher) matches a tuple if the selected -// fields of it matches a_matcher. C++ doesn't support default -// arguments for function templates, so we have to overload it. -template <size_t... k, typename InnerMatcher> -internal::ArgsMatcher<typename std::decay<InnerMatcher>::type, k...> Args( - InnerMatcher&& matcher) { - return internal::ArgsMatcher<typename std::decay<InnerMatcher>::type, k...>( - std::forward<InnerMatcher>(matcher)); -} - -// AllArgs(m) is a synonym of m. This is useful in -// -// EXPECT_CALL(foo, Bar(_, _)).With(AllArgs(Eq())); -// -// which is easier to read than -// -// EXPECT_CALL(foo, Bar(_, _)).With(Eq()); -template <typename InnerMatcher> -inline InnerMatcher AllArgs(const InnerMatcher& matcher) { - return matcher; -} - -// Returns a matcher that matches the value of an optional<> type variable. -// The matcher implementation only uses '!arg' and requires that the optional<> -// type has a 'value_type' member type and that '*arg' is of type 'value_type' -// and is printable using 'PrintToString'. It is compatible with -// std::optional/std::experimental::optional. 
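AnyOfArray()/AllOfArray() and Args<>() from the blocks above, in sketch form (containers and tuple values are illustrative):

#include <tuple>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ::testing::AllOfArray;
using ::testing::AnyOfArray;
using ::testing::Args;
using ::testing::Gt;
using ::testing::Lt;
using ::testing::Matcher;

TEST(ArrayMatcherSketch, AnyAllOfArrayAndArgs) {
  const std::vector<int> allowed = {2, 4, 6};
  EXPECT_THAT(4, AnyOfArray(allowed));  // equals some member
  const std::vector<Matcher<int>> bounds = {Gt(0), Lt(10)};
  EXPECT_THAT(5, AllOfArray(bounds));   // matches every member
  // Args<0, 1> selects tuple fields before applying the inner matcher.
  std::tuple<int, int> t(1, 2);
  EXPECT_THAT(t, Args<0, 1>(Lt()));     // 1 < 2
}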
-// Note that to compare an optional type variable against nullopt you should -// use Eq(nullopt) and not Eq(Optional(nullopt)). The latter implies that the -// optional value contains an optional itself. -template <typename ValueMatcher> -inline internal::OptionalMatcher<ValueMatcher> Optional( - const ValueMatcher& value_matcher) { - return internal::OptionalMatcher<ValueMatcher>(value_matcher); -} - -// Returns a matcher that matches the value of a absl::any type variable. -template <typename T> -PolymorphicMatcher<internal::any_cast_matcher::AnyCastMatcher<T>> AnyWith( - const Matcher<const T&>& matcher) { - return MakePolymorphicMatcher( - internal::any_cast_matcher::AnyCastMatcher<T>(matcher)); -} - -// Returns a matcher that matches the value of a variant<> type variable. -// The matcher implementation uses ADL to find the holds_alternative and get -// functions. -// It is compatible with std::variant. -template <typename T> -PolymorphicMatcher<internal::variant_matcher::VariantMatcher<T>> VariantWith( - const Matcher<const T&>& matcher) { - return MakePolymorphicMatcher( - internal::variant_matcher::VariantMatcher<T>(matcher)); -} - -#if GTEST_HAS_EXCEPTIONS - -// Anything inside the `internal` namespace is internal to the implementation -// and must not be used in user code! -namespace internal { - -class WithWhatMatcherImpl { - public: - WithWhatMatcherImpl(Matcher<std::string> matcher) - : matcher_(std::move(matcher)) {} - - void DescribeTo(std::ostream* os) const { - *os << "contains .what() that "; - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(std::ostream* os) const { - *os << "contains .what() that does not "; - matcher_.DescribeTo(os); - } - - template <typename Err> - bool MatchAndExplain(const Err& err, MatchResultListener* listener) const { - *listener << "which contains .what() (of value = " << err.what() - << ") that "; - return matcher_.MatchAndExplain(err.what(), listener); - } - - private: - const Matcher<std::string> matcher_; -}; - -inline PolymorphicMatcher<WithWhatMatcherImpl> WithWhat( - Matcher<std::string> m) { - return MakePolymorphicMatcher(WithWhatMatcherImpl(std::move(m))); -} - -template <typename Err> -class ExceptionMatcherImpl { - class NeverThrown { - public: - const char* what() const noexcept { - return "this exception should never be thrown"; - } - }; - - // If the matchee raises an exception of a wrong type, we'd like to - // catch it and print its message and type. To do that, we add an additional - // catch clause: - // - // try { ... } - // catch (const Err&) { /* an expected exception */ } - // catch (const std::exception&) { /* exception of a wrong type */ } - // - // However, if the `Err` itself is `std::exception`, we'd end up with two - // identical `catch` clauses: - // - // try { ... } - // catch (const std::exception&) { /* an expected exception */ } - // catch (const std::exception&) { /* exception of a wrong type */ } - // - // This can cause a warning or an error in some compilers. To resolve - // the issue, we use a fake error type whenever `Err` is `std::exception`: - // - // try { ... 
} - // catch (const std::exception&) { /* an expected exception */ } - // catch (const NeverThrown&) { /* exception of a wrong type */ } - using DefaultExceptionType = typename std::conditional< - std::is_same<typename std::remove_cv< - typename std::remove_reference<Err>::type>::type, - std::exception>::value, - const NeverThrown&, const std::exception&>::type; - - public: - ExceptionMatcherImpl(Matcher<const Err&> matcher) - : matcher_(std::move(matcher)) {} - - void DescribeTo(std::ostream* os) const { - *os << "throws an exception which is a " << GetTypeName<Err>(); - *os << " which "; - matcher_.DescribeTo(os); - } - - void DescribeNegationTo(std::ostream* os) const { - *os << "throws an exception which is not a " << GetTypeName<Err>(); - *os << " which "; - matcher_.DescribeNegationTo(os); - } - - template <typename T> - bool MatchAndExplain(T&& x, MatchResultListener* listener) const { - try { - (void)(std::forward<T>(x)()); - } catch (const Err& err) { - *listener << "throws an exception which is a " << GetTypeName<Err>(); - *listener << " "; - return matcher_.MatchAndExplain(err, listener); - } catch (DefaultExceptionType err) { -#if GTEST_HAS_RTTI - *listener << "throws an exception of type " << GetTypeName(typeid(err)); - *listener << " "; -#else - *listener << "throws an std::exception-derived type "; -#endif - *listener << "with description \"" << err.what() << "\""; - return false; - } catch (...) { - *listener << "throws an exception of an unknown type"; - return false; - } - - *listener << "does not throw any exception"; - return false; - } - - private: - const Matcher<const Err&> matcher_; -}; - -} // namespace internal - -// Throws() -// Throws(exceptionMatcher) -// ThrowsMessage(messageMatcher) -// -// This matcher accepts a callable and verifies that when invoked, it throws -// an exception with the given type and properties. -// -// Examples: -// -// EXPECT_THAT( -// []() { throw std::runtime_error("message"); }, -// Throws<std::runtime_error>()); -// -// EXPECT_THAT( -// []() { throw std::runtime_error("message"); }, -// ThrowsMessage<std::runtime_error>(HasSubstr("message"))); -// -// EXPECT_THAT( -// []() { throw std::runtime_error("message"); }, -// Throws<std::runtime_error>( -// Property(&std::runtime_error::what, HasSubstr("message")))); - -template <typename Err> -PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> Throws() { - return MakePolymorphicMatcher( - internal::ExceptionMatcherImpl<Err>(A<const Err&>())); -} - -template <typename Err, typename ExceptionMatcher> -PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> Throws( - const ExceptionMatcher& exception_matcher) { - // Using matcher cast allows users to pass a matcher of a more broad type. - // For example user may want to pass Matcher<std::exception> - // to Throws<std::runtime_error>, or Matcher<int64> to Throws<int32>. - return MakePolymorphicMatcher(internal::ExceptionMatcherImpl<Err>( - SafeMatcherCast<const Err&>(exception_matcher))); -} - -template <typename Err, typename MessageMatcher> -PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> ThrowsMessage( - MessageMatcher&& message_matcher) { - static_assert(std::is_base_of<std::exception, Err>::value, - "expected an std::exception-derived type"); - return Throws<Err>(internal::WithWhat( - MatcherCast<std::string>(std::forward<MessageMatcher>(message_matcher)))); -} - -#endif // GTEST_HAS_EXCEPTIONS - -// These macros allow using matchers to check values in Google Test -// tests. 
ASSERT_THAT(value, matcher) and EXPECT_THAT(value, matcher) -// succeed if and only if the value matches the matcher. If the assertion -// fails, the value and the description of the matcher will be printed. -#define ASSERT_THAT(value, matcher) \ - ASSERT_PRED_FORMAT1( \ - ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value) -#define EXPECT_THAT(value, matcher) \ - EXPECT_PRED_FORMAT1( \ - ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value) - -// MATCHER* macros itself are listed below. -#define MATCHER(name, description) \ - class name##Matcher \ - : public ::testing::internal::MatcherBaseImpl<name##Matcher> { \ - public: \ - template <typename arg_type> \ - class gmock_Impl : public ::testing::MatcherInterface<const arg_type&> { \ - public: \ - gmock_Impl() {} \ - bool MatchAndExplain( \ - const arg_type& arg, \ - ::testing::MatchResultListener* result_listener) const override; \ - void DescribeTo(::std::ostream* gmock_os) const override { \ - *gmock_os << FormatDescription(false); \ - } \ - void DescribeNegationTo(::std::ostream* gmock_os) const override { \ - *gmock_os << FormatDescription(true); \ - } \ - \ - private: \ - ::std::string FormatDescription(bool negation) const { \ - /* NOLINTNEXTLINE readability-redundant-string-init */ \ - ::std::string gmock_description = (description); \ - if (!gmock_description.empty()) { \ - return gmock_description; \ - } \ - return ::testing::internal::FormatMatcherDescription(negation, #name, \ - {}, {}); \ - } \ - }; \ - }; \ - GTEST_ATTRIBUTE_UNUSED_ inline name##Matcher name() { return {}; } \ - template <typename arg_type> \ - bool name##Matcher::gmock_Impl<arg_type>::MatchAndExplain( \ - const arg_type& arg, \ - ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_) \ - const - -#define MATCHER_P(name, p0, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP, description, (#p0), (p0)) -#define MATCHER_P2(name, p0, p1, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP2, description, (#p0, #p1), \ - (p0, p1)) -#define MATCHER_P3(name, p0, p1, p2, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP3, description, (#p0, #p1, #p2), \ - (p0, p1, p2)) -#define MATCHER_P4(name, p0, p1, p2, p3, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP4, description, \ - (#p0, #p1, #p2, #p3), (p0, p1, p2, p3)) -#define MATCHER_P5(name, p0, p1, p2, p3, p4, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP5, description, \ - (#p0, #p1, #p2, #p3, #p4), (p0, p1, p2, p3, p4)) -#define MATCHER_P6(name, p0, p1, p2, p3, p4, p5, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP6, description, \ - (#p0, #p1, #p2, #p3, #p4, #p5), \ - (p0, p1, p2, p3, p4, p5)) -#define MATCHER_P7(name, p0, p1, p2, p3, p4, p5, p6, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP7, description, \ - (#p0, #p1, #p2, #p3, #p4, #p5, #p6), \ - (p0, p1, p2, p3, p4, p5, p6)) -#define MATCHER_P8(name, p0, p1, p2, p3, p4, p5, p6, p7, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP8, description, \ - (#p0, #p1, #p2, #p3, #p4, #p5, #p6, #p7), \ - (p0, p1, p2, p3, p4, p5, p6, p7)) -#define MATCHER_P9(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP9, description, \ - (#p0, #p1, #p2, #p3, #p4, #p5, #p6, #p7, #p8), \ - (p0, p1, p2, p3, p4, p5, p6, p7, p8)) -#define MATCHER_P10(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, description) \ - GMOCK_INTERNAL_MATCHER(name, name##MatcherP10, description, \ - (#p0, 
#p1, #p2, #p3, #p4, #p5, #p6, #p7, #p8, #p9), \ - (p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)) - -#define GMOCK_INTERNAL_MATCHER(name, full_name, description, arg_names, args) \ - template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)> \ - class full_name : public ::testing::internal::MatcherBaseImpl< \ - full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>> { \ - public: \ - using full_name::MatcherBaseImpl::MatcherBaseImpl; \ - template <typename arg_type> \ - class gmock_Impl : public ::testing::MatcherInterface<const arg_type&> { \ - public: \ - explicit gmock_Impl(GMOCK_INTERNAL_MATCHER_FUNCTION_ARGS(args)) \ - : GMOCK_INTERNAL_MATCHER_FORWARD_ARGS(args) {} \ - bool MatchAndExplain( \ - const arg_type& arg, \ - ::testing::MatchResultListener* result_listener) const override; \ - void DescribeTo(::std::ostream* gmock_os) const override { \ - *gmock_os << FormatDescription(false); \ - } \ - void DescribeNegationTo(::std::ostream* gmock_os) const override { \ - *gmock_os << FormatDescription(true); \ - } \ - GMOCK_INTERNAL_MATCHER_MEMBERS(args) \ - \ - private: \ - ::std::string FormatDescription(bool negation) const { \ - ::std::string gmock_description = (description); \ - if (!gmock_description.empty()) { \ - return gmock_description; \ - } \ - return ::testing::internal::FormatMatcherDescription( \ - negation, #name, {GMOCK_PP_REMOVE_PARENS(arg_names)}, \ - ::testing::internal::UniversalTersePrintTupleFieldsToStrings( \ - ::std::tuple<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>( \ - GMOCK_INTERNAL_MATCHER_MEMBERS_USAGE(args)))); \ - } \ - }; \ - }; \ - template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)> \ - inline full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)> name( \ - GMOCK_INTERNAL_MATCHER_FUNCTION_ARGS(args)) { \ - return full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>( \ - GMOCK_INTERNAL_MATCHER_ARGS_USAGE(args)); \ - } \ - template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)> \ - template <typename arg_type> \ - bool full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>::gmock_Impl< \ - arg_type>::MatchAndExplain(const arg_type& arg, \ - ::testing::MatchResultListener* \ - result_listener GTEST_ATTRIBUTE_UNUSED_) \ - const - -#define GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args) \ - GMOCK_PP_TAIL( \ - GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAM, , args)) -#define GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAM(i_unused, data_unused, arg) \ - , typename arg##_type - -#define GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args) \ - GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_TYPE_PARAM, , args)) -#define GMOCK_INTERNAL_MATCHER_TYPE_PARAM(i_unused, data_unused, arg) \ - , arg##_type - -#define GMOCK_INTERNAL_MATCHER_FUNCTION_ARGS(args) \ - GMOCK_PP_TAIL(dummy_first GMOCK_PP_FOR_EACH( \ - GMOCK_INTERNAL_MATCHER_FUNCTION_ARG, , args)) -#define GMOCK_INTERNAL_MATCHER_FUNCTION_ARG(i, data_unused, arg) \ - , arg##_type gmock_p##i - -#define GMOCK_INTERNAL_MATCHER_FORWARD_ARGS(args) \ - GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_FORWARD_ARG, , args)) -#define GMOCK_INTERNAL_MATCHER_FORWARD_ARG(i, data_unused, arg) \ - , arg(::std::forward<arg##_type>(gmock_p##i)) - -#define GMOCK_INTERNAL_MATCHER_MEMBERS(args) \ - GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_MEMBER, , args) -#define GMOCK_INTERNAL_MATCHER_MEMBER(i_unused, data_unused, arg) \ - const arg##_type arg; - -#define GMOCK_INTERNAL_MATCHER_MEMBERS_USAGE(args) \ - GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_MEMBER_USAGE, , args)) -#define GMOCK_INTERNAL_MATCHER_MEMBER_USAGE(i_unused, data_unused, arg) , 
arg - -#define GMOCK_INTERNAL_MATCHER_ARGS_USAGE(args) \ - GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_ARG_USAGE, , args)) -#define GMOCK_INTERNAL_MATCHER_ARG_USAGE(i, data_unused, arg_unused) \ - , gmock_p##i - -// To prevent ADL on certain functions we put them on a separate namespace. -using namespace no_adl; // NOLINT - -} // namespace testing - -GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046 - -// Include any custom callback matchers added by the local installation. -// We must include this header at the end to make sure it can use the -// declarations from this file. -#include "gmock/internal/custom/gmock-matchers.h" - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h b/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h deleted file mode 100644 index 148ac01721..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h +++ /dev/null @@ -1,662 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file implements some commonly used variadic actions. - -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_ - -#include <memory> -#include <utility> - -#include "gmock/gmock-actions.h" -#include "gmock/internal/gmock-port.h" - -// Include any custom callback actions added by the local installation. -#include "gmock/internal/custom/gmock-generated-actions.h" - -// Sometimes you want to give an action explicit template parameters -// that cannot be inferred from its value parameters. ACTION() and -// ACTION_P*() don't support that. ACTION_TEMPLATE() remedies that -// and can be viewed as an extension to ACTION() and ACTION_P*(). 
-// -// The syntax: -// -// ACTION_TEMPLATE(ActionName, -// HAS_m_TEMPLATE_PARAMS(kind1, name1, ..., kind_m, name_m), -// AND_n_VALUE_PARAMS(p1, ..., p_n)) { statements; } -// -// defines an action template that takes m explicit template -// parameters and n value parameters. name_i is the name of the i-th -// template parameter, and kind_i specifies whether it's a typename, -// an integral constant, or a template. p_i is the name of the i-th -// value parameter. -// -// Example: -// -// // DuplicateArg<k, T>(output) converts the k-th argument of the mock -// // function to type T and copies it to *output. -// ACTION_TEMPLATE(DuplicateArg, -// HAS_2_TEMPLATE_PARAMS(int, k, typename, T), -// AND_1_VALUE_PARAMS(output)) { -// *output = T(::std::get<k>(args)); -// } -// ... -// int n; -// EXPECT_CALL(mock, Foo(_, _)) -// .WillOnce(DuplicateArg<1, unsigned char>(&n)); -// -// To create an instance of an action template, write: -// -// ActionName<t1, ..., t_m>(v1, ..., v_n) -// -// where the ts are the template arguments and the vs are the value -// arguments. The value argument types are inferred by the compiler. -// If you want to explicitly specify the value argument types, you can -// provide additional template arguments: -// -// ActionName<t1, ..., t_m, u1, ..., u_k>(v1, ..., v_n) -// -// where u_i is the desired type of v_i. -// -// ACTION_TEMPLATE and ACTION/ACTION_P* can be overloaded on the -// number of value parameters, but not on the number of template -// parameters. Without the restriction, the meaning of the following -// is unclear: -// -// OverloadedAction<int, bool>(x); -// -// Are we using a single-template-parameter action where 'bool' refers -// to the type of x, or are we using a two-template-parameter action -// where the compiler is asked to infer the type of x? -// -// Implementation notes: -// -// GMOCK_INTERNAL_*_HAS_m_TEMPLATE_PARAMS and -// GMOCK_INTERNAL_*_AND_n_VALUE_PARAMS are internal macros for -// implementing ACTION_TEMPLATE. The main trick we use is to create -// new macro invocations when expanding a macro. For example, we have -// -// #define ACTION_TEMPLATE(name, template_params, value_params) -// ... GMOCK_INTERNAL_DECL_##template_params ... -// -// which causes ACTION_TEMPLATE(..., HAS_1_TEMPLATE_PARAMS(typename, T), ...) -// to expand to -// -// ... GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS(typename, T) ... -// -// Since GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS is a macro, the -// preprocessor will continue to expand it to -// -// ... typename T ... -// -// This technique conforms to the C++ standard and is portable. It -// allows us to implement action templates using O(N) code, where N is -// the maximum number of template/value parameters supported. Without -// using it, we'd have to devote O(N^2) amount of code to implement all -// combinations of m and n. - -// Declares the template parameters. 
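To make the expansion scheme described in the implementation notes concrete, the following paraphrased trace shows how the macros defined below unfold for the DuplicateArg example; it illustrates the preprocessing steps and is not a verbatim compiler dump:

// ACTION_TEMPLATE(DuplicateArg,
//                 HAS_2_TEMPLATE_PARAMS(int, k, typename, T),
//                 AND_1_VALUE_PARAMS(output))
//
// GMOCK_INTERNAL_DECL_##template_params
//   -> GMOCK_INTERNAL_DECL_HAS_2_TEMPLATE_PARAMS(int, k, typename, T)
//   -> int k, typename T                  // template parameter declarations
// GMOCK_INTERNAL_LIST_##template_params
//   -> GMOCK_INTERNAL_LIST_HAS_2_TEMPLATE_PARAMS(int, k, typename, T)
//   -> k, T                               // template argument list
// GMOCK_INTERNAL_DECL_TYPE_##value_params
//   -> GMOCK_INTERNAL_DECL_TYPE_AND_1_VALUE_PARAMS(output)
//   -> , typename output_type             // deduced value parameter type
// GMOCK_INTERNAL_LIST_##value_params
//   -> GMOCK_INTERNAL_LIST_AND_1_VALUE_PARAMS(output)
//   -> output                             // value argument list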
-#define GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS(kind0, name0) kind0 name0 -#define GMOCK_INTERNAL_DECL_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, name1) \ - kind0 name0, kind1 name1 -#define GMOCK_INTERNAL_DECL_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ - kind2, name2) \ - kind0 name0, kind1 name1, kind2 name2 -#define GMOCK_INTERNAL_DECL_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ - kind2, name2, kind3, name3) \ - kind0 name0, kind1 name1, kind2 name2, kind3 name3 -#define GMOCK_INTERNAL_DECL_HAS_5_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4) \ - kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4 -#define GMOCK_INTERNAL_DECL_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ - kind2, name2, kind3, name3, \ - kind4, name4, kind5, name5) \ - kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, kind5 name5 -#define GMOCK_INTERNAL_DECL_HAS_7_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \ - kind5, name5, kind6, name6) \ - kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \ - kind5 name5, kind6 name6 -#define GMOCK_INTERNAL_DECL_HAS_8_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \ - kind5, name5, kind6, name6, kind7, name7) \ - kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \ - kind5 name5, kind6 name6, kind7 name7 -#define GMOCK_INTERNAL_DECL_HAS_9_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \ - kind5, name5, kind6, name6, kind7, name7, kind8, name8) \ - kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \ - kind5 name5, kind6 name6, kind7 name7, kind8 name8 -#define GMOCK_INTERNAL_DECL_HAS_10_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \ - kind5, name5, kind6, name6, kind7, name7, kind8, name8, kind9, name9) \ - kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \ - kind5 name5, kind6 name6, kind7 name7, kind8 name8, kind9 name9 - -// Lists the template parameters. 
-#define GMOCK_INTERNAL_LIST_HAS_1_TEMPLATE_PARAMS(kind0, name0) name0 -#define GMOCK_INTERNAL_LIST_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, name1) \ - name0, name1 -#define GMOCK_INTERNAL_LIST_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ - kind2, name2) \ - name0, name1, name2 -#define GMOCK_INTERNAL_LIST_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ - kind2, name2, kind3, name3) \ - name0, name1, name2, name3 -#define GMOCK_INTERNAL_LIST_HAS_5_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4) \ - name0, name1, name2, name3, name4 -#define GMOCK_INTERNAL_LIST_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ - kind2, name2, kind3, name3, \ - kind4, name4, kind5, name5) \ - name0, name1, name2, name3, name4, name5 -#define GMOCK_INTERNAL_LIST_HAS_7_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \ - kind5, name5, kind6, name6) \ - name0, name1, name2, name3, name4, name5, name6 -#define GMOCK_INTERNAL_LIST_HAS_8_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \ - kind5, name5, kind6, name6, kind7, name7) \ - name0, name1, name2, name3, name4, name5, name6, name7 -#define GMOCK_INTERNAL_LIST_HAS_9_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \ - kind5, name5, kind6, name6, kind7, name7, kind8, name8) \ - name0, name1, name2, name3, name4, name5, name6, name7, name8 -#define GMOCK_INTERNAL_LIST_HAS_10_TEMPLATE_PARAMS( \ - kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \ - kind5, name5, kind6, name6, kind7, name7, kind8, name8, kind9, name9) \ - name0, name1, name2, name3, name4, name5, name6, name7, name8, name9 - -// Declares the types of value parameters. 
-#define GMOCK_INTERNAL_DECL_TYPE_AND_0_VALUE_PARAMS() -#define GMOCK_INTERNAL_DECL_TYPE_AND_1_VALUE_PARAMS(p0) , typename p0##_type -#define GMOCK_INTERNAL_DECL_TYPE_AND_2_VALUE_PARAMS(p0, p1) \ - , typename p0##_type, typename p1##_type -#define GMOCK_INTERNAL_DECL_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) \ - , typename p0##_type, typename p1##_type, typename p2##_type -#define GMOCK_INTERNAL_DECL_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \ - , typename p0##_type, typename p1##_type, typename p2##_type, \ - typename p3##_type -#define GMOCK_INTERNAL_DECL_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \ - , typename p0##_type, typename p1##_type, typename p2##_type, \ - typename p3##_type, typename p4##_type -#define GMOCK_INTERNAL_DECL_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \ - , typename p0##_type, typename p1##_type, typename p2##_type, \ - typename p3##_type, typename p4##_type, typename p5##_type -#define GMOCK_INTERNAL_DECL_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ - p6) \ - , typename p0##_type, typename p1##_type, typename p2##_type, \ - typename p3##_type, typename p4##_type, typename p5##_type, \ - typename p6##_type -#define GMOCK_INTERNAL_DECL_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ - p6, p7) \ - , typename p0##_type, typename p1##_type, typename p2##_type, \ - typename p3##_type, typename p4##_type, typename p5##_type, \ - typename p6##_type, typename p7##_type -#define GMOCK_INTERNAL_DECL_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ - p6, p7, p8) \ - , typename p0##_type, typename p1##_type, typename p2##_type, \ - typename p3##_type, typename p4##_type, typename p5##_type, \ - typename p6##_type, typename p7##_type, typename p8##_type -#define GMOCK_INTERNAL_DECL_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ - p6, p7, p8, p9) \ - , typename p0##_type, typename p1##_type, typename p2##_type, \ - typename p3##_type, typename p4##_type, typename p5##_type, \ - typename p6##_type, typename p7##_type, typename p8##_type, \ - typename p9##_type - -// Initializes the value parameters. 
-#define GMOCK_INTERNAL_INIT_AND_0_VALUE_PARAMS() () -#define GMOCK_INTERNAL_INIT_AND_1_VALUE_PARAMS(p0) \ - (p0##_type gmock_p0) : p0(::std::move(gmock_p0)) -#define GMOCK_INTERNAL_INIT_AND_2_VALUE_PARAMS(p0, p1) \ - (p0##_type gmock_p0, p1##_type gmock_p1) \ - : p0(::std::move(gmock_p0)), p1(::std::move(gmock_p1)) -#define GMOCK_INTERNAL_INIT_AND_3_VALUE_PARAMS(p0, p1, p2) \ - (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2) \ - : p0(::std::move(gmock_p0)), \ - p1(::std::move(gmock_p1)), \ - p2(::std::move(gmock_p2)) -#define GMOCK_INTERNAL_INIT_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \ - (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ - p3##_type gmock_p3) \ - : p0(::std::move(gmock_p0)), \ - p1(::std::move(gmock_p1)), \ - p2(::std::move(gmock_p2)), \ - p3(::std::move(gmock_p3)) -#define GMOCK_INTERNAL_INIT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \ - (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ - p3##_type gmock_p3, p4##_type gmock_p4) \ - : p0(::std::move(gmock_p0)), \ - p1(::std::move(gmock_p1)), \ - p2(::std::move(gmock_p2)), \ - p3(::std::move(gmock_p3)), \ - p4(::std::move(gmock_p4)) -#define GMOCK_INTERNAL_INIT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \ - (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ - p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5) \ - : p0(::std::move(gmock_p0)), \ - p1(::std::move(gmock_p1)), \ - p2(::std::move(gmock_p2)), \ - p3(::std::move(gmock_p3)), \ - p4(::std::move(gmock_p4)), \ - p5(::std::move(gmock_p5)) -#define GMOCK_INTERNAL_INIT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \ - (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ - p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ - p6##_type gmock_p6) \ - : p0(::std::move(gmock_p0)), \ - p1(::std::move(gmock_p1)), \ - p2(::std::move(gmock_p2)), \ - p3(::std::move(gmock_p3)), \ - p4(::std::move(gmock_p4)), \ - p5(::std::move(gmock_p5)), \ - p6(::std::move(gmock_p6)) -#define GMOCK_INTERNAL_INIT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \ - (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ - p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ - p6##_type gmock_p6, p7##_type gmock_p7) \ - : p0(::std::move(gmock_p0)), \ - p1(::std::move(gmock_p1)), \ - p2(::std::move(gmock_p2)), \ - p3(::std::move(gmock_p3)), \ - p4(::std::move(gmock_p4)), \ - p5(::std::move(gmock_p5)), \ - p6(::std::move(gmock_p6)), \ - p7(::std::move(gmock_p7)) -#define GMOCK_INTERNAL_INIT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \ - p8) \ - (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ - p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ - p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8) \ - : p0(::std::move(gmock_p0)), \ - p1(::std::move(gmock_p1)), \ - p2(::std::move(gmock_p2)), \ - p3(::std::move(gmock_p3)), \ - p4(::std::move(gmock_p4)), \ - p5(::std::move(gmock_p5)), \ - p6(::std::move(gmock_p6)), \ - p7(::std::move(gmock_p7)), \ - p8(::std::move(gmock_p8)) -#define GMOCK_INTERNAL_INIT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ - p7, p8, p9) \ - (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ - p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ - p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8, \ - p9##_type gmock_p9) \ - : p0(::std::move(gmock_p0)), \ - p1(::std::move(gmock_p1)), \ - p2(::std::move(gmock_p2)), \ - p3(::std::move(gmock_p3)), \ - p4(::std::move(gmock_p4)), \ - p5(::std::move(gmock_p5)), \ - 
p6(::std::move(gmock_p6)), \ - p7(::std::move(gmock_p7)), \ - p8(::std::move(gmock_p8)), \ - p9(::std::move(gmock_p9)) - -// Defines the copy constructor -#define GMOCK_INTERNAL_DEFN_COPY_AND_0_VALUE_PARAMS() \ - {} // Avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82134 -#define GMOCK_INTERNAL_DEFN_COPY_AND_1_VALUE_PARAMS(...) = default; -#define GMOCK_INTERNAL_DEFN_COPY_AND_2_VALUE_PARAMS(...) = default; -#define GMOCK_INTERNAL_DEFN_COPY_AND_3_VALUE_PARAMS(...) = default; -#define GMOCK_INTERNAL_DEFN_COPY_AND_4_VALUE_PARAMS(...) = default; -#define GMOCK_INTERNAL_DEFN_COPY_AND_5_VALUE_PARAMS(...) = default; -#define GMOCK_INTERNAL_DEFN_COPY_AND_6_VALUE_PARAMS(...) = default; -#define GMOCK_INTERNAL_DEFN_COPY_AND_7_VALUE_PARAMS(...) = default; -#define GMOCK_INTERNAL_DEFN_COPY_AND_8_VALUE_PARAMS(...) = default; -#define GMOCK_INTERNAL_DEFN_COPY_AND_9_VALUE_PARAMS(...) = default; -#define GMOCK_INTERNAL_DEFN_COPY_AND_10_VALUE_PARAMS(...) = default; - -// Declares the fields for storing the value parameters. -#define GMOCK_INTERNAL_DEFN_AND_0_VALUE_PARAMS() -#define GMOCK_INTERNAL_DEFN_AND_1_VALUE_PARAMS(p0) p0##_type p0; -#define GMOCK_INTERNAL_DEFN_AND_2_VALUE_PARAMS(p0, p1) \ - p0##_type p0; \ - p1##_type p1; -#define GMOCK_INTERNAL_DEFN_AND_3_VALUE_PARAMS(p0, p1, p2) \ - p0##_type p0; \ - p1##_type p1; \ - p2##_type p2; -#define GMOCK_INTERNAL_DEFN_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \ - p0##_type p0; \ - p1##_type p1; \ - p2##_type p2; \ - p3##_type p3; -#define GMOCK_INTERNAL_DEFN_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \ - p0##_type p0; \ - p1##_type p1; \ - p2##_type p2; \ - p3##_type p3; \ - p4##_type p4; -#define GMOCK_INTERNAL_DEFN_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \ - p0##_type p0; \ - p1##_type p1; \ - p2##_type p2; \ - p3##_type p3; \ - p4##_type p4; \ - p5##_type p5; -#define GMOCK_INTERNAL_DEFN_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \ - p0##_type p0; \ - p1##_type p1; \ - p2##_type p2; \ - p3##_type p3; \ - p4##_type p4; \ - p5##_type p5; \ - p6##_type p6; -#define GMOCK_INTERNAL_DEFN_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \ - p0##_type p0; \ - p1##_type p1; \ - p2##_type p2; \ - p3##_type p3; \ - p4##_type p4; \ - p5##_type p5; \ - p6##_type p6; \ - p7##_type p7; -#define GMOCK_INTERNAL_DEFN_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \ - p8) \ - p0##_type p0; \ - p1##_type p1; \ - p2##_type p2; \ - p3##_type p3; \ - p4##_type p4; \ - p5##_type p5; \ - p6##_type p6; \ - p7##_type p7; \ - p8##_type p8; -#define GMOCK_INTERNAL_DEFN_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ - p7, p8, p9) \ - p0##_type p0; \ - p1##_type p1; \ - p2##_type p2; \ - p3##_type p3; \ - p4##_type p4; \ - p5##_type p5; \ - p6##_type p6; \ - p7##_type p7; \ - p8##_type p8; \ - p9##_type p9; - -// Lists the value parameters. 
-#define GMOCK_INTERNAL_LIST_AND_0_VALUE_PARAMS() -#define GMOCK_INTERNAL_LIST_AND_1_VALUE_PARAMS(p0) p0 -#define GMOCK_INTERNAL_LIST_AND_2_VALUE_PARAMS(p0, p1) p0, p1 -#define GMOCK_INTERNAL_LIST_AND_3_VALUE_PARAMS(p0, p1, p2) p0, p1, p2 -#define GMOCK_INTERNAL_LIST_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0, p1, p2, p3 -#define GMOCK_INTERNAL_LIST_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \ - p0, p1, p2, p3, p4 -#define GMOCK_INTERNAL_LIST_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \ - p0, p1, p2, p3, p4, p5 -#define GMOCK_INTERNAL_LIST_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \ - p0, p1, p2, p3, p4, p5, p6 -#define GMOCK_INTERNAL_LIST_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \ - p0, p1, p2, p3, p4, p5, p6, p7 -#define GMOCK_INTERNAL_LIST_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \ - p8) \ - p0, p1, p2, p3, p4, p5, p6, p7, p8 -#define GMOCK_INTERNAL_LIST_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ - p7, p8, p9) \ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9 - -// Lists the value parameter types. -#define GMOCK_INTERNAL_LIST_TYPE_AND_0_VALUE_PARAMS() -#define GMOCK_INTERNAL_LIST_TYPE_AND_1_VALUE_PARAMS(p0) , p0##_type -#define GMOCK_INTERNAL_LIST_TYPE_AND_2_VALUE_PARAMS(p0, p1) \ - , p0##_type, p1##_type -#define GMOCK_INTERNAL_LIST_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) \ - , p0##_type, p1##_type, p2##_type -#define GMOCK_INTERNAL_LIST_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \ - , p0##_type, p1##_type, p2##_type, p3##_type -#define GMOCK_INTERNAL_LIST_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \ - , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type -#define GMOCK_INTERNAL_LIST_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \ - , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type -#define GMOCK_INTERNAL_LIST_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ - p6) \ - , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, p6##_type -#define GMOCK_INTERNAL_LIST_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ - p6, p7) \ - , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \ - p6##_type, p7##_type -#define GMOCK_INTERNAL_LIST_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ - p6, p7, p8) \ - , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \ - p6##_type, p7##_type, p8##_type -#define GMOCK_INTERNAL_LIST_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ - p6, p7, p8, p9) \ - , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \ - p6##_type, p7##_type, p8##_type, p9##_type - -// Declares the value parameters. 
-#define GMOCK_INTERNAL_DECL_AND_0_VALUE_PARAMS() -#define GMOCK_INTERNAL_DECL_AND_1_VALUE_PARAMS(p0) p0##_type p0 -#define GMOCK_INTERNAL_DECL_AND_2_VALUE_PARAMS(p0, p1) \ - p0##_type p0, p1##_type p1 -#define GMOCK_INTERNAL_DECL_AND_3_VALUE_PARAMS(p0, p1, p2) \ - p0##_type p0, p1##_type p1, p2##_type p2 -#define GMOCK_INTERNAL_DECL_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \ - p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3 -#define GMOCK_INTERNAL_DECL_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \ - p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4 -#define GMOCK_INTERNAL_DECL_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \ - p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \ - p5##_type p5 -#define GMOCK_INTERNAL_DECL_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \ - p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \ - p5##_type p5, p6##_type p6 -#define GMOCK_INTERNAL_DECL_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \ - p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \ - p5##_type p5, p6##_type p6, p7##_type p7 -#define GMOCK_INTERNAL_DECL_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \ - p8) \ - p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \ - p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8 -#define GMOCK_INTERNAL_DECL_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ - p7, p8, p9) \ - p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \ - p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8, p9##_type p9 - -// The suffix of the class template implementing the action template. -#define GMOCK_INTERNAL_COUNT_AND_0_VALUE_PARAMS() -#define GMOCK_INTERNAL_COUNT_AND_1_VALUE_PARAMS(p0) P -#define GMOCK_INTERNAL_COUNT_AND_2_VALUE_PARAMS(p0, p1) P2 -#define GMOCK_INTERNAL_COUNT_AND_3_VALUE_PARAMS(p0, p1, p2) P3 -#define GMOCK_INTERNAL_COUNT_AND_4_VALUE_PARAMS(p0, p1, p2, p3) P4 -#define GMOCK_INTERNAL_COUNT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) P5 -#define GMOCK_INTERNAL_COUNT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) P6 -#define GMOCK_INTERNAL_COUNT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) P7 -#define GMOCK_INTERNAL_COUNT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ - p7) \ - P8 -#define GMOCK_INTERNAL_COUNT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ - p7, p8) \ - P9 -#define GMOCK_INTERNAL_COUNT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ - p7, p8, p9) \ - P10 - -// The name of the class template implementing the action template. 
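The COUNT macros above supply the P/P2/.../P10 suffix consumed by GMOCK_ACTION_CLASS_ (defined just below); a paraphrased expansion for a hypothetical two-parameter action named Foo:

// GMOCK_ACTION_CLASS_(Foo, AND_2_VALUE_PARAMS(a, b))
//   -> GTEST_CONCAT_TOKEN_(FooAction,
//                          GMOCK_INTERNAL_COUNT_AND_2_VALUE_PARAMS(a, b))
//   -> GTEST_CONCAT_TOKEN_(FooAction, P2)
//   -> FooActionP2   // the class template implementing ACTION_P2(Foo, a, b)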
-#define GMOCK_ACTION_CLASS_(name, value_params) \ - GTEST_CONCAT_TOKEN_(name##Action, GMOCK_INTERNAL_COUNT_##value_params) - -#define ACTION_TEMPLATE(name, template_params, value_params) \ - template <GMOCK_INTERNAL_DECL_##template_params \ - GMOCK_INTERNAL_DECL_TYPE_##value_params> \ - class GMOCK_ACTION_CLASS_(name, value_params) { \ - public: \ - explicit GMOCK_ACTION_CLASS_(name, value_params)( \ - GMOCK_INTERNAL_DECL_##value_params) \ - GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), \ - = default; \ - , \ - : impl_(std::make_shared<gmock_Impl>( \ - GMOCK_INTERNAL_LIST_##value_params)){}) \ - GMOCK_ACTION_CLASS_(name, value_params)(const GMOCK_ACTION_CLASS_( \ - name, value_params) &) noexcept GMOCK_INTERNAL_DEFN_COPY_ \ - ##value_params GMOCK_ACTION_CLASS_(name, value_params)( \ - GMOCK_ACTION_CLASS_(name, value_params) &&) noexcept \ - GMOCK_INTERNAL_DEFN_COPY_##value_params template <typename F> \ - operator ::testing::Action<F>() const { \ - return GMOCK_PP_IF( \ - GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), \ - (::testing::internal::MakeAction<F, gmock_Impl>()), \ - (::testing::internal::MakeAction<F>(impl_))); \ - } \ - \ - private: \ - class gmock_Impl { \ - public: \ - explicit gmock_Impl GMOCK_INTERNAL_INIT_##value_params {} \ - template <typename function_type, typename return_type, \ - typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \ - return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \ - GMOCK_INTERNAL_DEFN_##value_params \ - }; \ - GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), , \ - std::shared_ptr<const gmock_Impl> impl_;) \ - }; \ - template <GMOCK_INTERNAL_DECL_##template_params \ - GMOCK_INTERNAL_DECL_TYPE_##value_params> \ - GMOCK_ACTION_CLASS_( \ - name, value_params)<GMOCK_INTERNAL_LIST_##template_params \ - GMOCK_INTERNAL_LIST_TYPE_##value_params> \ - name(GMOCK_INTERNAL_DECL_##value_params) GTEST_MUST_USE_RESULT_; \ - template <GMOCK_INTERNAL_DECL_##template_params \ - GMOCK_INTERNAL_DECL_TYPE_##value_params> \ - inline GMOCK_ACTION_CLASS_( \ - name, value_params)<GMOCK_INTERNAL_LIST_##template_params \ - GMOCK_INTERNAL_LIST_TYPE_##value_params> \ - name(GMOCK_INTERNAL_DECL_##value_params) { \ - return GMOCK_ACTION_CLASS_( \ - name, value_params)<GMOCK_INTERNAL_LIST_##template_params \ - GMOCK_INTERNAL_LIST_TYPE_##value_params>( \ - GMOCK_INTERNAL_LIST_##value_params); \ - } \ - template <GMOCK_INTERNAL_DECL_##template_params \ - GMOCK_INTERNAL_DECL_TYPE_##value_params> \ - template <typename function_type, typename return_type, typename args_type, \ - GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \ - return_type GMOCK_ACTION_CLASS_( \ - name, value_params)<GMOCK_INTERNAL_LIST_##template_params \ - GMOCK_INTERNAL_LIST_TYPE_##value_params>:: \ - gmock_Impl::gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) \ - const - -namespace testing { - -// The ACTION*() macros trigger warning C4100 (unreferenced formal -// parameter) in MSVC with -W4. Unfortunately they cannot be fixed in -// the macro definition, as the warnings are generated when the macro -// is expanded and macro expansion cannot contain #pragma. Therefore -// we suppress them here. -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4100) -#endif - -namespace internal { - -// internal::InvokeArgument - a helper for InvokeArgument action. -// The basic overloads are provided here for generic functors. 
-// Overloads for other custom-callables are provided in the -// internal/custom/gmock-generated-actions.h header. -template <typename F, typename... Args> -auto InvokeArgument(F f, Args... args) -> decltype(f(args...)) { - return f(args...); -} - -template <std::size_t index, typename... Params> -struct InvokeArgumentAction { - template <typename... Args, - typename = typename std::enable_if<(index < sizeof...(Args))>::type> - auto operator()(Args&&... args) const -> decltype(internal::InvokeArgument( - std::get<index>(std::forward_as_tuple(std::forward<Args>(args)...)), - std::declval<const Params&>()...)) { - internal::FlatTuple<Args&&...> args_tuple(FlatTupleConstructTag{}, - std::forward<Args>(args)...); - return params.Apply([&](const Params&... unpacked_params) { - auto&& callable = args_tuple.template Get<index>(); - return internal::InvokeArgument( - std::forward<decltype(callable)>(callable), unpacked_params...); - }); - } - - internal::FlatTuple<Params...> params; -}; - -} // namespace internal - -// The InvokeArgument<N>(a1, a2, ..., a_k) action invokes the N-th -// (0-based) argument, which must be a k-ary callable, of the mock -// function, with arguments a1, a2, ..., a_k. -// -// Notes: -// -// 1. The arguments are passed by value by default. If you need to -// pass an argument by reference, wrap it inside std::ref(). For -// example, -// -// InvokeArgument<1>(5, string("Hello"), std::ref(foo)) -// -// passes 5 and string("Hello") by value, and passes foo by -// reference. -// -// 2. If the callable takes an argument by reference but std::ref() is -// not used, it will receive the reference to a copy of the value, -// instead of the original value. For example, when the 0-th -// argument of the mock function takes a const string&, the action -// -// InvokeArgument<0>(string("Hello")) -// -// makes a copy of the temporary string("Hello") object and passes a -// reference of the copy, instead of the original temporary object, -// to the callable. This makes it easy for a user to define an -// InvokeArgument action from temporary values and have it performed -// later. -template <std::size_t index, typename... Params> -internal::InvokeArgumentAction<index, typename std::decay<Params>::type...> -InvokeArgument(Params&&... params) { - return {internal::FlatTuple<typename std::decay<Params>::type...>( - internal::FlatTupleConstructTag{}, std::forward<Params>(params)...)}; -} - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -} // namespace testing - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h b/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h deleted file mode 100644 index 47aaf98461..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2013, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. 
nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file implements some matchers that depend on gmock-matchers.h. -// -// Note that tests are implemented in gmock-matchers_test.cc rather than -// gmock-more-matchers-test.cc. - -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_ - -#include "gmock/gmock-matchers.h" - -namespace testing { - -// Silence C4100 (unreferenced formal -// parameter) for MSVC -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4100) -#if (_MSC_VER == 1900) -// and silence C4800 (C4800: 'int *const ': forcing value -// to bool 'true' or 'false') for MSVC 14 -#pragma warning(disable : 4800) -#endif -#endif - -// Defines a matcher that matches an empty container. The container must -// support both size() and empty(), which all STL-like containers provide. -MATCHER(IsEmpty, negation ? "isn't empty" : "is empty") { - if (arg.empty()) { - return true; - } - *result_listener << "whose size is " << arg.size(); - return false; -} - -// Define a matcher that matches a value that evaluates in boolean -// context to true. Useful for types that define "explicit operator -// bool" operators and so can't be compared for equality with true -// and false. -MATCHER(IsTrue, negation ? "is false" : "is true") { - return static_cast<bool>(arg); -} - -// Define a matcher that matches a value that evaluates in boolean -// context to false. Useful for types that define "explicit operator -// bool" operators and so can't be compared for equality with true -// and false. -MATCHER(IsFalse, negation ? "is true" : "is false") { - return !static_cast<bool>(arg); -} - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -} // namespace testing - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h b/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h deleted file mode 100644 index 4f0eb35db7..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Implements class templates NiceMock, NaggyMock, and StrictMock. -// -// Given a mock class MockFoo that is created using Google Mock, -// NiceMock<MockFoo> is a subclass of MockFoo that allows -// uninteresting calls (i.e. calls to mock methods that have no -// EXPECT_CALL specs), NaggyMock<MockFoo> is a subclass of MockFoo -// that prints a warning when an uninteresting call occurs, and -// StrictMock<MockFoo> is a subclass of MockFoo that treats all -// uninteresting calls as errors. -// -// Currently a mock is naggy by default, so MockFoo and -// NaggyMock<MockFoo> behave like the same. However, we will soon -// switch the default behavior of mocks to be nice, as that in general -// leads to more maintainable tests. When that happens, MockFoo will -// stop behaving like NaggyMock<MockFoo> and start behaving like -// NiceMock<MockFoo>. -// -// NiceMock, NaggyMock, and StrictMock "inherit" the constructors of -// their respective base class. Therefore you can write -// NiceMock<MockFoo>(5, "a") to construct a nice mock where MockFoo -// has a constructor that accepts (int, const char*), for example. -// -// A known limitation is that NiceMock<MockFoo>, NaggyMock<MockFoo>, -// and StrictMock<MockFoo> only works for mock methods defined using -// the MOCK_METHOD* family of macros DIRECTLY in the MockFoo class. -// If a mock method is defined in a base class of MockFoo, the "nice" -// or "strict" modifier may not affect it, depending on the compiler. -// In particular, nesting NiceMock, NaggyMock, and StrictMock is NOT -// supported. 
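As a concrete illustration of the three strictness wrappers documented above, a minimal test could look like the sketch below. Foo, MockFoo, and Bar() are hypothetical names invented for this sketch; they do not appear in the header itself.

    #include "gmock/gmock.h"
    #include "gtest/gtest.h"

    struct Foo {
      virtual ~Foo() = default;
      virtual int Bar() = 0;
    };

    struct MockFoo : Foo {
      MOCK_METHOD(int, Bar, (), (override));
    };

    TEST(StrictnessSketch, UninterestingCalls) {
      ::testing::NiceMock<MockFoo> nice;      // uninteresting calls are silently allowed
      ::testing::NaggyMock<MockFoo> naggy;    // uninteresting calls print a warning
      ::testing::StrictMock<MockFoo> strict;  // uninteresting calls fail the test

      nice.Bar();   // no EXPECT_CALL specs, but NiceMock permits the call
      naggy.Bar();  // also permitted, but reported as a warning

      EXPECT_CALL(strict, Bar()).WillOnce(::testing::Return(1));
      EXPECT_EQ(strict.Bar(), 1);  // only the expected call is legal on a StrictMock
    }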
- -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_ - -#include <cstdint> -#include <type_traits> - -#include "gmock/gmock-spec-builders.h" -#include "gmock/internal/gmock-port.h" - -namespace testing { -template <class MockClass> -class NiceMock; -template <class MockClass> -class NaggyMock; -template <class MockClass> -class StrictMock; - -namespace internal { -template <typename T> -std::true_type StrictnessModifierProbe(const NiceMock<T>&); -template <typename T> -std::true_type StrictnessModifierProbe(const NaggyMock<T>&); -template <typename T> -std::true_type StrictnessModifierProbe(const StrictMock<T>&); -std::false_type StrictnessModifierProbe(...); - -template <typename T> -constexpr bool HasStrictnessModifier() { - return decltype(StrictnessModifierProbe(std::declval<const T&>()))::value; -} - -// Base classes that register and deregister with testing::Mock to alter the -// default behavior around uninteresting calls. Inheriting from one of these -// classes first and then MockClass ensures the MockClass constructor is run -// after registration, and that the MockClass destructor runs before -// deregistration. This guarantees that MockClass's constructor and destructor -// run with the same level of strictness as its instance methods. - -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW && \ - (defined(_MSC_VER) || defined(__clang__)) -// We need to mark these classes with this declspec to ensure that -// the empty base class optimization is performed. -#define GTEST_INTERNAL_EMPTY_BASE_CLASS __declspec(empty_bases) -#else -#define GTEST_INTERNAL_EMPTY_BASE_CLASS -#endif - -template <typename Base> -class NiceMockImpl { - public: - NiceMockImpl() { - ::testing::Mock::AllowUninterestingCalls(reinterpret_cast<uintptr_t>(this)); - } - - ~NiceMockImpl() { - ::testing::Mock::UnregisterCallReaction(reinterpret_cast<uintptr_t>(this)); - } -}; - -template <typename Base> -class NaggyMockImpl { - public: - NaggyMockImpl() { - ::testing::Mock::WarnUninterestingCalls(reinterpret_cast<uintptr_t>(this)); - } - - ~NaggyMockImpl() { - ::testing::Mock::UnregisterCallReaction(reinterpret_cast<uintptr_t>(this)); - } -}; - -template <typename Base> -class StrictMockImpl { - public: - StrictMockImpl() { - ::testing::Mock::FailUninterestingCalls(reinterpret_cast<uintptr_t>(this)); - } - - ~StrictMockImpl() { - ::testing::Mock::UnregisterCallReaction(reinterpret_cast<uintptr_t>(this)); - } -}; - -} // namespace internal - -template <class MockClass> -class GTEST_INTERNAL_EMPTY_BASE_CLASS NiceMock - : private internal::NiceMockImpl<MockClass>, - public MockClass { - public: - static_assert(!internal::HasStrictnessModifier<MockClass>(), - "Can't apply NiceMock to a class hierarchy that already has a " - "strictness modifier. See " - "https://google.github.io/googletest/" - "gmock_cook_book.html#NiceStrictNaggy"); - NiceMock() : MockClass() { - static_assert(sizeof(*this) == sizeof(MockClass), - "The impl subclass shouldn't introduce any padding"); - } - - // Ideally, we would inherit base class's constructors through a using - // declaration, which would preserve their visibility. However, many existing - // tests rely on the fact that current implementation reexports protected - // constructors as public. These tests would need to be cleaned up first. - - // Single argument constructor is special-cased so that it can be - // made explicit. 
- template <typename A> - explicit NiceMock(A&& arg) : MockClass(std::forward<A>(arg)) { - static_assert(sizeof(*this) == sizeof(MockClass), - "The impl subclass shouldn't introduce any padding"); - } - - template <typename TArg1, typename TArg2, typename... An> - NiceMock(TArg1&& arg1, TArg2&& arg2, An&&... args) - : MockClass(std::forward<TArg1>(arg1), std::forward<TArg2>(arg2), - std::forward<An>(args)...) { - static_assert(sizeof(*this) == sizeof(MockClass), - "The impl subclass shouldn't introduce any padding"); - } - - private: - NiceMock(const NiceMock&) = delete; - NiceMock& operator=(const NiceMock&) = delete; -}; - -template <class MockClass> -class GTEST_INTERNAL_EMPTY_BASE_CLASS NaggyMock - : private internal::NaggyMockImpl<MockClass>, - public MockClass { - static_assert(!internal::HasStrictnessModifier<MockClass>(), - "Can't apply NaggyMock to a class hierarchy that already has a " - "strictness modifier. See " - "https://google.github.io/googletest/" - "gmock_cook_book.html#NiceStrictNaggy"); - - public: - NaggyMock() : MockClass() { - static_assert(sizeof(*this) == sizeof(MockClass), - "The impl subclass shouldn't introduce any padding"); - } - - // Ideally, we would inherit base class's constructors through a using - // declaration, which would preserve their visibility. However, many existing - // tests rely on the fact that current implementation reexports protected - // constructors as public. These tests would need to be cleaned up first. - - // Single argument constructor is special-cased so that it can be - // made explicit. - template <typename A> - explicit NaggyMock(A&& arg) : MockClass(std::forward<A>(arg)) { - static_assert(sizeof(*this) == sizeof(MockClass), - "The impl subclass shouldn't introduce any padding"); - } - - template <typename TArg1, typename TArg2, typename... An> - NaggyMock(TArg1&& arg1, TArg2&& arg2, An&&... args) - : MockClass(std::forward<TArg1>(arg1), std::forward<TArg2>(arg2), - std::forward<An>(args)...) { - static_assert(sizeof(*this) == sizeof(MockClass), - "The impl subclass shouldn't introduce any padding"); - } - - private: - NaggyMock(const NaggyMock&) = delete; - NaggyMock& operator=(const NaggyMock&) = delete; -}; - -template <class MockClass> -class GTEST_INTERNAL_EMPTY_BASE_CLASS StrictMock - : private internal::StrictMockImpl<MockClass>, - public MockClass { - public: - static_assert( - !internal::HasStrictnessModifier<MockClass>(), - "Can't apply StrictMock to a class hierarchy that already has a " - "strictness modifier. See " - "https://google.github.io/googletest/" - "gmock_cook_book.html#NiceStrictNaggy"); - StrictMock() : MockClass() { - static_assert(sizeof(*this) == sizeof(MockClass), - "The impl subclass shouldn't introduce any padding"); - } - - // Ideally, we would inherit base class's constructors through a using - // declaration, which would preserve their visibility. However, many existing - // tests rely on the fact that current implementation reexports protected - // constructors as public. These tests would need to be cleaned up first. - - // Single argument constructor is special-cased so that it can be - // made explicit. - template <typename A> - explicit StrictMock(A&& arg) : MockClass(std::forward<A>(arg)) { - static_assert(sizeof(*this) == sizeof(MockClass), - "The impl subclass shouldn't introduce any padding"); - } - - template <typename TArg1, typename TArg2, typename... An> - StrictMock(TArg1&& arg1, TArg2&& arg2, An&&... 
args) - : MockClass(std::forward<TArg1>(arg1), std::forward<TArg2>(arg2), - std::forward<An>(args)...) { - static_assert(sizeof(*this) == sizeof(MockClass), - "The impl subclass shouldn't introduce any padding"); - } - - private: - StrictMock(const StrictMock&) = delete; - StrictMock& operator=(const StrictMock&) = delete; -}; - -#undef GTEST_INTERNAL_EMPTY_BASE_CLASS - -} // namespace testing - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h b/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h deleted file mode 100644 index 45cc605183..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h +++ /dev/null @@ -1,2083 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file implements the ON_CALL() and EXPECT_CALL() macros. -// -// A user can use the ON_CALL() macro to specify the default action of -// a mock method. The syntax is: -// -// ON_CALL(mock_object, Method(argument-matchers)) -// .With(multi-argument-matcher) -// .WillByDefault(action); -// -// where the .With() clause is optional. -// -// A user can use the EXPECT_CALL() macro to specify an expectation on -// a mock method. The syntax is: -// -// EXPECT_CALL(mock_object, Method(argument-matchers)) -// .With(multi-argument-matchers) -// .Times(cardinality) -// .InSequence(sequences) -// .After(expectations) -// .WillOnce(action) -// .WillRepeatedly(action) -// .RetiresOnSaturation(); -// -// where all clauses are optional, and .InSequence()/.After()/ -// .WillOnce() can appear any number of times. 
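The ON_CALL()/EXPECT_CALL() grammar spelled out above is easiest to see in a short example. The sketch below assumes a hypothetical Counter interface and MockCounter mock; only the clause usage is meant to be representative, the names are not from this header.

    #include "gmock/gmock.h"
    #include "gtest/gtest.h"

    using ::testing::_;
    using ::testing::AtLeast;
    using ::testing::Return;

    struct Counter {
      virtual ~Counter() = default;
      virtual int Add(int x) = 0;
    };

    struct MockCounter : Counter {
      MOCK_METHOD(int, Add, (int), (override));
    };

    TEST(SpecBuildersSketch, OnCallAndExpectCall) {
      MockCounter mock;

      // Default action: Add() calls not covered by an expectation return 0.
      ON_CALL(mock, Add(_)).WillByDefault(Return(0));

      // Expectation: Add(5) must be called at least once; the first matching
      // call returns 1, and every later matching call returns 2.
      EXPECT_CALL(mock, Add(5))
          .Times(AtLeast(1))
          .WillOnce(Return(1))
          .WillRepeatedly(Return(2));

      EXPECT_EQ(mock.Add(5), 1);  // consumes the WillOnce() action
      EXPECT_EQ(mock.Add(5), 2);  // falls through to WillRepeatedly()
    }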
- -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_ - -#include <cstdint> -#include <functional> -#include <map> -#include <memory> -#include <set> -#include <sstream> -#include <string> -#include <type_traits> -#include <utility> -#include <vector> - -#include "gmock/gmock-actions.h" -#include "gmock/gmock-cardinalities.h" -#include "gmock/gmock-matchers.h" -#include "gmock/internal/gmock-internal-utils.h" -#include "gmock/internal/gmock-port.h" -#include "gtest/gtest.h" - -#if GTEST_HAS_EXCEPTIONS -#include <stdexcept> // NOLINT -#endif - -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ -/* class A needs to have dll-interface to be used by clients of class B */) - -namespace testing { - -// An abstract handle of an expectation. -class Expectation; - -// A set of expectation handles. -class ExpectationSet; - -// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION -// and MUST NOT BE USED IN USER CODE!!! -namespace internal { - -// Implements a mock function. -template <typename F> -class FunctionMocker; - -// Base class for expectations. -class ExpectationBase; - -// Implements an expectation. -template <typename F> -class TypedExpectation; - -// Helper class for testing the Expectation class template. -class ExpectationTester; - -// Helper classes for implementing NiceMock, StrictMock, and NaggyMock. -template <typename MockClass> -class NiceMockImpl; -template <typename MockClass> -class StrictMockImpl; -template <typename MockClass> -class NaggyMockImpl; - -// Protects the mock object registry (in class Mock), all function -// mockers, and all expectations. -// -// The reason we don't use more fine-grained protection is: when a -// mock function Foo() is called, it needs to consult its expectations -// to see which one should be picked. If another thread is allowed to -// call a mock function (either Foo() or a different one) at the same -// time, it could affect the "retired" attributes of Foo()'s -// expectations when InSequence() is used, and thus affect which -// expectation gets picked. Therefore, we sequence all mock function -// calls to ensure the integrity of the mock objects' states. -GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_gmock_mutex); - -// Abstract base class of FunctionMocker. This is the -// type-agnostic part of the function mocker interface. Its pure -// virtual methods are implemented by FunctionMocker. -class GTEST_API_ UntypedFunctionMockerBase { - public: - UntypedFunctionMockerBase(); - virtual ~UntypedFunctionMockerBase(); - - // Verifies that all expectations on this mock function have been - // satisfied. Reports one or more Google Test non-fatal failures - // and returns false if not. - bool VerifyAndClearExpectationsLocked() - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex); - - // Clears the ON_CALL()s set on this mock function. - virtual void ClearDefaultActionsLocked() - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) = 0; - - // In all of the following Untyped* functions, it's the caller's - // responsibility to guarantee the correctness of the arguments' - // types. - - // Writes a message that the call is uninteresting (i.e. neither - // explicitly expected nor explicitly unexpected) to the given - // ostream. 
- virtual void UntypedDescribeUninterestingCall(const void* untyped_args, - ::std::ostream* os) const - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0; - - // Returns the expectation that matches the given function arguments - // (or NULL is there's no match); when a match is found, - // untyped_action is set to point to the action that should be - // performed (or NULL if the action is "do default"), and - // is_excessive is modified to indicate whether the call exceeds the - // expected number. - virtual const ExpectationBase* UntypedFindMatchingExpectation( - const void* untyped_args, const void** untyped_action, bool* is_excessive, - ::std::ostream* what, ::std::ostream* why) - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0; - - // Prints the given function arguments to the ostream. - virtual void UntypedPrintArgs(const void* untyped_args, - ::std::ostream* os) const = 0; - - // Sets the mock object this mock method belongs to, and registers - // this information in the global mock registry. Will be called - // whenever an EXPECT_CALL() or ON_CALL() is executed on this mock - // method. - void RegisterOwner(const void* mock_obj) GTEST_LOCK_EXCLUDED_(g_gmock_mutex); - - // Sets the mock object this mock method belongs to, and sets the - // name of the mock function. Will be called upon each invocation - // of this mock function. - void SetOwnerAndName(const void* mock_obj, const char* name) - GTEST_LOCK_EXCLUDED_(g_gmock_mutex); - - // Returns the mock object this mock method belongs to. Must be - // called after RegisterOwner() or SetOwnerAndName() has been - // called. - const void* MockObject() const GTEST_LOCK_EXCLUDED_(g_gmock_mutex); - - // Returns the name of this mock method. Must be called after - // SetOwnerAndName() has been called. - const char* Name() const GTEST_LOCK_EXCLUDED_(g_gmock_mutex); - - protected: - typedef std::vector<const void*> UntypedOnCallSpecs; - - using UntypedExpectations = std::vector<std::shared_ptr<ExpectationBase>>; - - // Returns an Expectation object that references and co-owns exp, - // which must be an expectation on this mock function. - Expectation GetHandleOf(ExpectationBase* exp); - - // Address of the mock object this mock method belongs to. Only - // valid after this mock method has been called or - // ON_CALL/EXPECT_CALL has been invoked on it. - const void* mock_obj_; // Protected by g_gmock_mutex. - - // Name of the function being mocked. Only valid after this mock - // method has been called. - const char* name_; // Protected by g_gmock_mutex. - - // All default action specs for this function mocker. - UntypedOnCallSpecs untyped_on_call_specs_; - - // All expectations for this function mocker. - // - // It's undefined behavior to interleave expectations (EXPECT_CALLs - // or ON_CALLs) and mock function calls. Also, the order of - // expectations is important. Therefore it's a logic race condition - // to read/write untyped_expectations_ concurrently. In order for - // tools like tsan to catch concurrent read/write accesses to - // untyped_expectations, we deliberately leave accesses to it - // unprotected. - UntypedExpectations untyped_expectations_; -}; // class UntypedFunctionMockerBase - -// Untyped base class for OnCallSpec<F>. -class UntypedOnCallSpecBase { - public: - // The arguments are the location of the ON_CALL() statement. - UntypedOnCallSpecBase(const char* a_file, int a_line) - : file_(a_file), line_(a_line), last_clause_(kNone) {} - - // Where in the source file was the default action spec defined? 
- const char* file() const { return file_; } - int line() const { return line_; } - - protected: - // Gives each clause in the ON_CALL() statement a name. - enum Clause { - // Do not change the order of the enum members! The run-time - // syntax checking relies on it. - kNone, - kWith, - kWillByDefault - }; - - // Asserts that the ON_CALL() statement has a certain property. - void AssertSpecProperty(bool property, - const std::string& failure_message) const { - Assert(property, file_, line_, failure_message); - } - - // Expects that the ON_CALL() statement has a certain property. - void ExpectSpecProperty(bool property, - const std::string& failure_message) const { - Expect(property, file_, line_, failure_message); - } - - const char* file_; - int line_; - - // The last clause in the ON_CALL() statement as seen so far. - // Initially kNone and changes as the statement is parsed. - Clause last_clause_; -}; // class UntypedOnCallSpecBase - -// This template class implements an ON_CALL spec. -template <typename F> -class OnCallSpec : public UntypedOnCallSpecBase { - public: - typedef typename Function<F>::ArgumentTuple ArgumentTuple; - typedef typename Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple; - - // Constructs an OnCallSpec object from the information inside - // the parenthesis of an ON_CALL() statement. - OnCallSpec(const char* a_file, int a_line, - const ArgumentMatcherTuple& matchers) - : UntypedOnCallSpecBase(a_file, a_line), - matchers_(matchers), - // By default, extra_matcher_ should match anything. However, - // we cannot initialize it with _ as that causes ambiguity between - // Matcher's copy and move constructor for some argument types. - extra_matcher_(A<const ArgumentTuple&>()) {} - - // Implements the .With() clause. - OnCallSpec& With(const Matcher<const ArgumentTuple&>& m) { - // Makes sure this is called at most once. - ExpectSpecProperty(last_clause_ < kWith, - ".With() cannot appear " - "more than once in an ON_CALL()."); - last_clause_ = kWith; - - extra_matcher_ = m; - return *this; - } - - // Implements the .WillByDefault() clause. - OnCallSpec& WillByDefault(const Action<F>& action) { - ExpectSpecProperty(last_clause_ < kWillByDefault, - ".WillByDefault() must appear " - "exactly once in an ON_CALL()."); - last_clause_ = kWillByDefault; - - ExpectSpecProperty(!action.IsDoDefault(), - "DoDefault() cannot be used in ON_CALL()."); - action_ = action; - return *this; - } - - // Returns true if and only if the given arguments match the matchers. - bool Matches(const ArgumentTuple& args) const { - return TupleMatches(matchers_, args) && extra_matcher_.Matches(args); - } - - // Returns the action specified by the user. - const Action<F>& GetAction() const { - AssertSpecProperty(last_clause_ == kWillByDefault, - ".WillByDefault() must appear exactly " - "once in an ON_CALL()."); - return action_; - } - - private: - // The information in statement - // - // ON_CALL(mock_object, Method(matchers)) - // .With(multi-argument-matcher) - // .WillByDefault(action); - // - // is recorded in the data members like this: - // - // source file that contains the statement => file_ - // line number of the statement => line_ - // matchers => matchers_ - // multi-argument-matcher => extra_matcher_ - // action => action_ - ArgumentMatcherTuple matchers_; - Matcher<const ArgumentTuple&> extra_matcher_; - Action<F> action_; -}; // class OnCallSpec - -// Possible reactions on uninteresting calls. 
-enum CallReaction { - kAllow, - kWarn, - kFail, -}; - -} // namespace internal - -// Utilities for manipulating mock objects. -class GTEST_API_ Mock { - public: - // The following public methods can be called concurrently. - - // Tells Google Mock to ignore mock_obj when checking for leaked - // mock objects. - static void AllowLeak(const void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Verifies and clears all expectations on the given mock object. - // If the expectations aren't satisfied, generates one or more - // Google Test non-fatal failures and returns false. - static bool VerifyAndClearExpectations(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Verifies all expectations on the given mock object and clears its - // default actions and expectations. Returns true if and only if the - // verification was successful. - static bool VerifyAndClear(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Returns whether the mock was created as a naggy mock (default) - static bool IsNaggy(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - // Returns whether the mock was created as a nice mock - static bool IsNice(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - // Returns whether the mock was created as a strict mock - static bool IsStrict(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - private: - friend class internal::UntypedFunctionMockerBase; - - // Needed for a function mocker to register itself (so that we know - // how to clear a mock object). - template <typename F> - friend class internal::FunctionMocker; - - template <typename MockClass> - friend class internal::NiceMockImpl; - template <typename MockClass> - friend class internal::NaggyMockImpl; - template <typename MockClass> - friend class internal::StrictMockImpl; - - // Tells Google Mock to allow uninteresting calls on the given mock - // object. - static void AllowUninterestingCalls(uintptr_t mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Tells Google Mock to warn the user about uninteresting calls on - // the given mock object. - static void WarnUninterestingCalls(uintptr_t mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Tells Google Mock to fail uninteresting calls on the given mock - // object. - static void FailUninterestingCalls(uintptr_t mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Tells Google Mock the given mock object is being destroyed and - // its entry in the call-reaction table should be removed. - static void UnregisterCallReaction(uintptr_t mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Returns the reaction Google Mock will have on uninteresting calls - // made on the given mock object. - static internal::CallReaction GetReactionOnUninterestingCalls( - const void* mock_obj) GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Verifies that all expectations on the given mock object have been - // satisfied. Reports one or more Google Test non-fatal failures - // and returns false if not. - static bool VerifyAndClearExpectationsLocked(void* mock_obj) - GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex); - - // Clears all ON_CALL()s set on the given mock object. - static void ClearDefaultActionsLocked(void* mock_obj) - GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex); - - // Registers a mock object and a mock method it owns. 
- static void Register(const void* mock_obj, - internal::UntypedFunctionMockerBase* mocker) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Tells Google Mock where in the source code mock_obj is used in an - // ON_CALL or EXPECT_CALL. In case mock_obj is leaked, this - // information helps the user identify which object it is. - static void RegisterUseByOnCallOrExpectCall(const void* mock_obj, - const char* file, int line) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); - - // Unregisters a mock method; removes the owning mock object from - // the registry when the last mock method associated with it has - // been unregistered. This is called only in the destructor of - // FunctionMocker. - static void UnregisterLocked(internal::UntypedFunctionMockerBase* mocker) - GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex); -}; // class Mock - -// An abstract handle of an expectation. Useful in the .After() -// clause of EXPECT_CALL() for setting the (partial) order of -// expectations. The syntax: -// -// Expectation e1 = EXPECT_CALL(...)...; -// EXPECT_CALL(...).After(e1)...; -// -// sets two expectations where the latter can only be matched after -// the former has been satisfied. -// -// Notes: -// - This class is copyable and has value semantics. -// - Constness is shallow: a const Expectation object itself cannot -// be modified, but the mutable methods of the ExpectationBase -// object it references can be called via expectation_base(). - -class GTEST_API_ Expectation { - public: - // Constructs a null object that doesn't reference any expectation. - Expectation(); - Expectation(Expectation&&) = default; - Expectation(const Expectation&) = default; - Expectation& operator=(Expectation&&) = default; - Expectation& operator=(const Expectation&) = default; - ~Expectation(); - - // This single-argument ctor must not be explicit, in order to support the - // Expectation e = EXPECT_CALL(...); - // syntax. - // - // A TypedExpectation object stores its pre-requisites as - // Expectation objects, and needs to call the non-const Retire() - // method on the ExpectationBase objects they reference. Therefore - // Expectation must receive a *non-const* reference to the - // ExpectationBase object. - Expectation(internal::ExpectationBase& exp); // NOLINT - - // The compiler-generated copy ctor and operator= work exactly as - // intended, so we don't need to define our own. - - // Returns true if and only if rhs references the same expectation as this - // object does. - bool operator==(const Expectation& rhs) const { - return expectation_base_ == rhs.expectation_base_; - } - - bool operator!=(const Expectation& rhs) const { return !(*this == rhs); } - - private: - friend class ExpectationSet; - friend class Sequence; - friend class ::testing::internal::ExpectationBase; - friend class ::testing::internal::UntypedFunctionMockerBase; - - template <typename F> - friend class ::testing::internal::FunctionMocker; - - template <typename F> - friend class ::testing::internal::TypedExpectation; - - // This comparator is needed for putting Expectation objects into a set. - class Less { - public: - bool operator()(const Expectation& lhs, const Expectation& rhs) const { - return lhs.expectation_base_.get() < rhs.expectation_base_.get(); - } - }; - - typedef ::std::set<Expectation, Less> Set; - - Expectation( - const std::shared_ptr<internal::ExpectationBase>& expectation_base); - - // Returns the expectation this object references. 
- const std::shared_ptr<internal::ExpectationBase>& expectation_base() const { - return expectation_base_; - } - - // A shared_ptr that co-owns the expectation this handle references. - std::shared_ptr<internal::ExpectationBase> expectation_base_; -}; - -// A set of expectation handles. Useful in the .After() clause of -// EXPECT_CALL() for setting the (partial) order of expectations. The -// syntax: -// -// ExpectationSet es; -// es += EXPECT_CALL(...)...; -// es += EXPECT_CALL(...)...; -// EXPECT_CALL(...).After(es)...; -// -// sets three expectations where the last one can only be matched -// after the first two have both been satisfied. -// -// This class is copyable and has value semantics. -class ExpectationSet { - public: - // A bidirectional iterator that can read a const element in the set. - typedef Expectation::Set::const_iterator const_iterator; - - // An object stored in the set. This is an alias of Expectation. - typedef Expectation::Set::value_type value_type; - - // Constructs an empty set. - ExpectationSet() {} - - // This single-argument ctor must not be explicit, in order to support the - // ExpectationSet es = EXPECT_CALL(...); - // syntax. - ExpectationSet(internal::ExpectationBase& exp) { // NOLINT - *this += Expectation(exp); - } - - // This single-argument ctor implements implicit conversion from - // Expectation and thus must not be explicit. This allows either an - // Expectation or an ExpectationSet to be used in .After(). - ExpectationSet(const Expectation& e) { // NOLINT - *this += e; - } - - // The compiler-generator ctor and operator= works exactly as - // intended, so we don't need to define our own. - - // Returns true if and only if rhs contains the same set of Expectation - // objects as this does. - bool operator==(const ExpectationSet& rhs) const { - return expectations_ == rhs.expectations_; - } - - bool operator!=(const ExpectationSet& rhs) const { return !(*this == rhs); } - - // Implements the syntax - // expectation_set += EXPECT_CALL(...); - ExpectationSet& operator+=(const Expectation& e) { - expectations_.insert(e); - return *this; - } - - int size() const { return static_cast<int>(expectations_.size()); } - - const_iterator begin() const { return expectations_.begin(); } - const_iterator end() const { return expectations_.end(); } - - private: - Expectation::Set expectations_; -}; - -// Sequence objects are used by a user to specify the relative order -// in which the expectations should match. They are copyable (we rely -// on the compiler-defined copy constructor and assignment operator). -class GTEST_API_ Sequence { - public: - // Constructs an empty sequence. - Sequence() : last_expectation_(new Expectation) {} - - // Adds an expectation to this sequence. The caller must ensure - // that no other thread is accessing this Sequence object. - void AddExpectation(const Expectation& expectation) const; - - private: - // The last expectation in this sequence. - std::shared_ptr<Expectation> last_expectation_; -}; // class Sequence - -// An object of this type causes all EXPECT_CALL() statements -// encountered in its scope to be put in an anonymous sequence. The -// work is done in the constructor and destructor. You should only -// create an InSequence object on the stack. -// -// The sole purpose for this class is to support easy definition of -// sequential expectations, e.g. -// -// { -// InSequence dummy; // The name of the object doesn't matter. -// -// // The following expectations must match in the order they appear. 
-// EXPECT_CALL(a, Bar())...; -// EXPECT_CALL(a, Baz())...; -// ... -// EXPECT_CALL(b, Xyz())...; -// } -// -// You can create InSequence objects in multiple threads, as long as -// they are used to affect different mock objects. The idea is that -// each thread can create and set up its own mocks as if it's the only -// thread. However, for clarity of your tests we recommend you to set -// up mocks in the main thread unless you have a good reason not to do -// so. -class GTEST_API_ InSequence { - public: - InSequence(); - ~InSequence(); - - private: - bool sequence_created_; - - InSequence(const InSequence&) = delete; - InSequence& operator=(const InSequence&) = delete; -} GTEST_ATTRIBUTE_UNUSED_; - -namespace internal { - -// Points to the implicit sequence introduced by a living InSequence -// object (if any) in the current thread or NULL. -GTEST_API_ extern ThreadLocal<Sequence*> g_gmock_implicit_sequence; - -// Base class for implementing expectations. -// -// There are two reasons for having a type-agnostic base class for -// Expectation: -// -// 1. We need to store collections of expectations of different -// types (e.g. all pre-requisites of a particular expectation, all -// expectations in a sequence). Therefore these expectation objects -// must share a common base class. -// -// 2. We can avoid binary code bloat by moving methods not depending -// on the template argument of Expectation to the base class. -// -// This class is internal and mustn't be used by user code directly. -class GTEST_API_ ExpectationBase { - public: - // source_text is the EXPECT_CALL(...) source that created this Expectation. - ExpectationBase(const char* file, int line, const std::string& source_text); - - virtual ~ExpectationBase(); - - // Where in the source file was the expectation spec defined? - const char* file() const { return file_; } - int line() const { return line_; } - const char* source_text() const { return source_text_.c_str(); } - // Returns the cardinality specified in the expectation spec. - const Cardinality& cardinality() const { return cardinality_; } - - // Describes the source file location of this expectation. - void DescribeLocationTo(::std::ostream* os) const { - *os << FormatFileLocation(file(), line()) << " "; - } - - // Describes how many times a function call matching this - // expectation has occurred. - void DescribeCallCountTo(::std::ostream* os) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex); - - // If this mock method has an extra matcher (i.e. .With(matcher)), - // describes it to the ostream. - virtual void MaybeDescribeExtraMatcherTo(::std::ostream* os) = 0; - - protected: - friend class ::testing::Expectation; - friend class UntypedFunctionMockerBase; - - enum Clause { - // Don't change the order of the enum members! - kNone, - kWith, - kTimes, - kInSequence, - kAfter, - kWillOnce, - kWillRepeatedly, - kRetiresOnSaturation - }; - - typedef std::vector<const void*> UntypedActions; - - // Returns an Expectation object that references and co-owns this - // expectation. - virtual Expectation GetHandle() = 0; - - // Asserts that the EXPECT_CALL() statement has the given property. - void AssertSpecProperty(bool property, - const std::string& failure_message) const { - Assert(property, file_, line_, failure_message); - } - - // Expects that the EXPECT_CALL() statement has the given property. 
- void ExpectSpecProperty(bool property, - const std::string& failure_message) const { - Expect(property, file_, line_, failure_message); - } - - // Explicitly specifies the cardinality of this expectation. Used - // by the subclasses to implement the .Times() clause. - void SpecifyCardinality(const Cardinality& cardinality); - - // Returns true if and only if the user specified the cardinality - // explicitly using a .Times(). - bool cardinality_specified() const { return cardinality_specified_; } - - // Sets the cardinality of this expectation spec. - void set_cardinality(const Cardinality& a_cardinality) { - cardinality_ = a_cardinality; - } - - // The following group of methods should only be called after the - // EXPECT_CALL() statement, and only when g_gmock_mutex is held by - // the current thread. - - // Retires all pre-requisites of this expectation. - void RetireAllPreRequisites() GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex); - - // Returns true if and only if this expectation is retired. - bool is_retired() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - return retired_; - } - - // Retires this expectation. - void Retire() GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - retired_ = true; - } - - // Returns true if and only if this expectation is satisfied. - bool IsSatisfied() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - return cardinality().IsSatisfiedByCallCount(call_count_); - } - - // Returns true if and only if this expectation is saturated. - bool IsSaturated() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - return cardinality().IsSaturatedByCallCount(call_count_); - } - - // Returns true if and only if this expectation is over-saturated. - bool IsOverSaturated() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - return cardinality().IsOverSaturatedByCallCount(call_count_); - } - - // Returns true if and only if all pre-requisites of this expectation are - // satisfied. - bool AllPrerequisitesAreSatisfied() const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex); - - // Adds unsatisfied pre-requisites of this expectation to 'result'. - void FindUnsatisfiedPrerequisites(ExpectationSet* result) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex); - - // Returns the number this expectation has been invoked. - int call_count() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - return call_count_; - } - - // Increments the number this expectation has been invoked. - void IncrementCallCount() GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - call_count_++; - } - - // Checks the action count (i.e. the number of WillOnce() and - // WillRepeatedly() clauses) against the cardinality if this hasn't - // been done before. Prints a warning if there are too many or too - // few actions. - void CheckActionCountIfNotDone() const GTEST_LOCK_EXCLUDED_(mutex_); - - friend class ::testing::Sequence; - friend class ::testing::internal::ExpectationTester; - - template <typename Function> - friend class TypedExpectation; - - // Implements the .Times() clause. - void UntypedTimes(const Cardinality& a_cardinality); - - // This group of fields are part of the spec and won't change after - // an EXPECT_CALL() statement finishes. - const char* file_; // The file that contains the expectation. - int line_; // The line number of the expectation. 
- const std::string source_text_; // The EXPECT_CALL(...) source text. - // True if and only if the cardinality is specified explicitly. - bool cardinality_specified_; - Cardinality cardinality_; // The cardinality of the expectation. - // The immediate pre-requisites (i.e. expectations that must be - // satisfied before this expectation can be matched) of this - // expectation. We use std::shared_ptr in the set because we want an - // Expectation object to be co-owned by its FunctionMocker and its - // successors. This allows multiple mock objects to be deleted at - // different times. - ExpectationSet immediate_prerequisites_; - - // This group of fields are the current state of the expectation, - // and can change as the mock function is called. - int call_count_; // How many times this expectation has been invoked. - bool retired_; // True if and only if this expectation has retired. - UntypedActions untyped_actions_; - bool extra_matcher_specified_; - bool repeated_action_specified_; // True if a WillRepeatedly() was specified. - bool retires_on_saturation_; - Clause last_clause_; - mutable bool action_count_checked_; // Under mutex_. - mutable Mutex mutex_; // Protects action_count_checked_. -}; // class ExpectationBase - -template <typename F> -class TypedExpectation; - -// Implements an expectation for the given function type. -template <typename R, typename... Args> -class TypedExpectation<R(Args...)> : public ExpectationBase { - private: - using F = R(Args...); - - public: - typedef typename Function<F>::ArgumentTuple ArgumentTuple; - typedef typename Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple; - typedef typename Function<F>::Result Result; - - TypedExpectation(FunctionMocker<F>* owner, const char* a_file, int a_line, - const std::string& a_source_text, - const ArgumentMatcherTuple& m) - : ExpectationBase(a_file, a_line, a_source_text), - owner_(owner), - matchers_(m), - // By default, extra_matcher_ should match anything. However, - // we cannot initialize it with _ as that causes ambiguity between - // Matcher's copy and move constructor for some argument types. - extra_matcher_(A<const ArgumentTuple&>()), - repeated_action_(DoDefault()) {} - - ~TypedExpectation() override { - // Check the validity of the action count if it hasn't been done - // yet (for example, if the expectation was never used). - CheckActionCountIfNotDone(); - for (UntypedActions::const_iterator it = untyped_actions_.begin(); - it != untyped_actions_.end(); ++it) { - delete static_cast<const Action<F>*>(*it); - } - } - - // Implements the .With() clause. - TypedExpectation& With(const Matcher<const ArgumentTuple&>& m) { - if (last_clause_ == kWith) { - ExpectSpecProperty(false, - ".With() cannot appear " - "more than once in an EXPECT_CALL()."); - } else { - ExpectSpecProperty(last_clause_ < kWith, - ".With() must be the first " - "clause in an EXPECT_CALL()."); - } - last_clause_ = kWith; - - extra_matcher_ = m; - extra_matcher_specified_ = true; - return *this; - } - - // Implements the .Times() clause. - TypedExpectation& Times(const Cardinality& a_cardinality) { - ExpectationBase::UntypedTimes(a_cardinality); - return *this; - } - - // Implements the .Times() clause. - TypedExpectation& Times(int n) { return Times(Exactly(n)); } - - // Implements the .InSequence() clause. 
- TypedExpectation& InSequence(const Sequence& s) { - ExpectSpecProperty(last_clause_ <= kInSequence, - ".InSequence() cannot appear after .After()," - " .WillOnce(), .WillRepeatedly(), or " - ".RetiresOnSaturation()."); - last_clause_ = kInSequence; - - s.AddExpectation(GetHandle()); - return *this; - } - TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2) { - return InSequence(s1).InSequence(s2); - } - TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2, - const Sequence& s3) { - return InSequence(s1, s2).InSequence(s3); - } - TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2, - const Sequence& s3, const Sequence& s4) { - return InSequence(s1, s2, s3).InSequence(s4); - } - TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2, - const Sequence& s3, const Sequence& s4, - const Sequence& s5) { - return InSequence(s1, s2, s3, s4).InSequence(s5); - } - - // Implements that .After() clause. - TypedExpectation& After(const ExpectationSet& s) { - ExpectSpecProperty(last_clause_ <= kAfter, - ".After() cannot appear after .WillOnce()," - " .WillRepeatedly(), or " - ".RetiresOnSaturation()."); - last_clause_ = kAfter; - - for (ExpectationSet::const_iterator it = s.begin(); it != s.end(); ++it) { - immediate_prerequisites_ += *it; - } - return *this; - } - TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2) { - return After(s1).After(s2); - } - TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2, - const ExpectationSet& s3) { - return After(s1, s2).After(s3); - } - TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2, - const ExpectationSet& s3, const ExpectationSet& s4) { - return After(s1, s2, s3).After(s4); - } - TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2, - const ExpectationSet& s3, const ExpectationSet& s4, - const ExpectationSet& s5) { - return After(s1, s2, s3, s4).After(s5); - } - - // Preferred, type-safe overload: consume anything that can be directly - // converted to a OnceAction, except for Action<F> objects themselves. - TypedExpectation& WillOnce(OnceAction<F> once_action) { - // Call the overload below, smuggling the OnceAction as a copyable callable. - // We know this is safe because a WillOnce action will not be called more - // than once. - return WillOnce(Action<F>(ActionAdaptor{ - std::make_shared<OnceAction<F>>(std::move(once_action)), - })); - } - - // Fallback overload: accept Action<F> objects and those actions that define - // `operator Action<F>` but not `operator OnceAction<F>`. - // - // This is templated in order to cause the overload above to be preferred - // when the input is convertible to either type. - template <int&... ExplicitArgumentBarrier, typename = void> - TypedExpectation& WillOnce(Action<F> action) { - ExpectSpecProperty(last_clause_ <= kWillOnce, - ".WillOnce() cannot appear after " - ".WillRepeatedly() or .RetiresOnSaturation()."); - last_clause_ = kWillOnce; - - untyped_actions_.push_back(new Action<F>(std::move(action))); - - if (!cardinality_specified()) { - set_cardinality(Exactly(static_cast<int>(untyped_actions_.size()))); - } - return *this; - } - - // Implements the .WillRepeatedly() clause. 
- TypedExpectation& WillRepeatedly(const Action<F>& action) { - if (last_clause_ == kWillRepeatedly) { - ExpectSpecProperty(false, - ".WillRepeatedly() cannot appear " - "more than once in an EXPECT_CALL()."); - } else { - ExpectSpecProperty(last_clause_ < kWillRepeatedly, - ".WillRepeatedly() cannot appear " - "after .RetiresOnSaturation()."); - } - last_clause_ = kWillRepeatedly; - repeated_action_specified_ = true; - - repeated_action_ = action; - if (!cardinality_specified()) { - set_cardinality(AtLeast(static_cast<int>(untyped_actions_.size()))); - } - - // Now that no more action clauses can be specified, we check - // whether their count makes sense. - CheckActionCountIfNotDone(); - return *this; - } - - // Implements the .RetiresOnSaturation() clause. - TypedExpectation& RetiresOnSaturation() { - ExpectSpecProperty(last_clause_ < kRetiresOnSaturation, - ".RetiresOnSaturation() cannot appear " - "more than once."); - last_clause_ = kRetiresOnSaturation; - retires_on_saturation_ = true; - - // Now that no more action clauses can be specified, we check - // whether their count makes sense. - CheckActionCountIfNotDone(); - return *this; - } - - // Returns the matchers for the arguments as specified inside the - // EXPECT_CALL() macro. - const ArgumentMatcherTuple& matchers() const { return matchers_; } - - // Returns the matcher specified by the .With() clause. - const Matcher<const ArgumentTuple&>& extra_matcher() const { - return extra_matcher_; - } - - // Returns the action specified by the .WillRepeatedly() clause. - const Action<F>& repeated_action() const { return repeated_action_; } - - // If this mock method has an extra matcher (i.e. .With(matcher)), - // describes it to the ostream. - void MaybeDescribeExtraMatcherTo(::std::ostream* os) override { - if (extra_matcher_specified_) { - *os << " Expected args: "; - extra_matcher_.DescribeTo(os); - *os << "\n"; - } - } - - private: - template <typename Function> - friend class FunctionMocker; - - // An adaptor that turns a OneAction<F> into something compatible with - // Action<F>. Must be called at most once. - struct ActionAdaptor { - std::shared_ptr<OnceAction<R(Args...)>> once_action; - - R operator()(Args&&... args) const { - return std::move(*once_action).Call(std::forward<Args>(args)...); - } - }; - - // Returns an Expectation object that references and co-owns this - // expectation. - Expectation GetHandle() override { return owner_->GetHandleOf(this); } - - // The following methods will be called only after the EXPECT_CALL() - // statement finishes and when the current thread holds - // g_gmock_mutex. - - // Returns true if and only if this expectation matches the given arguments. - bool Matches(const ArgumentTuple& args) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - return TupleMatches(matchers_, args) && extra_matcher_.Matches(args); - } - - // Returns true if and only if this expectation should handle the given - // arguments. - bool ShouldHandleArguments(const ArgumentTuple& args) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - - // In case the action count wasn't checked when the expectation - // was defined (e.g. if this expectation has no WillRepeatedly() - // or RetiresOnSaturation() clause), we check it when the - // expectation is used for the first time. 
- CheckActionCountIfNotDone(); - return !is_retired() && AllPrerequisitesAreSatisfied() && Matches(args); - } - - // Describes the result of matching the arguments against this - // expectation to the given ostream. - void ExplainMatchResultTo(const ArgumentTuple& args, ::std::ostream* os) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - - if (is_retired()) { - *os << " Expected: the expectation is active\n" - << " Actual: it is retired\n"; - } else if (!Matches(args)) { - if (!TupleMatches(matchers_, args)) { - ExplainMatchFailureTupleTo(matchers_, args, os); - } - StringMatchResultListener listener; - if (!extra_matcher_.MatchAndExplain(args, &listener)) { - *os << " Expected args: "; - extra_matcher_.DescribeTo(os); - *os << "\n Actual: don't match"; - - internal::PrintIfNotEmpty(listener.str(), os); - *os << "\n"; - } - } else if (!AllPrerequisitesAreSatisfied()) { - *os << " Expected: all pre-requisites are satisfied\n" - << " Actual: the following immediate pre-requisites " - << "are not satisfied:\n"; - ExpectationSet unsatisfied_prereqs; - FindUnsatisfiedPrerequisites(&unsatisfied_prereqs); - int i = 0; - for (ExpectationSet::const_iterator it = unsatisfied_prereqs.begin(); - it != unsatisfied_prereqs.end(); ++it) { - it->expectation_base()->DescribeLocationTo(os); - *os << "pre-requisite #" << i++ << "\n"; - } - *os << " (end of pre-requisites)\n"; - } else { - // This line is here just for completeness' sake. It will never - // be executed as currently the ExplainMatchResultTo() function - // is called only when the mock function call does NOT match the - // expectation. - *os << "The call matches the expectation.\n"; - } - } - - // Returns the action that should be taken for the current invocation. - const Action<F>& GetCurrentAction(const FunctionMocker<F>* mocker, - const ArgumentTuple& args) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - const int count = call_count(); - Assert(count >= 1, __FILE__, __LINE__, - "call_count() is <= 0 when GetCurrentAction() is " - "called - this should never happen."); - - const int action_count = static_cast<int>(untyped_actions_.size()); - if (action_count > 0 && !repeated_action_specified_ && - count > action_count) { - // If there is at least one WillOnce() and no WillRepeatedly(), - // we warn the user when the WillOnce() clauses ran out. - ::std::stringstream ss; - DescribeLocationTo(&ss); - ss << "Actions ran out in " << source_text() << "...\n" - << "Called " << count << " times, but only " << action_count - << " WillOnce()" << (action_count == 1 ? " is" : "s are") - << " specified - "; - mocker->DescribeDefaultActionTo(args, &ss); - Log(kWarning, ss.str(), 1); - } - - return count <= action_count - ? *static_cast<const Action<F>*>( - untyped_actions_[static_cast<size_t>(count - 1)]) - : repeated_action(); - } - - // Given the arguments of a mock function call, if the call will - // over-saturate this expectation, returns the default action; - // otherwise, returns the next action in this expectation. Also - // describes *what* happened to 'what', and explains *why* Google - // Mock does it to 'why'. This method is not const as it calls - // IncrementCallCount(). A return value of NULL means the default - // action. 
- const Action<F>* GetActionForArguments(const FunctionMocker<F>* mocker, - const ArgumentTuple& args, - ::std::ostream* what, - ::std::ostream* why) - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - if (IsSaturated()) { - // We have an excessive call. - IncrementCallCount(); - *what << "Mock function called more times than expected - "; - mocker->DescribeDefaultActionTo(args, what); - DescribeCallCountTo(why); - - return nullptr; - } - - IncrementCallCount(); - RetireAllPreRequisites(); - - if (retires_on_saturation_ && IsSaturated()) { - Retire(); - } - - // Must be done after IncrementCount()! - *what << "Mock function call matches " << source_text() << "...\n"; - return &(GetCurrentAction(mocker, args)); - } - - // All the fields below won't change once the EXPECT_CALL() - // statement finishes. - FunctionMocker<F>* const owner_; - ArgumentMatcherTuple matchers_; - Matcher<const ArgumentTuple&> extra_matcher_; - Action<F> repeated_action_; - - TypedExpectation(const TypedExpectation&) = delete; - TypedExpectation& operator=(const TypedExpectation&) = delete; -}; // class TypedExpectation - -// A MockSpec object is used by ON_CALL() or EXPECT_CALL() for -// specifying the default behavior of, or expectation on, a mock -// function. - -// Note: class MockSpec really belongs to the ::testing namespace. -// However if we define it in ::testing, MSVC will complain when -// classes in ::testing::internal declare it as a friend class -// template. To workaround this compiler bug, we define MockSpec in -// ::testing::internal and import it into ::testing. - -// Logs a message including file and line number information. -GTEST_API_ void LogWithLocation(testing::internal::LogSeverity severity, - const char* file, int line, - const std::string& message); - -template <typename F> -class MockSpec { - public: - typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; - typedef - typename internal::Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple; - - // Constructs a MockSpec object, given the function mocker object - // that the spec is associated with. - MockSpec(internal::FunctionMocker<F>* function_mocker, - const ArgumentMatcherTuple& matchers) - : function_mocker_(function_mocker), matchers_(matchers) {} - - // Adds a new default action spec to the function mocker and returns - // the newly created spec. - internal::OnCallSpec<F>& InternalDefaultActionSetAt(const char* file, - int line, const char* obj, - const char* call) { - LogWithLocation(internal::kInfo, file, line, - std::string("ON_CALL(") + obj + ", " + call + ") invoked"); - return function_mocker_->AddNewOnCallSpec(file, line, matchers_); - } - - // Adds a new expectation spec to the function mocker and returns - // the newly created spec. - internal::TypedExpectation<F>& InternalExpectedAt(const char* file, int line, - const char* obj, - const char* call) { - const std::string source_text(std::string("EXPECT_CALL(") + obj + ", " + - call + ")"); - LogWithLocation(internal::kInfo, file, line, source_text + " invoked"); - return function_mocker_->AddNewExpectation(file, line, source_text, - matchers_); - } - - // This operator overload is used to swallow the superfluous parameter list - // introduced by the ON/EXPECT_CALL macros. See the macro comments for more - // explanation. 
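The saturation and retirement logic in GetActionForArguments() above is what makes the common "stacked expectations" idiom work: the newest matching expectation handles calls until it saturates and retires, after which an older one becomes eligible again. A sketch with the same hypothetical MockCounter:

TEST(Retirement, SaturatedExpectationStopsMatching) {
  MockCounter mock;
  // Expectations are searched newest-first, so the second EXPECT_CALL wins
  // until it saturates; RetiresOnSaturation() then removes it from
  // consideration and the first EXPECT_CALL takes over.
  EXPECT_CALL(mock, Next()).WillRepeatedly(::testing::Return(0));
  EXPECT_CALL(mock, Next())
      .Times(2)
      .WillRepeatedly(::testing::Return(7))
      .RetiresOnSaturation();

  EXPECT_EQ(7, mock.Next());
  EXPECT_EQ(7, mock.Next());  // Saturates and retires the newer expectation.
  EXPECT_EQ(0, mock.Next());  // Now handled by the older expectation.
}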
- MockSpec<F>& operator()(const internal::WithoutMatchers&, void* const) { - return *this; - } - - private: - template <typename Function> - friend class internal::FunctionMocker; - - // The function mocker that owns this spec. - internal::FunctionMocker<F>* const function_mocker_; - // The argument matchers specified in the spec. - ArgumentMatcherTuple matchers_; -}; // class MockSpec - -// Wrapper type for generically holding an ordinary value or lvalue reference. -// If T is not a reference type, it must be copyable or movable. -// ReferenceOrValueWrapper<T> is movable, and will also be copyable unless -// T is a move-only value type (which means that it will always be copyable -// if the current platform does not support move semantics). -// -// The primary template defines handling for values, but function header -// comments describe the contract for the whole template (including -// specializations). -template <typename T> -class ReferenceOrValueWrapper { - public: - // Constructs a wrapper from the given value/reference. - explicit ReferenceOrValueWrapper(T value) : value_(std::move(value)) {} - - // Unwraps and returns the underlying value/reference, exactly as - // originally passed. The behavior of calling this more than once on - // the same object is unspecified. - T Unwrap() { return std::move(value_); } - - // Provides nondestructive access to the underlying value/reference. - // Always returns a const reference (more precisely, - // const std::add_lvalue_reference<T>::type). The behavior of calling this - // after calling Unwrap on the same object is unspecified. - const T& Peek() const { return value_; } - - private: - T value_; -}; - -// Specialization for lvalue reference types. See primary template -// for documentation. -template <typename T> -class ReferenceOrValueWrapper<T&> { - public: - // Workaround for debatable pass-by-reference lint warning (c-library-team - // policy precludes NOLINT in this context) - typedef T& reference; - explicit ReferenceOrValueWrapper(reference ref) : value_ptr_(&ref) {} - T& Unwrap() { return *value_ptr_; } - const T& Peek() const { return *value_ptr_; } - - private: - T* value_ptr_; -}; - -// Prints the held value as an action's result to os. -template <typename T> -void PrintAsActionResult(const T& result, std::ostream& os) { - os << "\n Returns: "; - // T may be a reference type, so we don't use UniversalPrint(). - UniversalPrinter<T>::Print(result, &os); -} - -// Reports an uninteresting call (whose description is in msg) in the -// manner specified by 'reaction'. -GTEST_API_ void ReportUninterestingCall(CallReaction reaction, - const std::string& msg); - -// A generic RAII type that runs a user-provided function in its destructor. -class Cleanup final { - public: - explicit Cleanup(std::function<void()> f) : f_(std::move(f)) {} - ~Cleanup() { f_(); } - - private: - std::function<void()> f_; -}; - -template <typename F> -class FunctionMocker; - -template <typename R, typename... Args> -class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase { - using F = R(Args...); - - public: - using Result = R; - using ArgumentTuple = std::tuple<Args...>; - using ArgumentMatcherTuple = std::tuple<Matcher<Args>...>; - - FunctionMocker() {} - - // There is no generally useful and implementable semantics of - // copying a mock object, so copying a mock is usually a user error. - // Thus we disallow copying function mockers. 
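The small Cleanup helper above exists so that a report can be emitted after an action runs even when the action's result is void or non-movable and therefore cannot be parked in a local variable. The RAII pattern in isolation, as a generic sketch that is not gMock-specific:

#include <functional>
#include <iostream>

class Cleanup final {
 public:
  explicit Cleanup(std::function<void()> f) : f_(std::move(f)) {}
  ~Cleanup() { f_(); }

 private:
  std::function<void()> f_;
};

int ComputeAndAlwaysReport() {
  // The destructor runs after the return value is formed, so the report
  // happens last no matter which return path is taken.
  const Cleanup report_on_exit([] { std::cout << "call finished\n"; });
  return 42;
}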
If the user really - // wants to copy a mock object, they should implement their own copy - // operation, for example: - // - // class MockFoo : public Foo { - // public: - // // Defines a copy constructor explicitly. - // MockFoo(const MockFoo& src) {} - // ... - // }; - FunctionMocker(const FunctionMocker&) = delete; - FunctionMocker& operator=(const FunctionMocker&) = delete; - - // The destructor verifies that all expectations on this mock - // function have been satisfied. If not, it will report Google Test - // non-fatal failures for the violations. - ~FunctionMocker() override GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - MutexLock l(&g_gmock_mutex); - VerifyAndClearExpectationsLocked(); - Mock::UnregisterLocked(this); - ClearDefaultActionsLocked(); - } - - // Returns the ON_CALL spec that matches this mock function with the - // given arguments; returns NULL if no matching ON_CALL is found. - // L = * - const OnCallSpec<F>* FindOnCallSpec(const ArgumentTuple& args) const { - for (UntypedOnCallSpecs::const_reverse_iterator it = - untyped_on_call_specs_.rbegin(); - it != untyped_on_call_specs_.rend(); ++it) { - const OnCallSpec<F>* spec = static_cast<const OnCallSpec<F>*>(*it); - if (spec->Matches(args)) return spec; - } - - return nullptr; - } - - // Performs the default action of this mock function on the given - // arguments and returns the result. Asserts (or throws if - // exceptions are enabled) with a helpful call description if there - // is no valid return value. This method doesn't depend on the - // mutable state of this object, and thus can be called concurrently - // without locking. - // L = * - Result PerformDefaultAction(ArgumentTuple&& args, - const std::string& call_description) const { - const OnCallSpec<F>* const spec = this->FindOnCallSpec(args); - if (spec != nullptr) { - return spec->GetAction().Perform(std::move(args)); - } - const std::string message = - call_description + - "\n The mock function has no default action " - "set, and its return type has no default value set."; -#if GTEST_HAS_EXCEPTIONS - if (!DefaultValue<Result>::Exists()) { - throw std::runtime_error(message); - } -#else - Assert(DefaultValue<Result>::Exists(), "", -1, message); -#endif - return DefaultValue<Result>::Get(); - } - - // Implements UntypedFunctionMockerBase::ClearDefaultActionsLocked(): - // clears the ON_CALL()s set on this mock function. - void ClearDefaultActionsLocked() override - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - - // Deleting our default actions may trigger other mock objects to be - // deleted, for example if an action contains a reference counted smart - // pointer to that mock object, and that is the last reference. So if we - // delete our actions within the context of the global mutex we may deadlock - // when this method is called again. Instead, make a copy of the set of - // actions to delete, clear our set within the mutex, and then delete the - // actions outside of the mutex. - UntypedOnCallSpecs specs_to_delete; - untyped_on_call_specs_.swap(specs_to_delete); - - g_gmock_mutex.Unlock(); - for (UntypedOnCallSpecs::const_iterator it = specs_to_delete.begin(); - it != specs_to_delete.end(); ++it) { - delete static_cast<const OnCallSpec<F>*>(*it); - } - - // Lock the mutex again, since the caller expects it to be locked when we - // return. - g_gmock_mutex.Lock(); - } - - // Returns the result of invoking this mock function with the given - // arguments. 
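Note that FindOnCallSpec() above walks the ON_CALL() specs in reverse, so when several specs match the same call, the most recently installed one wins. With the hypothetical MockCounter again:

TEST(OnCallPrecedence, LatestMatchingOnCallWins) {
  MockCounter mock;
  ON_CALL(mock, Next()).WillByDefault(::testing::Return(1));
  ON_CALL(mock, Next()).WillByDefault(::testing::Return(2));

  // Both specs match; the reverse search finds the second one first.
  EXPECT_CALL(mock, Next()).Times(1);
  EXPECT_EQ(2, mock.Next());
}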
This function can be safely called from multiple - // threads concurrently. - Result Invoke(Args... args) GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - return InvokeWith(ArgumentTuple(std::forward<Args>(args)...)); - } - - MockSpec<F> With(Matcher<Args>... m) { - return MockSpec<F>(this, ::std::make_tuple(std::move(m)...)); - } - - protected: - template <typename Function> - friend class MockSpec; - - // Adds and returns a default action spec for this mock function. - OnCallSpec<F>& AddNewOnCallSpec(const char* file, int line, - const ArgumentMatcherTuple& m) - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - Mock::RegisterUseByOnCallOrExpectCall(MockObject(), file, line); - OnCallSpec<F>* const on_call_spec = new OnCallSpec<F>(file, line, m); - untyped_on_call_specs_.push_back(on_call_spec); - return *on_call_spec; - } - - // Adds and returns an expectation spec for this mock function. - TypedExpectation<F>& AddNewExpectation(const char* file, int line, - const std::string& source_text, - const ArgumentMatcherTuple& m) - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - Mock::RegisterUseByOnCallOrExpectCall(MockObject(), file, line); - TypedExpectation<F>* const expectation = - new TypedExpectation<F>(this, file, line, source_text, m); - const std::shared_ptr<ExpectationBase> untyped_expectation(expectation); - // See the definition of untyped_expectations_ for why access to - // it is unprotected here. - untyped_expectations_.push_back(untyped_expectation); - - // Adds this expectation into the implicit sequence if there is one. - Sequence* const implicit_sequence = g_gmock_implicit_sequence.get(); - if (implicit_sequence != nullptr) { - implicit_sequence->AddExpectation(Expectation(untyped_expectation)); - } - - return *expectation; - } - - private: - template <typename Func> - friend class TypedExpectation; - - // Some utilities needed for implementing UntypedInvokeWith(). - - // Describes what default action will be performed for the given - // arguments. - // L = * - void DescribeDefaultActionTo(const ArgumentTuple& args, - ::std::ostream* os) const { - const OnCallSpec<F>* const spec = FindOnCallSpec(args); - - if (spec == nullptr) { - *os << (std::is_void<Result>::value ? "returning directly.\n" - : "returning default value.\n"); - } else { - *os << "taking default action specified at:\n" - << FormatFileLocation(spec->file(), spec->line()) << "\n"; - } - } - - // Writes a message that the call is uninteresting (i.e. neither - // explicitly expected nor explicitly unexpected) to the given - // ostream. - void UntypedDescribeUninterestingCall(const void* untyped_args, - ::std::ostream* os) const override - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - const ArgumentTuple& args = - *static_cast<const ArgumentTuple*>(untyped_args); - *os << "Uninteresting mock function call - "; - DescribeDefaultActionTo(args, os); - *os << " Function call: " << Name(); - UniversalPrint(args, os); - } - - // Returns the expectation that matches the given function arguments - // (or NULL is there's no match); when a match is found, - // untyped_action is set to point to the action that should be - // performed (or NULL if the action is "do default"), and - // is_excessive is modified to indicate whether the call exceeds the - // expected number. - // - // Critical section: We must find the matching expectation and the - // corresponding action that needs to be taken in an ATOMIC - // transaction. Otherwise another thread may call this mock - // method in the middle and mess up the state. 
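AddNewExpectation() above is also where InSequence hooks in: while an InSequence object is alive, every newly created expectation is appended to the implicit sequence, which orders them. A sketch with the hypothetical MockCounter:

TEST(ImplicitSequence, ExpectationsAreOrdered) {
  MockCounter mock;
  {
    ::testing::InSequence seq;
    // Both expectations join the implicit sequence, so the calls must
    // return 1 and then 2, in that order.
    EXPECT_CALL(mock, Next()).WillOnce(::testing::Return(1));
    EXPECT_CALL(mock, Next()).WillOnce(::testing::Return(2));
  }
  EXPECT_EQ(1, mock.Next());
  EXPECT_EQ(2, mock.Next());
}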
- // - // However, performing the action has to be left out of the critical - // section. The reason is that we have no control on what the - // action does (it can invoke an arbitrary user function or even a - // mock function) and excessive locking could cause a dead lock. - const ExpectationBase* UntypedFindMatchingExpectation( - const void* untyped_args, const void** untyped_action, bool* is_excessive, - ::std::ostream* what, ::std::ostream* why) override - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - const ArgumentTuple& args = - *static_cast<const ArgumentTuple*>(untyped_args); - MutexLock l(&g_gmock_mutex); - TypedExpectation<F>* exp = this->FindMatchingExpectationLocked(args); - if (exp == nullptr) { // A match wasn't found. - this->FormatUnexpectedCallMessageLocked(args, what, why); - return nullptr; - } - - // This line must be done before calling GetActionForArguments(), - // which will increment the call count for *exp and thus affect - // its saturation status. - *is_excessive = exp->IsSaturated(); - const Action<F>* action = exp->GetActionForArguments(this, args, what, why); - if (action != nullptr && action->IsDoDefault()) - action = nullptr; // Normalize "do default" to NULL. - *untyped_action = action; - return exp; - } - - // Prints the given function arguments to the ostream. - void UntypedPrintArgs(const void* untyped_args, - ::std::ostream* os) const override { - const ArgumentTuple& args = - *static_cast<const ArgumentTuple*>(untyped_args); - UniversalPrint(args, os); - } - - // Returns the expectation that matches the arguments, or NULL if no - // expectation matches them. - TypedExpectation<F>* FindMatchingExpectationLocked(const ArgumentTuple& args) - const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - // See the definition of untyped_expectations_ for why access to - // it is unprotected here. - for (typename UntypedExpectations::const_reverse_iterator it = - untyped_expectations_.rbegin(); - it != untyped_expectations_.rend(); ++it) { - TypedExpectation<F>* const exp = - static_cast<TypedExpectation<F>*>(it->get()); - if (exp->ShouldHandleArguments(args)) { - return exp; - } - } - return nullptr; - } - - // Returns a message that the arguments don't match any expectation. - void FormatUnexpectedCallMessageLocked(const ArgumentTuple& args, - ::std::ostream* os, - ::std::ostream* why) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - *os << "\nUnexpected mock function call - "; - DescribeDefaultActionTo(args, os); - PrintTriedExpectationsLocked(args, why); - } - - // Prints a list of expectations that have been tried against the - // current mock function call. - void PrintTriedExpectationsLocked(const ArgumentTuple& args, - ::std::ostream* why) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - const size_t count = untyped_expectations_.size(); - *why << "Google Mock tried the following " << count << " " - << (count == 1 ? 
"expectation, but it didn't match" - : "expectations, but none matched") - << ":\n"; - for (size_t i = 0; i < count; i++) { - TypedExpectation<F>* const expectation = - static_cast<TypedExpectation<F>*>(untyped_expectations_[i].get()); - *why << "\n"; - expectation->DescribeLocationTo(why); - if (count > 1) { - *why << "tried expectation #" << i << ": "; - } - *why << expectation->source_text() << "...\n"; - expectation->ExplainMatchResultTo(args, why); - expectation->DescribeCallCountTo(why); - } - } - - // Performs the given action (or the default if it's null) with the given - // arguments and returns the action's result. - // L = * - R PerformAction(const void* untyped_action, ArgumentTuple&& args, - const std::string& call_description) const { - if (untyped_action == nullptr) { - return PerformDefaultAction(std::move(args), call_description); - } - - // Make a copy of the action before performing it, in case the - // action deletes the mock object (and thus deletes itself). - const Action<F> action = *static_cast<const Action<F>*>(untyped_action); - return action.Perform(std::move(args)); - } - - // Is it possible to store an object of the supplied type in a local variable - // for the sake of printing it, then return it on to the caller? - template <typename T> - using can_print_result = internal::conjunction< - // void can't be stored as an object (and we also don't need to print it). - internal::negation<std::is_void<T>>, - // Non-moveable types can't be returned on to the user, so there's no way - // for us to intercept and print them. - std::is_move_constructible<T>>; - - // Perform the supplied action, printing the result to os. - template <typename T = R, - typename std::enable_if<can_print_result<T>::value, int>::type = 0> - R PerformActionAndPrintResult(const void* const untyped_action, - ArgumentTuple&& args, - const std::string& call_description, - std::ostream& os) { - R result = PerformAction(untyped_action, std::move(args), call_description); - - PrintAsActionResult(result, os); - return std::forward<R>(result); - } - - // An overload for when it's not possible to print the result. In this case we - // simply perform the action. - template <typename T = R, - typename std::enable_if< - internal::negation<can_print_result<T>>::value, int>::type = 0> - R PerformActionAndPrintResult(const void* const untyped_action, - ArgumentTuple&& args, - const std::string& call_description, - std::ostream&) { - return PerformAction(untyped_action, std::move(args), call_description); - } - - // Returns the result of invoking this mock function with the given - // arguments. This function can be safely called from multiple - // threads concurrently. - R InvokeWith(ArgumentTuple&& args) GTEST_LOCK_EXCLUDED_(g_gmock_mutex); -}; // class FunctionMocker - -// Calculates the result of invoking this mock function with the given -// arguments, prints it, and returns it. -template <typename R, typename... Args> -R FunctionMocker<R(Args...)>::InvokeWith(ArgumentTuple&& args) - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - // See the definition of untyped_expectations_ for why access to it - // is unprotected here. - if (untyped_expectations_.size() == 0) { - // No expectation is set on this mock method - we have an - // uninteresting call. - - // We must get Google Mock's reaction on uninteresting calls - // made on this mock object BEFORE performing the action, - // because the action may DELETE the mock object and make the - // following expression meaningless. 
- const CallReaction reaction = - Mock::GetReactionOnUninterestingCalls(MockObject()); - - // True if and only if we need to print this call's arguments and return - // value. This definition must be kept in sync with - // the behavior of ReportUninterestingCall(). - const bool need_to_report_uninteresting_call = - // If the user allows this uninteresting call, we print it - // only when they want informational messages. - reaction == kAllow ? LogIsVisible(kInfo) : - // If the user wants this to be a warning, we print - // it only when they want to see warnings. - reaction == kWarn - ? LogIsVisible(kWarning) - : - // Otherwise, the user wants this to be an error, and we - // should always print detailed information in the error. - true; - - if (!need_to_report_uninteresting_call) { - // Perform the action without printing the call information. - return this->PerformDefaultAction( - std::move(args), "Function call: " + std::string(Name())); - } - - // Warns about the uninteresting call. - ::std::stringstream ss; - this->UntypedDescribeUninterestingCall(&args, &ss); - - // Perform the action, print the result, and then report the uninteresting - // call. - // - // We use RAII to do the latter in case R is void or a non-moveable type. In - // either case we can't assign it to a local variable. - const Cleanup report_uninteresting_call( - [&] { ReportUninterestingCall(reaction, ss.str()); }); - - return PerformActionAndPrintResult(nullptr, std::move(args), ss.str(), ss); - } - - bool is_excessive = false; - ::std::stringstream ss; - ::std::stringstream why; - ::std::stringstream loc; - const void* untyped_action = nullptr; - - // The UntypedFindMatchingExpectation() function acquires and - // releases g_gmock_mutex. - - const ExpectationBase* const untyped_expectation = - this->UntypedFindMatchingExpectation(&args, &untyped_action, - &is_excessive, &ss, &why); - const bool found = untyped_expectation != nullptr; - - // True if and only if we need to print the call's arguments - // and return value. - // This definition must be kept in sync with the uses of Expect() - // and Log() in this function. - const bool need_to_report_call = - !found || is_excessive || LogIsVisible(kInfo); - if (!need_to_report_call) { - // Perform the action without printing the call information. - return PerformAction(untyped_action, std::move(args), ""); - } - - ss << " Function call: " << Name(); - this->UntypedPrintArgs(&args, &ss); - - // In case the action deletes a piece of the expectation, we - // generate the message beforehand. - if (found && !is_excessive) { - untyped_expectation->DescribeLocationTo(&loc); - } - - // Perform the action, print the result, and then fail or log in whatever way - // is appropriate. - // - // We use RAII to do the latter in case R is void or a non-moveable type. In - // either case we can't assign it to a local variable. - const Cleanup handle_failures([&] { - ss << "\n" << why.str(); - - if (!found) { - // No expectation matches this call - reports a failure. - Expect(false, nullptr, -1, ss.str()); - } else if (is_excessive) { - // We had an upper-bound violation and the failure message is in ss. - Expect(false, untyped_expectation->file(), untyped_expectation->line(), - ss.str()); - } else { - // We had an expected call and the matching expectation is - // described in ss. 
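The CallReaction fetched at the top of InvokeWith() above is what the NiceMock, NaggyMock, and StrictMock wrappers configure: allow uninteresting calls silently, warn about them, or fail the test. A sketch with the hypothetical MockCounter; the exact warning and failure text is gMock's, not reproduced here:

TEST(UninterestingCalls, ReactionFollowsMockStrictness) {
  ::testing::NiceMock<MockCounter> nice;
  nice.Next();  // kAllow: performed silently via the default action.

  ::testing::NaggyMock<MockCounter> naggy;
  naggy.Next();  // kWarn: performed, but a warning is printed.

  // ::testing::StrictMock<MockCounter> strict;
  // strict.Next();  // Would be reported as a test failure.
}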
- Log(kInfo, loc.str() + ss.str(), 2); - } - }); - - return PerformActionAndPrintResult(untyped_action, std::move(args), ss.str(), - ss); -} - -} // namespace internal - -namespace internal { - -template <typename F> -class MockFunction; - -template <typename R, typename... Args> -class MockFunction<R(Args...)> { - public: - MockFunction(const MockFunction&) = delete; - MockFunction& operator=(const MockFunction&) = delete; - - std::function<R(Args...)> AsStdFunction() { - return [this](Args... args) -> R { - return this->Call(std::forward<Args>(args)...); - }; - } - - // Implementation detail: the expansion of the MOCK_METHOD macro. - R Call(Args... args) { - mock_.SetOwnerAndName(this, "Call"); - return mock_.Invoke(std::forward<Args>(args)...); - } - - MockSpec<R(Args...)> gmock_Call(Matcher<Args>... m) { - mock_.RegisterOwner(this); - return mock_.With(std::move(m)...); - } - - MockSpec<R(Args...)> gmock_Call(const WithoutMatchers&, R (*)(Args...)) { - return this->gmock_Call(::testing::A<Args>()...); - } - - protected: - MockFunction() = default; - ~MockFunction() = default; - - private: - FunctionMocker<R(Args...)> mock_; -}; - -/* -The SignatureOf<F> struct is a meta-function returning function signature -corresponding to the provided F argument. - -It makes use of MockFunction easier by allowing it to accept more F arguments -than just function signatures. - -Specializations provided here cover a signature type itself and any template -that can be parameterized with a signature, including std::function and -boost::function. -*/ - -template <typename F, typename = void> -struct SignatureOf; - -template <typename R, typename... Args> -struct SignatureOf<R(Args...)> { - using type = R(Args...); -}; - -template <template <typename> class C, typename F> -struct SignatureOf<C<F>, - typename std::enable_if<std::is_function<F>::value>::type> - : SignatureOf<F> {}; - -template <typename F> -using SignatureOfT = typename SignatureOf<F>::type; - -} // namespace internal - -// A MockFunction<F> type has one mock method whose type is -// internal::SignatureOfT<F>. It is useful when you just want your -// test code to emit some messages and have Google Mock verify the -// right messages are sent (and perhaps at the right times). For -// example, if you are exercising code: -// -// Foo(1); -// Foo(2); -// Foo(3); -// -// and want to verify that Foo(1) and Foo(3) both invoke -// mock.Bar("a"), but Foo(2) doesn't invoke anything, you can write: -// -// TEST(FooTest, InvokesBarCorrectly) { -// MyMock mock; -// MockFunction<void(string check_point_name)> check; -// { -// InSequence s; -// -// EXPECT_CALL(mock, Bar("a")); -// EXPECT_CALL(check, Call("1")); -// EXPECT_CALL(check, Call("2")); -// EXPECT_CALL(mock, Bar("a")); -// } -// Foo(1); -// check.Call("1"); -// Foo(2); -// check.Call("2"); -// Foo(3); -// } -// -// The expectation spec says that the first Bar("a") must happen -// before check point "1", the second Bar("a") must happen after check -// point "2", and nothing should happen between the two check -// points. The explicit check points make it easy to tell which -// Bar("a") is called by which call to Foo(). -// -// MockFunction<F> can also be used to exercise code that accepts -// std::function<internal::SignatureOfT<F>> callbacks. To do so, use -// AsStdFunction() method to create std::function proxy forwarding to -// original object's Call. 
Example: -// -// TEST(FooTest, RunsCallbackWithBarArgument) { -// MockFunction<int(string)> callback; -// EXPECT_CALL(callback, Call("bar")).WillOnce(Return(1)); -// Foo(callback.AsStdFunction()); -// } -// -// The internal::SignatureOfT<F> indirection allows to use other types -// than just function signature type. This is typically useful when -// providing a mock for a predefined std::function type. Example: -// -// using FilterPredicate = std::function<bool(string)>; -// void MyFilterAlgorithm(FilterPredicate predicate); -// -// TEST(FooTest, FilterPredicateAlwaysAccepts) { -// MockFunction<FilterPredicate> predicateMock; -// EXPECT_CALL(predicateMock, Call(_)).WillRepeatedly(Return(true)); -// MyFilterAlgorithm(predicateMock.AsStdFunction()); -// } -template <typename F> -class MockFunction : public internal::MockFunction<internal::SignatureOfT<F>> { - using Base = internal::MockFunction<internal::SignatureOfT<F>>; - - public: - using Base::Base; -}; - -// The style guide prohibits "using" statements in a namespace scope -// inside a header file. However, the MockSpec class template is -// meant to be defined in the ::testing namespace. The following line -// is just a trick for working around a bug in MSVC 8.0, which cannot -// handle it if we define MockSpec in ::testing. -using internal::MockSpec; - -// Const(x) is a convenient function for obtaining a const reference -// to x. This is useful for setting expectations on an overloaded -// const mock method, e.g. -// -// class MockFoo : public FooInterface { -// public: -// MOCK_METHOD0(Bar, int()); -// MOCK_CONST_METHOD0(Bar, int&()); -// }; -// -// MockFoo foo; -// // Expects a call to non-const MockFoo::Bar(). -// EXPECT_CALL(foo, Bar()); -// // Expects a call to const MockFoo::Bar(). -// EXPECT_CALL(Const(foo), Bar()); -template <typename T> -inline const T& Const(const T& x) { - return x; -} - -// Constructs an Expectation object that references and co-owns exp. -inline Expectation::Expectation(internal::ExpectationBase& exp) // NOLINT - : expectation_base_(exp.GetHandle().expectation_base()) {} - -} // namespace testing - -GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 - -// Implementation for ON_CALL and EXPECT_CALL macros. A separate macro is -// required to avoid compile errors when the name of the method used in call is -// a result of macro expansion. See CompilesWithMethodNameExpandedFromMacro -// tests in internal/gmock-spec-builders_test.cc for more details. -// -// This macro supports statements both with and without parameter matchers. If -// the parameter list is omitted, gMock will accept any parameters, which allows -// tests to be written that don't need to encode the number of method -// parameter. This technique may only be used for non-overloaded methods. -// -// // These are the same: -// ON_CALL(mock, NoArgsMethod()).WillByDefault(...); -// ON_CALL(mock, NoArgsMethod).WillByDefault(...); -// -// // As are these: -// ON_CALL(mock, TwoArgsMethod(_, _)).WillByDefault(...); -// ON_CALL(mock, TwoArgsMethod).WillByDefault(...); -// -// // Can also specify args if you want, of course: -// ON_CALL(mock, TwoArgsMethod(_, 45)).WillByDefault(...); -// -// // Overloads work as long as you specify parameters: -// ON_CALL(mock, OverloadedMethod(_)).WillByDefault(...); -// ON_CALL(mock, OverloadedMethod(_, _)).WillByDefault(...); -// -// // Oops! Which overload did you want? 
-// ON_CALL(mock, OverloadedMethod).WillByDefault(...); -// => ERROR: call to member function 'gmock_OverloadedMethod' is ambiguous -// -// How this works: The mock class uses two overloads of the gmock_Method -// expectation setter method plus an operator() overload on the MockSpec object. -// In the matcher list form, the macro expands to: -// -// // This statement: -// ON_CALL(mock, TwoArgsMethod(_, 45))... -// -// // ...expands to: -// mock.gmock_TwoArgsMethod(_, 45)(WithoutMatchers(), nullptr)... -// |-------------v---------------||------------v-------------| -// invokes first overload swallowed by operator() -// -// // ...which is essentially: -// mock.gmock_TwoArgsMethod(_, 45)... -// -// Whereas the form without a matcher list: -// -// // This statement: -// ON_CALL(mock, TwoArgsMethod)... -// -// // ...expands to: -// mock.gmock_TwoArgsMethod(WithoutMatchers(), nullptr)... -// |-----------------------v--------------------------| -// invokes second overload -// -// // ...which is essentially: -// mock.gmock_TwoArgsMethod(_, _)... -// -// The WithoutMatchers() argument is used to disambiguate overloads and to -// block the caller from accidentally invoking the second overload directly. The -// second argument is an internal type derived from the method signature. The -// failure to disambiguate two overloads of this method in the ON_CALL statement -// is how we block callers from setting expectations on overloaded methods. -#define GMOCK_ON_CALL_IMPL_(mock_expr, Setter, call) \ - ((mock_expr).gmock_##call)(::testing::internal::GetWithoutMatchers(), \ - nullptr) \ - .Setter(__FILE__, __LINE__, #mock_expr, #call) - -#define ON_CALL(obj, call) \ - GMOCK_ON_CALL_IMPL_(obj, InternalDefaultActionSetAt, call) - -#define EXPECT_CALL(obj, call) \ - GMOCK_ON_CALL_IMPL_(obj, InternalExpectedAt, call) - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/gmock.h b/third_party/googletest/src/googlemock/include/gmock/gmock.h deleted file mode 100644 index 568c8c71d7..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/gmock.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This is the main header file a user should include. - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_ - -// This file implements the following syntax: -// -// ON_CALL(mock_object, Method(...)) -// .With(...) ? -// .WillByDefault(...); -// -// where With() is optional and WillByDefault() must appear exactly -// once. -// -// EXPECT_CALL(mock_object, Method(...)) -// .With(...) ? -// .Times(...) ? -// .InSequence(...) * -// .WillOnce(...) * -// .WillRepeatedly(...) ? -// .RetiresOnSaturation() ? ; -// -// where all clauses are optional and WillOnce() can be repeated. - -#include "gmock/gmock-actions.h" -#include "gmock/gmock-cardinalities.h" -#include "gmock/gmock-function-mocker.h" -#include "gmock/gmock-matchers.h" -#include "gmock/gmock-more-actions.h" -#include "gmock/gmock-more-matchers.h" -#include "gmock/gmock-nice-strict.h" -#include "gmock/internal/gmock-internal-utils.h" -#include "gmock/internal/gmock-port.h" - -// Declares Google Mock flags that we want a user to use programmatically. -GMOCK_DECLARE_bool_(catch_leaked_mocks); -GMOCK_DECLARE_string_(verbose); -GMOCK_DECLARE_int32_(default_mock_behavior); - -namespace testing { - -// Initializes Google Mock. This must be called before running the -// tests. In particular, it parses the command line for the flags -// that Google Mock recognizes. Whenever a Google Mock flag is seen, -// it is removed from argv, and *argc is decremented. -// -// No value is returned. Instead, the Google Mock flag variables are -// updated. -// -// Since Google Test is needed for Google Mock to work, this function -// also initializes Google Test and parses its flags, if that hasn't -// been done. -GTEST_API_ void InitGoogleMock(int* argc, char** argv); - -// This overloaded version can be used in Windows programs compiled in -// UNICODE mode. -GTEST_API_ void InitGoogleMock(int* argc, wchar_t** argv); - -// This overloaded version can be used on Arduino/embedded platforms where -// there is no argc/argv. -GTEST_API_ void InitGoogleMock(); - -} // namespace testing - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md b/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md deleted file mode 100644 index 9c4874fd0c..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Customization Points - -The custom directory is an injection point for custom user configurations. 
- -## Header `gmock-port.h` - -The following macros can be defined: - -### Flag related macros: - -* `GMOCK_DECLARE_bool_(name)` -* `GMOCK_DECLARE_int32_(name)` -* `GMOCK_DECLARE_string_(name)` -* `GMOCK_DEFINE_bool_(name, default_val, doc)` -* `GMOCK_DEFINE_int32_(name, default_val, doc)` -* `GMOCK_DEFINE_string_(name, default_val, doc)` -* `GMOCK_FLAG_GET(flag_name)` -* `GMOCK_FLAG_SET(flag_name, value)` diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h b/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h deleted file mode 100644 index bbcad31c76..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h +++ /dev/null @@ -1,7 +0,0 @@ -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_ - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h b/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h deleted file mode 100644 index bb7dcbaa4c..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2015, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Injection point for custom user configurations. 
See README for details - -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_ -#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h b/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h deleted file mode 100644 index f055f7506b..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2015, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Injection point for custom user configurations. See README for details -// -// ** Custom implementation starts here ** - -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_ - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h b/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h deleted file mode 100644 index b1343fdc82..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h +++ /dev/null @@ -1,476 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file defines some utilities useful for implementing Google -// Mock. They are subject to change without notice, so please DO NOT -// USE THEM IN USER CODE. - -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_ - -#include <stdio.h> - -#include <ostream> // NOLINT -#include <string> -#include <type_traits> -#include <vector> - -#include "gmock/internal/gmock-port.h" -#include "gtest/gtest.h" - -namespace testing { - -template <typename> -class Matcher; - -namespace internal { - -// Silence MSVC C4100 (unreferenced formal parameter) and -// C4805('==': unsafe mix of type 'const int' and type 'const bool') -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4100) -#pragma warning(disable : 4805) -#endif - -// Joins a vector of strings as if they are fields of a tuple; returns -// the joined string. -GTEST_API_ std::string JoinAsKeyValueTuple( - const std::vector<const char*>& names, const Strings& values); - -// Converts an identifier name to a space-separated list of lower-case -// words. Each maximum substring of the form [A-Za-z][a-z]*|\d+ is -// treated as one word. For example, both "FooBar123" and -// "foo_bar_123" are converted to "foo bar 123". -GTEST_API_ std::string ConvertIdentifierNameToWords(const char* id_name); - -// GetRawPointer(p) returns the raw pointer underlying p when p is a -// smart pointer, or returns p itself when p is already a raw pointer. -// The following default implementation is for the smart pointer case. -template <typename Pointer> -inline const typename Pointer::element_type* GetRawPointer(const Pointer& p) { - return p.get(); -} -// This overload version is for std::reference_wrapper, which does not work with -// the overload above, as it does not have an `element_type`. -template <typename Element> -inline const Element* GetRawPointer(const std::reference_wrapper<Element>& r) { - return &r.get(); -} - -// This overloaded version is for the raw pointer case. 
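The GetRawPointer() overloads above (and the raw-pointer one that follows) normalize smart pointers, std::reference_wrapper, and raw pointers to a plain pointer so matcher code can treat them uniformly. A small illustration, assuming the overloads are visible via their internal namespace:

#include <functional>
#include <memory>

void Demo() {
  auto owned = std::make_unique<int>(5);
  int raw = 7;
  const int* p1 = testing::internal::GetRawPointer(owned);          // smart pointer
  const int* p2 = testing::internal::GetRawPointer(std::ref(raw));  // reference_wrapper
  int* p3 = testing::internal::GetRawPointer(&raw);                 // raw pointer
  (void)p1; (void)p2; (void)p3;
}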
-template <typename Element> -inline Element* GetRawPointer(Element* p) { - return p; -} - -// MSVC treats wchar_t as a native type usually, but treats it as the -// same as unsigned short when the compiler option /Zc:wchar_t- is -// specified. It defines _NATIVE_WCHAR_T_DEFINED symbol when wchar_t -// is a native type. -#if defined(_MSC_VER) && !defined(_NATIVE_WCHAR_T_DEFINED) -// wchar_t is a typedef. -#else -#define GMOCK_WCHAR_T_IS_NATIVE_ 1 -#endif - -// In what follows, we use the term "kind" to indicate whether a type -// is bool, an integer type (excluding bool), a floating-point type, -// or none of them. This categorization is useful for determining -// when a matcher argument type can be safely converted to another -// type in the implementation of SafeMatcherCast. -enum TypeKind { kBool, kInteger, kFloatingPoint, kOther }; - -// KindOf<T>::value is the kind of type T. -template <typename T> -struct KindOf { - enum { value = kOther }; // The default kind. -}; - -// This macro declares that the kind of 'type' is 'kind'. -#define GMOCK_DECLARE_KIND_(type, kind) \ - template <> \ - struct KindOf<type> { \ - enum { value = kind }; \ - } - -GMOCK_DECLARE_KIND_(bool, kBool); - -// All standard integer types. -GMOCK_DECLARE_KIND_(char, kInteger); -GMOCK_DECLARE_KIND_(signed char, kInteger); -GMOCK_DECLARE_KIND_(unsigned char, kInteger); -GMOCK_DECLARE_KIND_(short, kInteger); // NOLINT -GMOCK_DECLARE_KIND_(unsigned short, kInteger); // NOLINT -GMOCK_DECLARE_KIND_(int, kInteger); -GMOCK_DECLARE_KIND_(unsigned int, kInteger); -GMOCK_DECLARE_KIND_(long, kInteger); // NOLINT -GMOCK_DECLARE_KIND_(unsigned long, kInteger); // NOLINT -GMOCK_DECLARE_KIND_(long long, kInteger); // NOLINT -GMOCK_DECLARE_KIND_(unsigned long long, kInteger); // NOLINT - -#if GMOCK_WCHAR_T_IS_NATIVE_ -GMOCK_DECLARE_KIND_(wchar_t, kInteger); -#endif - -// All standard floating-point types. -GMOCK_DECLARE_KIND_(float, kFloatingPoint); -GMOCK_DECLARE_KIND_(double, kFloatingPoint); -GMOCK_DECLARE_KIND_(long double, kFloatingPoint); - -#undef GMOCK_DECLARE_KIND_ - -// Evaluates to the kind of 'type'. -#define GMOCK_KIND_OF_(type) \ - static_cast< ::testing::internal::TypeKind>( \ - ::testing::internal::KindOf<type>::value) - -// LosslessArithmeticConvertibleImpl<kFromKind, From, kToKind, To>::value -// is true if and only if arithmetic type From can be losslessly converted to -// arithmetic type To. -// -// It's the user's responsibility to ensure that both From and To are -// raw (i.e. has no CV modifier, is not a pointer, and is not a -// reference) built-in arithmetic types, kFromKind is the kind of -// From, and kToKind is the kind of To; the value is -// implementation-defined when the above pre-condition is violated. -template <TypeKind kFromKind, typename From, TypeKind kToKind, typename To> -using LosslessArithmeticConvertibleImpl = std::integral_constant< - bool, - // clang-format off - // Converting from bool is always lossless - (kFromKind == kBool) ? true - // Converting between any other type kinds will be lossy if the type - // kinds are not the same. - : (kFromKind != kToKind) ? false - : (kFromKind == kInteger && - // Converting between integers of different widths is allowed so long - // as the conversion does not go from signed to unsigned. - (((sizeof(From) < sizeof(To)) && - !(std::is_signed<From>::value && !std::is_signed<To>::value)) || - // Converting between integers of the same width only requires the - // two types to have the same signedness. 
- ((sizeof(From) == sizeof(To)) && - (std::is_signed<From>::value == std::is_signed<To>::value))) - ) ? true - // Floating point conversions are lossless if and only if `To` is at least - // as wide as `From`. - : (kFromKind == kFloatingPoint && (sizeof(From) <= sizeof(To))) ? true - : false - // clang-format on - >; - -// LosslessArithmeticConvertible<From, To>::value is true if and only if -// arithmetic type From can be losslessly converted to arithmetic type To. -// -// It's the user's responsibility to ensure that both From and To are -// raw (i.e. has no CV modifier, is not a pointer, and is not a -// reference) built-in arithmetic types; the value is -// implementation-defined when the above pre-condition is violated. -template <typename From, typename To> -using LosslessArithmeticConvertible = - LosslessArithmeticConvertibleImpl<GMOCK_KIND_OF_(From), From, - GMOCK_KIND_OF_(To), To>; - -// This interface knows how to report a Google Mock failure (either -// non-fatal or fatal). -class FailureReporterInterface { - public: - // The type of a failure (either non-fatal or fatal). - enum FailureType { kNonfatal, kFatal }; - - virtual ~FailureReporterInterface() {} - - // Reports a failure that occurred at the given source file location. - virtual void ReportFailure(FailureType type, const char* file, int line, - const std::string& message) = 0; -}; - -// Returns the failure reporter used by Google Mock. -GTEST_API_ FailureReporterInterface* GetFailureReporter(); - -// Asserts that condition is true; aborts the process with the given -// message if condition is false. We cannot use LOG(FATAL) or CHECK() -// as Google Mock might be used to mock the log sink itself. We -// inline this function to prevent it from showing up in the stack -// trace. -inline void Assert(bool condition, const char* file, int line, - const std::string& msg) { - if (!condition) { - GetFailureReporter()->ReportFailure(FailureReporterInterface::kFatal, file, - line, msg); - } -} -inline void Assert(bool condition, const char* file, int line) { - Assert(condition, file, line, "Assertion failed."); -} - -// Verifies that condition is true; generates a non-fatal failure if -// condition is false. -inline void Expect(bool condition, const char* file, int line, - const std::string& msg) { - if (!condition) { - GetFailureReporter()->ReportFailure(FailureReporterInterface::kNonfatal, - file, line, msg); - } -} -inline void Expect(bool condition, const char* file, int line) { - Expect(condition, file, line, "Expectation failed."); -} - -// Severity level of a log. -enum LogSeverity { kInfo = 0, kWarning = 1 }; - -// Valid values for the --gmock_verbose flag. - -// All logs (informational and warnings) are printed. -const char kInfoVerbosity[] = "info"; -// Only warnings are printed. -const char kWarningVerbosity[] = "warning"; -// No logs are printed. -const char kErrorVerbosity[] = "error"; - -// Returns true if and only if a log with the given severity is visible -// according to the --gmock_verbose flag. -GTEST_API_ bool LogIsVisible(LogSeverity severity); - -// Prints the given message to stdout if and only if 'severity' >= the level -// specified by the --gmock_verbose flag. If stack_frames_to_skip >= -// 0, also prints the stack trace excluding the top -// stack_frames_to_skip frames. In opt mode, any positive -// stack_frames_to_skip is treated as 0, since we don't know which -// function calls will be inlined by the compiler and need to be -// conservative. 
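The conversion matrix encoded by LosslessArithmeticConvertible above can be pinned down with a few static_asserts. These are illustrative only and rely on internal names that are subject to change:

using testing::internal::LosslessArithmeticConvertible;

static_assert(LosslessArithmeticConvertible<bool, int>::value,
              "converting from bool is always lossless");
static_assert(!LosslessArithmeticConvertible<int, unsigned int>::value,
              "same width but different signedness is lossy");
static_assert(LosslessArithmeticConvertible<float, double>::value,
              "widening a floating-point type is lossless");
static_assert(!LosslessArithmeticConvertible<double, float>::value,
              "narrowing a floating-point type is lossy");
static_assert(!LosslessArithmeticConvertible<int, double>::value,
              "changing the type kind is treated as lossy");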
-GTEST_API_ void Log(LogSeverity severity, const std::string& message, - int stack_frames_to_skip); - -// A marker class that is used to resolve parameterless expectations to the -// correct overload. This must not be instantiable, to prevent client code from -// accidentally resolving to the overload; for example: -// -// ON_CALL(mock, Method({}, nullptr))... -// -class WithoutMatchers { - private: - WithoutMatchers() {} - friend GTEST_API_ WithoutMatchers GetWithoutMatchers(); -}; - -// Internal use only: access the singleton instance of WithoutMatchers. -GTEST_API_ WithoutMatchers GetWithoutMatchers(); - -// Disable MSVC warnings for infinite recursion, since in this case the -// recursion is unreachable. -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4717) -#endif - -// Invalid<T>() is usable as an expression of type T, but will terminate -// the program with an assertion failure if actually run. This is useful -// when a value of type T is needed for compilation, but the statement -// will not really be executed (or we don't care if the statement -// crashes). -template <typename T> -inline T Invalid() { - Assert(false, "", -1, "Internal error: attempt to return invalid value"); -#if defined(__GNUC__) || defined(__clang__) - __builtin_unreachable(); -#elif defined(_MSC_VER) - __assume(0); -#else - return Invalid<T>(); -#endif -} - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -// Given a raw type (i.e. having no top-level reference or const -// modifier) RawContainer that's either an STL-style container or a -// native array, class StlContainerView<RawContainer> has the -// following members: -// -// - type is a type that provides an STL-style container view to -// (i.e. implements the STL container concept for) RawContainer; -// - const_reference is a type that provides a reference to a const -// RawContainer; -// - ConstReference(raw_container) returns a const reference to an STL-style -// container view to raw_container, which is a RawContainer. -// - Copy(raw_container) returns an STL-style container view of a -// copy of raw_container, which is a RawContainer. -// -// This generic version is used when RawContainer itself is already an -// STL-style container. -template <class RawContainer> -class StlContainerView { - public: - typedef RawContainer type; - typedef const type& const_reference; - - static const_reference ConstReference(const RawContainer& container) { - static_assert(!std::is_const<RawContainer>::value, - "RawContainer type must not be const"); - return container; - } - static type Copy(const RawContainer& container) { return container; } -}; - -// This specialization is used when RawContainer is a native array type. -template <typename Element, size_t N> -class StlContainerView<Element[N]> { - public: - typedef typename std::remove_const<Element>::type RawElement; - typedef internal::NativeArray<RawElement> type; - // NativeArray<T> can represent a native array either by value or by - // reference (selected by a constructor argument), so 'const type' - // can be used to reference a const native array. We cannot - // 'typedef const type& const_reference' here, as that would mean - // ConstReference() has to return a reference to a local variable. 
- typedef const type const_reference; - - static const_reference ConstReference(const Element (&array)[N]) { - static_assert(std::is_same<Element, RawElement>::value, - "Element type must not be const"); - return type(array, N, RelationToSourceReference()); - } - static type Copy(const Element (&array)[N]) { - return type(array, N, RelationToSourceCopy()); - } -}; - -// This specialization is used when RawContainer is a native array -// represented as a (pointer, size) tuple. -template <typename ElementPointer, typename Size> -class StlContainerView< ::std::tuple<ElementPointer, Size> > { - public: - typedef typename std::remove_const< - typename std::pointer_traits<ElementPointer>::element_type>::type - RawElement; - typedef internal::NativeArray<RawElement> type; - typedef const type const_reference; - - static const_reference ConstReference( - const ::std::tuple<ElementPointer, Size>& array) { - return type(std::get<0>(array), std::get<1>(array), - RelationToSourceReference()); - } - static type Copy(const ::std::tuple<ElementPointer, Size>& array) { - return type(std::get<0>(array), std::get<1>(array), RelationToSourceCopy()); - } -}; - -// The following specialization prevents the user from instantiating -// StlContainer with a reference type. -template <typename T> -class StlContainerView<T&>; - -// A type transform to remove constness from the first part of a pair. -// Pairs like that are used as the value_type of associative containers, -// and this transform produces a similar but assignable pair. -template <typename T> -struct RemoveConstFromKey { - typedef T type; -}; - -// Partially specialized to remove constness from std::pair<const K, V>. -template <typename K, typename V> -struct RemoveConstFromKey<std::pair<const K, V> > { - typedef std::pair<K, V> type; -}; - -// Emit an assertion failure due to incorrect DoDefault() usage. Out-of-lined to -// reduce code size. -GTEST_API_ void IllegalDoDefault(const char* file, int line); - -template <typename F, typename Tuple, size_t... Idx> -auto ApplyImpl(F&& f, Tuple&& args, IndexSequence<Idx...>) - -> decltype(std::forward<F>(f)( - std::get<Idx>(std::forward<Tuple>(args))...)) { - return std::forward<F>(f)(std::get<Idx>(std::forward<Tuple>(args))...); -} - -// Apply the function to a tuple of arguments. -template <typename F, typename Tuple> -auto Apply(F&& f, Tuple&& args) -> decltype(ApplyImpl( - std::forward<F>(f), std::forward<Tuple>(args), - MakeIndexSequence<std::tuple_size< - typename std::remove_reference<Tuple>::type>::value>())) { - return ApplyImpl(std::forward<F>(f), std::forward<Tuple>(args), - MakeIndexSequence<std::tuple_size< - typename std::remove_reference<Tuple>::type>::value>()); -} - -// Template struct Function<F>, where F must be a function type, contains -// the following typedefs: -// -// Result: the function's return type. -// Arg<N>: the type of the N-th argument, where N starts with 0. -// ArgumentTuple: the tuple type consisting of all parameters of F. -// ArgumentMatcherTuple: the tuple type consisting of Matchers for all -// parameters of F. -// MakeResultVoid: the function type obtained by substituting void -// for the return type of F. -// MakeResultIgnoredValue: -// the function type obtained by substituting Something -// for the return type of F. -template <typename T> -struct Function; - -template <typename R, typename... 
Args> -struct Function<R(Args...)> { - using Result = R; - static constexpr size_t ArgumentCount = sizeof...(Args); - template <size_t I> - using Arg = ElemFromList<I, Args...>; - using ArgumentTuple = std::tuple<Args...>; - using ArgumentMatcherTuple = std::tuple<Matcher<Args>...>; - using MakeResultVoid = void(Args...); - using MakeResultIgnoredValue = IgnoredValue(Args...); -}; - -template <typename R, typename... Args> -constexpr size_t Function<R(Args...)>::ArgumentCount; - -bool Base64Unescape(const std::string& encoded, std::string* decoded); - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -} // namespace internal -} // namespace testing - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_ diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h b/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h deleted file mode 100644 index bc18a25f34..0000000000 --- a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Low-level types and utilities for porting Google Mock to various -// platforms. All macros ending with _ and symbols defined in an -// internal namespace are subject to change without notice. Code -// outside Google Mock MUST NOT USE THEM DIRECTLY. Macros that don't -// end with _ are part of Google Mock's public API and can be used by -// code outside Google Mock. - -// IWYU pragma: private, include "gmock/gmock.h" -// IWYU pragma: friend gmock/.* - -#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_ -#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_ - -#include <assert.h> -#include <stdlib.h> -#include <cstdint> -#include <iostream> - -// Most of the utilities needed for porting Google Mock are also -// required for Google Test and are defined in gtest-port.h. 
-// -// Note to maintainers: to reduce code duplication, prefer adding -// portability utilities to Google Test's gtest-port.h instead of -// here, as Google Mock depends on Google Test. Only add a utility -// here if it's truly specific to Google Mock. - -#include "gmock/internal/custom/gmock-port.h" -#include "gtest/internal/gtest-port.h" - -#if GTEST_HAS_ABSL -#include "absl/flags/declare.h" -#include "absl/flags/flag.h" -#endif - -// For MS Visual C++, check the compiler version. At least VS 2015 is -// required to compile Google Mock. -#if defined(_MSC_VER) && _MSC_VER < 1900 -#error "At least Visual C++ 2015 (14.0) is required to compile Google Mock." -#endif - -// Macro for referencing flags. This is public as we want the user to -// use this syntax to reference Google Mock flags. -#define GMOCK_FLAG_NAME_(name) gmock_##name -#define GMOCK_FLAG(name) FLAGS_gmock_##name - -// Pick a command line flags implementation. -#if GTEST_HAS_ABSL - -// Macros for defining flags. -#define GMOCK_DEFINE_bool_(name, default_val, doc) \ - ABSL_FLAG(bool, GMOCK_FLAG_NAME_(name), default_val, doc) -#define GMOCK_DEFINE_int32_(name, default_val, doc) \ - ABSL_FLAG(int32_t, GMOCK_FLAG_NAME_(name), default_val, doc) -#define GMOCK_DEFINE_string_(name, default_val, doc) \ - ABSL_FLAG(std::string, GMOCK_FLAG_NAME_(name), default_val, doc) - -// Macros for declaring flags. -#define GMOCK_DECLARE_bool_(name) \ - ABSL_DECLARE_FLAG(bool, GMOCK_FLAG_NAME_(name)) -#define GMOCK_DECLARE_int32_(name) \ - ABSL_DECLARE_FLAG(int32_t, GMOCK_FLAG_NAME_(name)) -#define GMOCK_DECLARE_string_(name) \ - ABSL_DECLARE_FLAG(std::string, GMOCK_FLAG_NAME_(name)) - -#define GMOCK_FLAG_GET(name) ::absl::GetFlag(GMOCK_FLAG(name)) -#define GMOCK_FLAG_SET(name, value) \ - (void)(::absl::SetFlag(&GMOCK_FLAG(name), value)) - -#else // GTEST_HAS_ABSL - -// Macros for defining flags. -#define GMOCK_DEFINE_bool_(name, default_val, doc) \ - namespace testing { \ - GTEST_API_ bool GMOCK_FLAG(name) = (default_val); \ - } \ - static_assert(true, "no-op to require trailing semicolon") -#define GMOCK_DEFINE_int32_(name, default_val, doc) \ - namespace testing { \ - GTEST_API_ int32_t GMOCK_FLAG(name) = (default_val); \ - } \ - static_assert(true, "no-op to require trailing semicolon") -#define GMOCK_DEFINE_string_(name, default_val, doc) \ - namespace testing { \ - GTEST_API_ ::std::string GMOCK_FLAG(name) = (default_val); \ - } \ - static_assert(true, "no-op to require trailing semicolon") - -// Macros for declaring flags. 
-#define GMOCK_DECLARE_bool_(name)          \
-  namespace testing {                      \
-  GTEST_API_ extern bool GMOCK_FLAG(name); \
-  }                                        \
-  static_assert(true, "no-op to require trailing semicolon")
-#define GMOCK_DECLARE_int32_(name)            \
-  namespace testing {                         \
-  GTEST_API_ extern int32_t GMOCK_FLAG(name); \
-  }                                           \
-  static_assert(true, "no-op to require trailing semicolon")
-#define GMOCK_DECLARE_string_(name)                 \
-  namespace testing {                               \
-  GTEST_API_ extern ::std::string GMOCK_FLAG(name); \
-  }                                                 \
-  static_assert(true, "no-op to require trailing semicolon")
-
-#define GMOCK_FLAG_GET(name) ::testing::GMOCK_FLAG(name)
-#define GMOCK_FLAG_SET(name, value) (void)(::testing::GMOCK_FLAG(name) = value)
-
-#endif  // GTEST_HAS_ABSL
-
-#endif  // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
diff --git a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h b/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h
deleted file mode 100644
index 94d61c09c8..0000000000
--- a/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h
+++ /dev/null
@@ -1,279 +0,0 @@
-#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PP_H_
-#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PP_H_
-
-// Expands and concatenates the arguments. Constructed macros reevaluate.
-#define GMOCK_PP_CAT(_1, _2) GMOCK_PP_INTERNAL_CAT(_1, _2)
-
-// Expands and stringifies the only argument.
-#define GMOCK_PP_STRINGIZE(...) GMOCK_PP_INTERNAL_STRINGIZE(__VA_ARGS__)
-
-// Returns empty. Given a variadic number of arguments.
-#define GMOCK_PP_EMPTY(...)
-
-// Returns a comma. Given a variadic number of arguments.
-#define GMOCK_PP_COMMA(...) ,
-
-// Returns the only argument.
-#define GMOCK_PP_IDENTITY(_1) _1
-
-// Evaluates to the number of arguments after expansion.
-//
-//   #define PAIR x, y
-//
-//   GMOCK_PP_NARG() => 1
-//   GMOCK_PP_NARG(x) => 1
-//   GMOCK_PP_NARG(x, y) => 2
-//   GMOCK_PP_NARG(PAIR) => 2
-//
-// Requires: the number of arguments after expansion is at most 15.
-#define GMOCK_PP_NARG(...) \
-  GMOCK_PP_INTERNAL_16TH( \
-      (__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
-
-// Returns 1 if the expansion of arguments has an unprotected comma. Otherwise
-// returns 0. Requires no more than 15 unprotected commas.
-#define GMOCK_PP_HAS_COMMA(...) \
-  GMOCK_PP_INTERNAL_16TH( \
-      (__VA_ARGS__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0))
-
-// Returns the first argument.
-#define GMOCK_PP_HEAD(...) GMOCK_PP_INTERNAL_HEAD((__VA_ARGS__, unusedArg))
-
-// Returns the tail. A variadic list of all arguments minus the first. Requires
-// at least one argument.
-#define GMOCK_PP_TAIL(...) GMOCK_PP_INTERNAL_TAIL((__VA_ARGS__))
-
-// Calls CAT(_Macro, NARG(__VA_ARGS__))(__VA_ARGS__)
-#define GMOCK_PP_VARIADIC_CALL(_Macro, ...) \
-  GMOCK_PP_IDENTITY( \
-      GMOCK_PP_CAT(_Macro, GMOCK_PP_NARG(__VA_ARGS__))(__VA_ARGS__))
-
-// If the arguments after expansion have no tokens, evaluates to `1`. Otherwise
-// evaluates to `0`.
-//
-// Requires: * the number of arguments after expansion is at most 15.
-//           * If the argument is a macro, it must be able to be called with one
-//             argument.
-//
-// Implementation details:
-//
-// There is one case when it generates a compile error: if the argument is a
-// macro that cannot be called with one argument.
-//
-//   #define M(a, b)  // it doesn't matter what it expands to
-//
-//   // Expected: expands to `0`.
-//   // Actual: compile error.
-//   GMOCK_PP_IS_EMPTY(M)
-//
-// There are 4 cases tested:
-//
-// * __VA_ARGS__ possible expansion has no unparen'd commas. Expected 0.
-// * __VA_ARGS__ possible expansion is not enclosed in parenthesis. Expected 0.
-// * __VA_ARGS__ possible expansion is not a macro that ()-evaluates to a comma.
-//   Expected 0.
-// * __VA_ARGS__ is empty, or has unparen'd commas, or is enclosed in
-//   parenthesis, or is a macro that ()-evaluates to comma. Expected 1.
-//
-// We trigger detection on '0001', i.e. on empty.
-#define GMOCK_PP_IS_EMPTY(...)                                               \
-  GMOCK_PP_INTERNAL_IS_EMPTY(GMOCK_PP_HAS_COMMA(__VA_ARGS__),                \
-                             GMOCK_PP_HAS_COMMA(GMOCK_PP_COMMA __VA_ARGS__), \
-                             GMOCK_PP_HAS_COMMA(__VA_ARGS__()),              \
-                             GMOCK_PP_HAS_COMMA(GMOCK_PP_COMMA __VA_ARGS__()))
-
-// Evaluates to _Then if _Cond is 1 and _Else if _Cond is 0.
-#define GMOCK_PP_IF(_Cond, _Then, _Else) \
-  GMOCK_PP_CAT(GMOCK_PP_INTERNAL_IF_, _Cond)(_Then, _Else)
-
-// Similar to GMOCK_PP_IF but takes _Then and _Else in parentheses.
-//
-// GMOCK_PP_GENERIC_IF(1, (a, b, c), (d, e, f)) => a, b, c
-// GMOCK_PP_GENERIC_IF(0, (a, b, c), (d, e, f)) => d, e, f
-//
-#define GMOCK_PP_GENERIC_IF(_Cond, _Then, _Else) \
-  GMOCK_PP_REMOVE_PARENS(GMOCK_PP_IF(_Cond, _Then, _Else))
-
-// Evaluates to the number of arguments after expansion. Identifies 'empty' as
-// 0.
-//
-//   #define PAIR x, y
-//
-//   GMOCK_PP_NARG0() => 0
-//   GMOCK_PP_NARG0(x) => 1
-//   GMOCK_PP_NARG0(x, y) => 2
-//   GMOCK_PP_NARG0(PAIR) => 2
-//
-// Requires: * the number of arguments after expansion is at most 15.
-//           * If the argument is a macro, it must be able to be called with one
-//             argument.
-#define GMOCK_PP_NARG0(...) \
-  GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(__VA_ARGS__), 0, GMOCK_PP_NARG(__VA_ARGS__))
-
-// Expands to 1 if the first argument starts with something in parentheses,
-// otherwise to 0.
-#define GMOCK_PP_IS_BEGIN_PARENS(...)                              \
-  GMOCK_PP_HEAD(GMOCK_PP_CAT(GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_R_, \
-                             GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_C __VA_ARGS__))
-
-// Expands to 1 if there is only one argument and it is enclosed in parentheses.
-#define GMOCK_PP_IS_ENCLOSED_PARENS(...)             \
-  GMOCK_PP_IF(GMOCK_PP_IS_BEGIN_PARENS(__VA_ARGS__), \
-              GMOCK_PP_IS_EMPTY(GMOCK_PP_EMPTY __VA_ARGS__), 0)
-
-// Remove the parens, requires GMOCK_PP_IS_ENCLOSED_PARENS(args) => 1.
-#define GMOCK_PP_REMOVE_PARENS(...) GMOCK_PP_INTERNAL_REMOVE_PARENS __VA_ARGS__
-
-// Expands to _Macro(0, _Data, e1) _Macro(1, _Data, e2) ... _Macro(K - 1, _Data,
-// eK), where K is GMOCK_PP_NARG0 _Tuple.
-// Requires: * |_Macro| can be called with 3 arguments.
-//           * |_Tuple| expansion has no more than 15 elements.
-#define GMOCK_PP_FOR_EACH(_Macro, _Data, _Tuple)                        \
-  GMOCK_PP_CAT(GMOCK_PP_INTERNAL_FOR_EACH_IMPL_, GMOCK_PP_NARG0 _Tuple) \
-  (0, _Macro, _Data, _Tuple)
-
-// Expands to _Macro(0, _Data, ) _Macro(1, _Data, ) ... _Macro(K - 1, _Data, )
-// Empty if _N = 0.
-// Requires: * |_Macro| can be called with 3 arguments.
-//           * |_N| is a literal between 0 and 15.
-#define GMOCK_PP_REPEAT(_Macro, _Data, _N)           \
-  GMOCK_PP_CAT(GMOCK_PP_INTERNAL_FOR_EACH_IMPL_, _N) \
-  (0, _Macro, _Data, GMOCK_PP_INTENRAL_EMPTY_TUPLE)
-
-// Increments the argument, requires the argument to be between 0 and 15.
-#define GMOCK_PP_INC(_i) GMOCK_PP_CAT(GMOCK_PP_INTERNAL_INC_, _i)
-
-// Returns comma if _i != 0. Requires _i to be between 0 and 15.
-#define GMOCK_PP_COMMA_IF(_i) GMOCK_PP_CAT(GMOCK_PP_INTERNAL_COMMA_IF_, _i)
-
-// Internal details follow. Do not use any of these symbols outside of this
-// file or we will break your code.
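Taken together, the public macros documented above compose as in the following sketch. The DECLARE_FIELD helper and the Fields struct are hypothetical illustrations written for this note; only the GMOCK_PP_* names come from the header being removed.

#include "gmock/internal/gmock-pp.h"

#define PAIR x, y
static_assert(GMOCK_PP_NARG(x) == 1, "counts one argument");
static_assert(GMOCK_PP_NARG(PAIR) == 2, "PAIR expands before counting");
static_assert(GMOCK_PP_NARG0() == 0, "NARG0 identifies 'empty' as 0");
static_assert(GMOCK_PP_IS_ENCLOSED_PARENS((x)) == 1,
              "a single parenthesized argument");

// Hypothetical helper: declares one int member per tuple element,
// initialized to that element's index in the tuple.
#define DECLARE_FIELD(i, _Data, _element) int _element = i;

struct Fields {
  GMOCK_PP_FOR_EACH(DECLARE_FIELD, unused, (a, b, c))
  // Expands to: int a = 0; int b = 1; int c = 2;
};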
-#define GMOCK_PP_INTENRAL_EMPTY_TUPLE (, , , , , , , , , , , , , , , ) -#define GMOCK_PP_INTERNAL_CAT(_1, _2) _1##_2 -#define GMOCK_PP_INTERNAL_STRINGIZE(...) #__VA_ARGS__ -#define GMOCK_PP_INTERNAL_CAT_5(_1, _2, _3, _4, _5) _1##_2##_3##_4##_5 -#define GMOCK_PP_INTERNAL_IS_EMPTY(_1, _2, _3, _4) \ - GMOCK_PP_HAS_COMMA(GMOCK_PP_INTERNAL_CAT_5(GMOCK_PP_INTERNAL_IS_EMPTY_CASE_, \ - _1, _2, _3, _4)) -#define GMOCK_PP_INTERNAL_IS_EMPTY_CASE_0001 , -#define GMOCK_PP_INTERNAL_IF_1(_Then, _Else) _Then -#define GMOCK_PP_INTERNAL_IF_0(_Then, _Else) _Else - -// Because of MSVC treating a token with a comma in it as a single token when -// passed to another macro, we need to force it to evaluate it as multiple -// tokens. We do that by using a "IDENTITY(MACRO PARENTHESIZED_ARGS)" macro. We -// define one per possible macro that relies on this behavior. Note "_Args" must -// be parenthesized. -#define GMOCK_PP_INTERNAL_INTERNAL_16TH(_1, _2, _3, _4, _5, _6, _7, _8, _9, \ - _10, _11, _12, _13, _14, _15, _16, \ - ...) \ - _16 -#define GMOCK_PP_INTERNAL_16TH(_Args) \ - GMOCK_PP_IDENTITY(GMOCK_PP_INTERNAL_INTERNAL_16TH _Args) -#define GMOCK_PP_INTERNAL_INTERNAL_HEAD(_1, ...) _1 -#define GMOCK_PP_INTERNAL_HEAD(_Args) \ - GMOCK_PP_IDENTITY(GMOCK_PP_INTERNAL_INTERNAL_HEAD _Args) -#define GMOCK_PP_INTERNAL_INTERNAL_TAIL(_1, ...) __VA_ARGS__ -#define GMOCK_PP_INTERNAL_TAIL(_Args) \ - GMOCK_PP_IDENTITY(GMOCK_PP_INTERNAL_INTERNAL_TAIL _Args) - -#define GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_C(...) 1 _ -#define GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_R_1 1, -#define GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_R_GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_C \ - 0, -#define GMOCK_PP_INTERNAL_REMOVE_PARENS(...) __VA_ARGS__ -#define GMOCK_PP_INTERNAL_INC_0 1 -#define GMOCK_PP_INTERNAL_INC_1 2 -#define GMOCK_PP_INTERNAL_INC_2 3 -#define GMOCK_PP_INTERNAL_INC_3 4 -#define GMOCK_PP_INTERNAL_INC_4 5 -#define GMOCK_PP_INTERNAL_INC_5 6 -#define GMOCK_PP_INTERNAL_INC_6 7 -#define GMOCK_PP_INTERNAL_INC_7 8 -#define GMOCK_PP_INTERNAL_INC_8 9 -#define GMOCK_PP_INTERNAL_INC_9 10 -#define GMOCK_PP_INTERNAL_INC_10 11 -#define GMOCK_PP_INTERNAL_INC_11 12 -#define GMOCK_PP_INTERNAL_INC_12 13 -#define GMOCK_PP_INTERNAL_INC_13 14 -#define GMOCK_PP_INTERNAL_INC_14 15 -#define GMOCK_PP_INTERNAL_INC_15 16 -#define GMOCK_PP_INTERNAL_COMMA_IF_0 -#define GMOCK_PP_INTERNAL_COMMA_IF_1 , -#define GMOCK_PP_INTERNAL_COMMA_IF_2 , -#define GMOCK_PP_INTERNAL_COMMA_IF_3 , -#define GMOCK_PP_INTERNAL_COMMA_IF_4 , -#define GMOCK_PP_INTERNAL_COMMA_IF_5 , -#define GMOCK_PP_INTERNAL_COMMA_IF_6 , -#define GMOCK_PP_INTERNAL_COMMA_IF_7 , -#define GMOCK_PP_INTERNAL_COMMA_IF_8 , -#define GMOCK_PP_INTERNAL_COMMA_IF_9 , -#define GMOCK_PP_INTERNAL_COMMA_IF_10 , -#define GMOCK_PP_INTERNAL_COMMA_IF_11 , -#define GMOCK_PP_INTERNAL_COMMA_IF_12 , -#define GMOCK_PP_INTERNAL_COMMA_IF_13 , -#define GMOCK_PP_INTERNAL_COMMA_IF_14 , -#define GMOCK_PP_INTERNAL_COMMA_IF_15 , -#define GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, _element) \ - _Macro(_i, _Data, _element) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_0(_i, _Macro, _Data, _Tuple) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_1(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_2(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_1(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_3(_i, _Macro, _Data, _Tuple) \ - 
GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_2(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_4(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_3(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_5(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_4(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_6(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_5(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_7(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_6(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_8(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_7(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_9(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_8(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_10(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_9(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_11(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_10(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_12(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_11(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_13(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_12(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_14(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_13(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) -#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_15(_i, _Macro, _Data, _Tuple) \ - GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \ - GMOCK_PP_INTERNAL_FOR_EACH_IMPL_14(GMOCK_PP_INC(_i), _Macro, _Data, \ - (GMOCK_PP_TAIL _Tuple)) - -#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PP_H_ diff --git a/third_party/googletest/src/googlemock/src/gmock-all.cc b/third_party/googletest/src/googlemock/src/gmock-all.cc deleted file mode 100644 index e43c9b7b4c..0000000000 --- a/third_party/googletest/src/googlemock/src/gmock-all.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2008, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// -// Google C++ Mocking Framework (Google Mock) -// -// This file #includes all Google Mock implementation .cc files. The -// purpose is to allow a user to build Google Mock by compiling this -// file alone. - -// This line ensures that gmock.h can be compiled on its own, even -// when it's fused. -#include "gmock/gmock.h" - -// The following lines pull in the real gmock *.cc files. -#include "src/gmock-cardinalities.cc" -#include "src/gmock-internal-utils.cc" -#include "src/gmock-matchers.cc" -#include "src/gmock-spec-builders.cc" -#include "src/gmock.cc" diff --git a/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc b/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc deleted file mode 100644 index 92cde3484a..0000000000 --- a/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file implements cardinalities. - -#include "gmock/gmock-cardinalities.h" - -#include <limits.h> - -#include <ostream> // NOLINT -#include <sstream> -#include <string> - -#include "gmock/internal/gmock-internal-utils.h" -#include "gtest/gtest.h" - -namespace testing { - -namespace { - -// Implements the Between(m, n) cardinality. -class BetweenCardinalityImpl : public CardinalityInterface { - public: - BetweenCardinalityImpl(int min, int max) - : min_(min >= 0 ? min : 0), max_(max >= min_ ? max : min_) { - std::stringstream ss; - if (min < 0) { - ss << "The invocation lower bound must be >= 0, " - << "but is actually " << min << "."; - internal::Expect(false, __FILE__, __LINE__, ss.str()); - } else if (max < 0) { - ss << "The invocation upper bound must be >= 0, " - << "but is actually " << max << "."; - internal::Expect(false, __FILE__, __LINE__, ss.str()); - } else if (min > max) { - ss << "The invocation upper bound (" << max - << ") must be >= the invocation lower bound (" << min << ")."; - internal::Expect(false, __FILE__, __LINE__, ss.str()); - } - } - - // Conservative estimate on the lower/upper bound of the number of - // calls allowed. - int ConservativeLowerBound() const override { return min_; } - int ConservativeUpperBound() const override { return max_; } - - bool IsSatisfiedByCallCount(int call_count) const override { - return min_ <= call_count && call_count <= max_; - } - - bool IsSaturatedByCallCount(int call_count) const override { - return call_count >= max_; - } - - void DescribeTo(::std::ostream* os) const override; - - private: - const int min_; - const int max_; - - BetweenCardinalityImpl(const BetweenCardinalityImpl&) = delete; - BetweenCardinalityImpl& operator=(const BetweenCardinalityImpl&) = delete; -}; - -// Formats "n times" in a human-friendly way. -inline std::string FormatTimes(int n) { - if (n == 1) { - return "once"; - } else if (n == 2) { - return "twice"; - } else { - std::stringstream ss; - ss << n << " times"; - return ss.str(); - } -} - -// Describes the Between(m, n) cardinality in human-friendly text. -void BetweenCardinalityImpl::DescribeTo(::std::ostream* os) const { - if (min_ == 0) { - if (max_ == 0) { - *os << "never called"; - } else if (max_ == INT_MAX) { - *os << "called any number of times"; - } else { - *os << "called at most " << FormatTimes(max_); - } - } else if (min_ == max_) { - *os << "called " << FormatTimes(min_); - } else if (max_ == INT_MAX) { - *os << "called at least " << FormatTimes(min_); - } else { - // 0 < min_ < max_ < INT_MAX - *os << "called between " << min_ << " and " << max_ << " times"; - } -} - -} // Unnamed namespace - -// Describes the given call count to an ostream. 
-void Cardinality::DescribeActualCallCountTo(int actual_call_count, - ::std::ostream* os) { - if (actual_call_count > 0) { - *os << "called " << FormatTimes(actual_call_count); - } else { - *os << "never called"; - } -} - -// Creates a cardinality that allows at least n calls. -GTEST_API_ Cardinality AtLeast(int n) { return Between(n, INT_MAX); } - -// Creates a cardinality that allows at most n calls. -GTEST_API_ Cardinality AtMost(int n) { return Between(0, n); } - -// Creates a cardinality that allows any number of calls. -GTEST_API_ Cardinality AnyNumber() { return AtLeast(0); } - -// Creates a cardinality that allows between min and max calls. -GTEST_API_ Cardinality Between(int min, int max) { - return Cardinality(new BetweenCardinalityImpl(min, max)); -} - -// Creates a cardinality that allows exactly n calls. -GTEST_API_ Cardinality Exactly(int n) { return Between(n, n); } - -} // namespace testing diff --git a/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc b/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc deleted file mode 100644 index 0a74841f35..0000000000 --- a/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file defines some utilities useful for implementing Google -// Mock. They are subject to change without notice, so please DO NOT -// USE THEM IN USER CODE. - -#include "gmock/internal/gmock-internal-utils.h" - -#include <ctype.h> - -#include <array> -#include <cctype> -#include <cstdint> -#include <cstring> -#include <ostream> // NOLINT -#include <string> -#include <vector> - -#include "gmock/gmock.h" -#include "gmock/internal/gmock-port.h" -#include "gtest/gtest.h" - -namespace testing { -namespace internal { - -// Joins a vector of strings as if they are fields of a tuple; returns -// the joined string. 
-GTEST_API_ std::string JoinAsKeyValueTuple( - const std::vector<const char*>& names, const Strings& values) { - GTEST_CHECK_(names.size() == values.size()); - if (values.empty()) { - return ""; - } - const auto build_one = [&](const size_t i) { - return std::string(names[i]) + ": " + values[i]; - }; - std::string result = "(" + build_one(0); - for (size_t i = 1; i < values.size(); i++) { - result += ", "; - result += build_one(i); - } - result += ")"; - return result; -} - -// Converts an identifier name to a space-separated list of lower-case -// words. Each maximum substring of the form [A-Za-z][a-z]*|\d+ is -// treated as one word. For example, both "FooBar123" and -// "foo_bar_123" are converted to "foo bar 123". -GTEST_API_ std::string ConvertIdentifierNameToWords(const char* id_name) { - std::string result; - char prev_char = '\0'; - for (const char* p = id_name; *p != '\0'; prev_char = *(p++)) { - // We don't care about the current locale as the input is - // guaranteed to be a valid C++ identifier name. - const bool starts_new_word = IsUpper(*p) || - (!IsAlpha(prev_char) && IsLower(*p)) || - (!IsDigit(prev_char) && IsDigit(*p)); - - if (IsAlNum(*p)) { - if (starts_new_word && result != "") result += ' '; - result += ToLower(*p); - } - } - return result; -} - -// This class reports Google Mock failures as Google Test failures. A -// user can define another class in a similar fashion if they intend to -// use Google Mock with a testing framework other than Google Test. -class GoogleTestFailureReporter : public FailureReporterInterface { - public: - void ReportFailure(FailureType type, const char* file, int line, - const std::string& message) override { - AssertHelper(type == kFatal ? TestPartResult::kFatalFailure - : TestPartResult::kNonFatalFailure, - file, line, message.c_str()) = Message(); - if (type == kFatal) { - posix::Abort(); - } - } -}; - -// Returns the global failure reporter. Will create a -// GoogleTestFailureReporter and return it the first time called. -GTEST_API_ FailureReporterInterface* GetFailureReporter() { - // Points to the global failure reporter used by Google Mock. gcc - // guarantees that the following use of failure_reporter is - // thread-safe. We may need to add additional synchronization to - // protect failure_reporter if we port Google Mock to other - // compilers. - static FailureReporterInterface* const failure_reporter = - new GoogleTestFailureReporter(); - return failure_reporter; -} - -// Protects global resources (stdout in particular) used by Log(). -static GTEST_DEFINE_STATIC_MUTEX_(g_log_mutex); - -// Returns true if and only if a log with the given severity is visible -// according to the --gmock_verbose flag. -GTEST_API_ bool LogIsVisible(LogSeverity severity) { - if (GMOCK_FLAG_GET(verbose) == kInfoVerbosity) { - // Always show the log if --gmock_verbose=info. - return true; - } else if (GMOCK_FLAG_GET(verbose) == kErrorVerbosity) { - // Always hide it if --gmock_verbose=error. - return false; - } else { - // If --gmock_verbose is neither "info" nor "error", we treat it - // as "warning" (its default value). - return severity == kWarning; - } -} - -// Prints the given message to stdout if and only if 'severity' >= the level -// specified by the --gmock_verbose flag. If stack_frames_to_skip >= -// 0, also prints the stack trace excluding the top -// stack_frames_to_skip frames. 
In opt mode, any positive
-// stack_frames_to_skip is treated as 0, since we don't know which
-// function calls will be inlined by the compiler and need to be
-// conservative.
-GTEST_API_ void Log(LogSeverity severity, const std::string& message,
-                    int stack_frames_to_skip) {
-  if (!LogIsVisible(severity)) return;
-
-  // Ensures that logs from different threads don't interleave.
-  MutexLock l(&g_log_mutex);
-
-  if (severity == kWarning) {
-    // Prints a GMOCK WARNING marker to make the warnings easily searchable.
-    std::cout << "\nGMOCK WARNING:";
-  }
-  // Prepends a new-line to message if it doesn't start with one.
-  if (message.empty() || message[0] != '\n') {
-    std::cout << "\n";
-  }
-  std::cout << message;
-  if (stack_frames_to_skip >= 0) {
-#ifdef NDEBUG
-    // In opt mode, we have to be conservative and skip no stack frame.
-    const int actual_to_skip = 0;
-#else
-    // In dbg mode, we can do what the caller tells us to do (plus one
-    // for skipping this function's stack frame).
-    const int actual_to_skip = stack_frames_to_skip + 1;
-#endif  // NDEBUG
-
-    // Appends a new-line to message if it doesn't end with one.
-    if (!message.empty() && *message.rbegin() != '\n') {
-      std::cout << "\n";
-    }
-    std::cout << "Stack trace:\n"
-              << ::testing::internal::GetCurrentOsStackTraceExceptTop(
-                     ::testing::UnitTest::GetInstance(), actual_to_skip);
-  }
-  std::cout << ::std::flush;
-}
-
-GTEST_API_ WithoutMatchers GetWithoutMatchers() { return WithoutMatchers(); }
-
-GTEST_API_ void IllegalDoDefault(const char* file, int line) {
-  internal::Assert(
-      false, file, line,
-      "You are using DoDefault() inside a composite action like "
-      "DoAll() or WithArgs(). This is not supported for technical "
-      "reasons. Please instead spell out the default action, or "
-      "assign the default action to an Action variable and use "
-      "the variable in various places.");
-}
-
-constexpr char UnBase64Impl(char c, const char* const base64, char carry) {
-  return *base64 == 0   ? static_cast<char>(65)
-         : *base64 == c ? carry
-                        : UnBase64Impl(c, base64 + 1, carry + 1);
-}
-
-template <size_t...
I> -constexpr std::array<char, 256> UnBase64Impl(IndexSequence<I...>, - const char* const base64) { - return {{UnBase64Impl(static_cast<char>(I), base64, 0)...}}; -} - -constexpr std::array<char, 256> UnBase64(const char* const base64) { - return UnBase64Impl(MakeIndexSequence<256>{}, base64); -} - -static constexpr char kBase64[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; -static constexpr std::array<char, 256> kUnBase64 = UnBase64(kBase64); - -bool Base64Unescape(const std::string& encoded, std::string* decoded) { - decoded->clear(); - size_t encoded_len = encoded.size(); - decoded->reserve(3 * (encoded_len / 4) + (encoded_len % 4)); - int bit_pos = 0; - char dst = 0; - for (int src : encoded) { - if (std::isspace(src) || src == '=') { - continue; - } - char src_bin = kUnBase64[static_cast<size_t>(src)]; - if (src_bin >= 64) { - decoded->clear(); - return false; - } - if (bit_pos == 0) { - dst |= static_cast<char>(src_bin << 2); - bit_pos = 6; - } else { - dst |= static_cast<char>(src_bin >> (bit_pos - 2)); - decoded->push_back(dst); - dst = static_cast<char>(src_bin << (10 - bit_pos)); - bit_pos = (bit_pos + 6) % 8; - } - } - return true; -} - -} // namespace internal -} // namespace testing diff --git a/third_party/googletest/src/googlemock/src/gmock-matchers.cc b/third_party/googletest/src/googlemock/src/gmock-matchers.cc deleted file mode 100644 index a8d04a6da0..0000000000 --- a/third_party/googletest/src/googlemock/src/gmock-matchers.cc +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file implements Matcher<const string&>, Matcher<string>, and -// utilities for defining matchers. 
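Before the file body, a brief orientation sketch: the matchers implemented below are what back gmock's unordered-container assertions. This is standard gmock/gtest usage; only the test and variable names are invented.

#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ::testing::UnorderedElementsAre;

TEST(UnorderedElementsAreExample, SomePairingExists) {
  std::vector<int> v = {3, 1, 2};
  // Succeeds if and only if some one-to-one pairing of elements to
  // matchers exists; internally this exercises the max-bipartite-matching
  // code defined in this file.
  EXPECT_THAT(v, UnorderedElementsAre(1, 2, 3));
}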
- -#include "gmock/gmock-matchers.h" - -#include <string.h> - -#include <iostream> -#include <sstream> -#include <string> -#include <vector> - -namespace testing { -namespace internal { - -// Returns the description for a matcher defined using the MATCHER*() -// macro where the user-supplied description string is "", if -// 'negation' is false; otherwise returns the description of the -// negation of the matcher. 'param_values' contains a list of strings -// that are the print-out of the matcher's parameters. -GTEST_API_ std::string FormatMatcherDescription( - bool negation, const char* matcher_name, - const std::vector<const char*>& param_names, const Strings& param_values) { - std::string result = ConvertIdentifierNameToWords(matcher_name); - if (param_values.size() >= 1) { - result += " " + JoinAsKeyValueTuple(param_names, param_values); - } - return negation ? "not (" + result + ")" : result; -} - -// FindMaxBipartiteMatching and its helper class. -// -// Uses the well-known Ford-Fulkerson max flow method to find a maximum -// bipartite matching. Flow is considered to be from left to right. -// There is an implicit source node that is connected to all of the left -// nodes, and an implicit sink node that is connected to all of the -// right nodes. All edges have unit capacity. -// -// Neither the flow graph nor the residual flow graph are represented -// explicitly. Instead, they are implied by the information in 'graph' and -// a vector<int> called 'left_' whose elements are initialized to the -// value kUnused. This represents the initial state of the algorithm, -// where the flow graph is empty, and the residual flow graph has the -// following edges: -// - An edge from source to each left_ node -// - An edge from each right_ node to sink -// - An edge from each left_ node to each right_ node, if the -// corresponding edge exists in 'graph'. -// -// When the TryAugment() method adds a flow, it sets left_[l] = r for some -// nodes l and r. This induces the following changes: -// - The edges (source, l), (l, r), and (r, sink) are added to the -// flow graph. -// - The same three edges are removed from the residual flow graph. -// - The reverse edges (l, source), (r, l), and (sink, r) are added -// to the residual flow graph, which is a directional graph -// representing unused flow capacity. -// -// When the method augments a flow (moving left_[l] from some r1 to some -// other r2), this can be thought of as "undoing" the above steps with -// respect to r1 and "redoing" them with respect to r2. -// -// It bears repeating that the flow graph and residual flow graph are -// never represented explicitly, but can be derived by looking at the -// information in 'graph' and in left_. -// -// As an optimization, there is a second vector<int> called right_ which -// does not provide any new information. Instead, it enables more -// efficient queries about edges entering or leaving the right-side nodes -// of the flow or residual flow graphs. The following invariants are -// maintained: -// -// left[l] == kUnused or right[left[l]] == l -// right[r] == kUnused or left[right[r]] == r -// -// . [ source ] . -// . ||| . -// . ||| . -// . ||\--> left[0]=1 ---\ right[0]=-1 ----\ . -// . || | | . -// . |\---> left[1]=-1 \--> right[1]=0 ---\| . -// . | || . -// . \----> left[2]=2 ------> right[2]=2 --\|| . -// . ||| . -// . elements matchers vvv . -// . [ sink ] . -// -// See Also: -// [1] Cormen, et al (2001). "Section 26.2: The Ford-Fulkerson method". 
-// "Introduction to Algorithms (Second ed.)", pp. 651-664. -// [2] "Ford-Fulkerson algorithm", Wikipedia, -// 'http://en.wikipedia.org/wiki/Ford%E2%80%93Fulkerson_algorithm' -class MaxBipartiteMatchState { - public: - explicit MaxBipartiteMatchState(const MatchMatrix& graph) - : graph_(&graph), - left_(graph_->LhsSize(), kUnused), - right_(graph_->RhsSize(), kUnused) {} - - // Returns the edges of a maximal match, each in the form {left, right}. - ElementMatcherPairs Compute() { - // 'seen' is used for path finding { 0: unseen, 1: seen }. - ::std::vector<char> seen; - // Searches the residual flow graph for a path from each left node to - // the sink in the residual flow graph, and if one is found, add flow - // to the graph. It's okay to search through the left nodes once. The - // edge from the implicit source node to each previously-visited left - // node will have flow if that left node has any path to the sink - // whatsoever. Subsequent augmentations can only add flow to the - // network, and cannot take away that previous flow unit from the source. - // Since the source-to-left edge can only carry one flow unit (or, - // each element can be matched to only one matcher), there is no need - // to visit the left nodes more than once looking for augmented paths. - // The flow is known to be possible or impossible by looking at the - // node once. - for (size_t ilhs = 0; ilhs < graph_->LhsSize(); ++ilhs) { - // Reset the path-marking vector and try to find a path from - // source to sink starting at the left_[ilhs] node. - GTEST_CHECK_(left_[ilhs] == kUnused) - << "ilhs: " << ilhs << ", left_[ilhs]: " << left_[ilhs]; - // 'seen' initialized to 'graph_->RhsSize()' copies of 0. - seen.assign(graph_->RhsSize(), 0); - TryAugment(ilhs, &seen); - } - ElementMatcherPairs result; - for (size_t ilhs = 0; ilhs < left_.size(); ++ilhs) { - size_t irhs = left_[ilhs]; - if (irhs == kUnused) continue; - result.push_back(ElementMatcherPair(ilhs, irhs)); - } - return result; - } - - private: - static const size_t kUnused = static_cast<size_t>(-1); - - // Perform a depth-first search from left node ilhs to the sink. If a - // path is found, flow is added to the network by linking the left and - // right vector elements corresponding each segment of the path. - // Returns true if a path to sink was found, which means that a unit of - // flow was added to the network. The 'seen' vector elements correspond - // to right nodes and are marked to eliminate cycles from the search. - // - // Left nodes will only be explored at most once because they - // are accessible from at most one right node in the residual flow - // graph. - // - // Note that left_[ilhs] is the only element of left_ that TryAugment will - // potentially transition from kUnused to another value. Any other - // left_ element holding kUnused before TryAugment will be holding it - // when TryAugment returns. - // - bool TryAugment(size_t ilhs, ::std::vector<char>* seen) { - for (size_t irhs = 0; irhs < graph_->RhsSize(); ++irhs) { - if ((*seen)[irhs]) continue; - if (!graph_->HasEdge(ilhs, irhs)) continue; - // There's an available edge from ilhs to irhs. - (*seen)[irhs] = 1; - // Next a search is performed to determine whether - // this edge is a dead end or leads to the sink. - // - // right_[irhs] == kUnused means that there is residual flow from - // right node irhs to the sink, so we can use that to finish this - // flow path and return success. - // - // Otherwise there is residual flow to some ilhs. 
We push flow - // along that path and call ourselves recursively to see if this - // ultimately leads to sink. - if (right_[irhs] == kUnused || TryAugment(right_[irhs], seen)) { - // Add flow from left_[ilhs] to right_[irhs]. - left_[ilhs] = irhs; - right_[irhs] = ilhs; - return true; - } - } - return false; - } - - const MatchMatrix* graph_; // not owned - // Each element of the left_ vector represents a left hand side node - // (i.e. an element) and each element of right_ is a right hand side - // node (i.e. a matcher). The values in the left_ vector indicate - // outflow from that node to a node on the right_ side. The values - // in the right_ indicate inflow, and specify which left_ node is - // feeding that right_ node, if any. For example, left_[3] == 1 means - // there's a flow from element #3 to matcher #1. Such a flow would also - // be redundantly represented in the right_ vector as right_[1] == 3. - // Elements of left_ and right_ are either kUnused or mutually - // referent. Mutually referent means that left_[right_[i]] = i and - // right_[left_[i]] = i. - ::std::vector<size_t> left_; - ::std::vector<size_t> right_; -}; - -const size_t MaxBipartiteMatchState::kUnused; - -GTEST_API_ ElementMatcherPairs FindMaxBipartiteMatching(const MatchMatrix& g) { - return MaxBipartiteMatchState(g).Compute(); -} - -static void LogElementMatcherPairVec(const ElementMatcherPairs& pairs, - ::std::ostream* stream) { - typedef ElementMatcherPairs::const_iterator Iter; - ::std::ostream& os = *stream; - os << "{"; - const char* sep = ""; - for (Iter it = pairs.begin(); it != pairs.end(); ++it) { - os << sep << "\n (" - << "element #" << it->first << ", " - << "matcher #" << it->second << ")"; - sep = ","; - } - os << "\n}"; -} - -bool MatchMatrix::NextGraph() { - for (size_t ilhs = 0; ilhs < LhsSize(); ++ilhs) { - for (size_t irhs = 0; irhs < RhsSize(); ++irhs) { - char& b = matched_[SpaceIndex(ilhs, irhs)]; - if (!b) { - b = 1; - return true; - } - b = 0; - } - } - return false; -} - -void MatchMatrix::Randomize() { - for (size_t ilhs = 0; ilhs < LhsSize(); ++ilhs) { - for (size_t irhs = 0; irhs < RhsSize(); ++irhs) { - char& b = matched_[SpaceIndex(ilhs, irhs)]; - b = static_cast<char>(rand() & 1); // NOLINT - } - } -} - -std::string MatchMatrix::DebugString() const { - ::std::stringstream ss; - const char* sep = ""; - for (size_t i = 0; i < LhsSize(); ++i) { - ss << sep; - for (size_t j = 0; j < RhsSize(); ++j) { - ss << HasEdge(i, j); - } - sep = ";"; - } - return ss.str(); -} - -void UnorderedElementsAreMatcherImplBase::DescribeToImpl( - ::std::ostream* os) const { - switch (match_flags()) { - case UnorderedMatcherRequire::ExactMatch: - if (matcher_describers_.empty()) { - *os << "is empty"; - return; - } - if (matcher_describers_.size() == 1) { - *os << "has " << Elements(1) << " and that element "; - matcher_describers_[0]->DescribeTo(os); - return; - } - *os << "has " << Elements(matcher_describers_.size()) - << " and there exists some permutation of elements such that:\n"; - break; - case UnorderedMatcherRequire::Superset: - *os << "a surjection from elements to requirements exists such that:\n"; - break; - case UnorderedMatcherRequire::Subset: - *os << "an injection from elements to requirements exists such that:\n"; - break; - } - - const char* sep = ""; - for (size_t i = 0; i != matcher_describers_.size(); ++i) { - *os << sep; - if (match_flags() == UnorderedMatcherRequire::ExactMatch) { - *os << " - element #" << i << " "; - } else { - *os << " - an element "; - } - 
matcher_describers_[i]->DescribeTo(os); - if (match_flags() == UnorderedMatcherRequire::ExactMatch) { - sep = ", and\n"; - } else { - sep = "\n"; - } - } -} - -void UnorderedElementsAreMatcherImplBase::DescribeNegationToImpl( - ::std::ostream* os) const { - switch (match_flags()) { - case UnorderedMatcherRequire::ExactMatch: - if (matcher_describers_.empty()) { - *os << "isn't empty"; - return; - } - if (matcher_describers_.size() == 1) { - *os << "doesn't have " << Elements(1) << ", or has " << Elements(1) - << " that "; - matcher_describers_[0]->DescribeNegationTo(os); - return; - } - *os << "doesn't have " << Elements(matcher_describers_.size()) - << ", or there exists no permutation of elements such that:\n"; - break; - case UnorderedMatcherRequire::Superset: - *os << "no surjection from elements to requirements exists such that:\n"; - break; - case UnorderedMatcherRequire::Subset: - *os << "no injection from elements to requirements exists such that:\n"; - break; - } - const char* sep = ""; - for (size_t i = 0; i != matcher_describers_.size(); ++i) { - *os << sep; - if (match_flags() == UnorderedMatcherRequire::ExactMatch) { - *os << " - element #" << i << " "; - } else { - *os << " - an element "; - } - matcher_describers_[i]->DescribeTo(os); - if (match_flags() == UnorderedMatcherRequire::ExactMatch) { - sep = ", and\n"; - } else { - sep = "\n"; - } - } -} - -// Checks that all matchers match at least one element, and that all -// elements match at least one matcher. This enables faster matching -// and better error reporting. -// Returns false, writing an explanation to 'listener', if and only -// if the success criteria are not met. -bool UnorderedElementsAreMatcherImplBase::VerifyMatchMatrix( - const ::std::vector<std::string>& element_printouts, - const MatchMatrix& matrix, MatchResultListener* listener) const { - bool result = true; - ::std::vector<char> element_matched(matrix.LhsSize(), 0); - ::std::vector<char> matcher_matched(matrix.RhsSize(), 0); - - for (size_t ilhs = 0; ilhs < matrix.LhsSize(); ilhs++) { - for (size_t irhs = 0; irhs < matrix.RhsSize(); irhs++) { - char matched = matrix.HasEdge(ilhs, irhs); - element_matched[ilhs] |= matched; - matcher_matched[irhs] |= matched; - } - } - - if (match_flags() & UnorderedMatcherRequire::Superset) { - const char* sep = - "where the following matchers don't match any elements:\n"; - for (size_t mi = 0; mi < matcher_matched.size(); ++mi) { - if (matcher_matched[mi]) continue; - result = false; - if (listener->IsInterested()) { - *listener << sep << "matcher #" << mi << ": "; - matcher_describers_[mi]->DescribeTo(listener->stream()); - sep = ",\n"; - } - } - } - - if (match_flags() & UnorderedMatcherRequire::Subset) { - const char* sep = - "where the following elements don't match any matchers:\n"; - const char* outer_sep = ""; - if (!result) { - outer_sep = "\nand "; - } - for (size_t ei = 0; ei < element_matched.size(); ++ei) { - if (element_matched[ei]) continue; - result = false; - if (listener->IsInterested()) { - *listener << outer_sep << sep << "element #" << ei << ": " - << element_printouts[ei]; - sep = ",\n"; - outer_sep = ""; - } - } - } - return result; -} - -bool UnorderedElementsAreMatcherImplBase::FindPairing( - const MatchMatrix& matrix, MatchResultListener* listener) const { - ElementMatcherPairs matches = FindMaxBipartiteMatching(matrix); - - size_t max_flow = matches.size(); - if ((match_flags() & UnorderedMatcherRequire::Superset) && - max_flow < matrix.RhsSize()) { - if (listener->IsInterested()) { - 
*listener << "where no permutation of the elements can satisfy all " - "matchers, and the closest match is " - << max_flow << " of " << matrix.RhsSize() - << " matchers with the pairings:\n"; - LogElementMatcherPairVec(matches, listener->stream()); - } - return false; - } - if ((match_flags() & UnorderedMatcherRequire::Subset) && - max_flow < matrix.LhsSize()) { - if (listener->IsInterested()) { - *listener - << "where not all elements can be matched, and the closest match is " - << max_flow << " of " << matrix.RhsSize() - << " matchers with the pairings:\n"; - LogElementMatcherPairVec(matches, listener->stream()); - } - return false; - } - - if (matches.size() > 1) { - if (listener->IsInterested()) { - const char* sep = "where:\n"; - for (size_t mi = 0; mi < matches.size(); ++mi) { - *listener << sep << " - element #" << matches[mi].first - << " is matched by matcher #" << matches[mi].second; - sep = ",\n"; - } - } - } - return true; -} - -} // namespace internal -} // namespace testing diff --git a/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc b/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc deleted file mode 100644 index 658ad3fa22..0000000000 --- a/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc +++ /dev/null @@ -1,781 +0,0 @@ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Google Mock - a framework for writing C++ mock classes. -// -// This file implements the spec builder syntax (ON_CALL and -// EXPECT_CALL). 
- -#include "gmock/gmock-spec-builders.h" - -#include <stdlib.h> - -#include <iostream> // NOLINT -#include <map> -#include <memory> -#include <set> -#include <string> -#include <unordered_map> -#include <vector> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "gtest/internal/gtest-port.h" - -#if GTEST_OS_CYGWIN || GTEST_OS_LINUX || GTEST_OS_MAC -#include <unistd.h> // NOLINT -#endif - -// Silence C4800 (C4800: 'int *const ': forcing value -// to bool 'true' or 'false') for MSVC 15 -#ifdef _MSC_VER -#if _MSC_VER == 1900 -#pragma warning(push) -#pragma warning(disable : 4800) -#endif -#endif - -namespace testing { -namespace internal { - -// Protects the mock object registry (in class Mock), all function -// mockers, and all expectations. -GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_gmock_mutex); - -// Logs a message including file and line number information. -GTEST_API_ void LogWithLocation(testing::internal::LogSeverity severity, - const char* file, int line, - const std::string& message) { - ::std::ostringstream s; - s << internal::FormatFileLocation(file, line) << " " << message - << ::std::endl; - Log(severity, s.str(), 0); -} - -// Constructs an ExpectationBase object. -ExpectationBase::ExpectationBase(const char* a_file, int a_line, - const std::string& a_source_text) - : file_(a_file), - line_(a_line), - source_text_(a_source_text), - cardinality_specified_(false), - cardinality_(Exactly(1)), - call_count_(0), - retired_(false), - extra_matcher_specified_(false), - repeated_action_specified_(false), - retires_on_saturation_(false), - last_clause_(kNone), - action_count_checked_(false) {} - -// Destructs an ExpectationBase object. -ExpectationBase::~ExpectationBase() {} - -// Explicitly specifies the cardinality of this expectation. Used by -// the subclasses to implement the .Times() clause. -void ExpectationBase::SpecifyCardinality(const Cardinality& a_cardinality) { - cardinality_specified_ = true; - cardinality_ = a_cardinality; -} - -// Retires all pre-requisites of this expectation. -void ExpectationBase::RetireAllPreRequisites() - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - if (is_retired()) { - // We can take this short-cut as we never retire an expectation - // until we have retired all its pre-requisites. - return; - } - - ::std::vector<ExpectationBase*> expectations(1, this); - while (!expectations.empty()) { - ExpectationBase* exp = expectations.back(); - expectations.pop_back(); - - for (ExpectationSet::const_iterator it = - exp->immediate_prerequisites_.begin(); - it != exp->immediate_prerequisites_.end(); ++it) { - ExpectationBase* next = it->expectation_base().get(); - if (!next->is_retired()) { - next->Retire(); - expectations.push_back(next); - } - } - } -} - -// Returns true if and only if all pre-requisites of this expectation -// have been satisfied. -bool ExpectationBase::AllPrerequisitesAreSatisfied() const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - ::std::vector<const ExpectationBase*> expectations(1, this); - while (!expectations.empty()) { - const ExpectationBase* exp = expectations.back(); - expectations.pop_back(); - - for (ExpectationSet::const_iterator it = - exp->immediate_prerequisites_.begin(); - it != exp->immediate_prerequisites_.end(); ++it) { - const ExpectationBase* next = it->expectation_base().get(); - if (!next->IsSatisfied()) return false; - expectations.push_back(next); - } - } - return true; -} - -// Adds unsatisfied pre-requisites of this expectation to 'result'. 
-void ExpectationBase::FindUnsatisfiedPrerequisites(ExpectationSet* result) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - ::std::vector<const ExpectationBase*> expectations(1, this); - while (!expectations.empty()) { - const ExpectationBase* exp = expectations.back(); - expectations.pop_back(); - - for (ExpectationSet::const_iterator it = - exp->immediate_prerequisites_.begin(); - it != exp->immediate_prerequisites_.end(); ++it) { - const ExpectationBase* next = it->expectation_base().get(); - - if (next->IsSatisfied()) { - // If *it is satisfied and has a call count of 0, some of its - // pre-requisites may not be satisfied yet. - if (next->call_count_ == 0) { - expectations.push_back(next); - } - } else { - // Now that we know next is unsatisfied, we are not so interested - // in whether its pre-requisites are satisfied. Therefore we - // don't iterate into it here. - *result += *it; - } - } - } -} - -// Describes how many times a function call matching this -// expectation has occurred. -void ExpectationBase::DescribeCallCountTo(::std::ostream* os) const - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - - // Describes how many times the function is expected to be called. - *os << " Expected: to be "; - cardinality().DescribeTo(os); - *os << "\n Actual: "; - Cardinality::DescribeActualCallCountTo(call_count(), os); - - // Describes the state of the expectation (e.g. is it satisfied? - // is it active?). - *os << " - " - << (IsOverSaturated() ? "over-saturated" - : IsSaturated() ? "saturated" - : IsSatisfied() ? "satisfied" - : "unsatisfied") - << " and " << (is_retired() ? "retired" : "active"); -} - -// Checks the action count (i.e. the number of WillOnce() and -// WillRepeatedly() clauses) against the cardinality if this hasn't -// been done before. Prints a warning if there are too many or too -// few actions. -void ExpectationBase::CheckActionCountIfNotDone() const - GTEST_LOCK_EXCLUDED_(mutex_) { - bool should_check = false; - { - MutexLock l(&mutex_); - if (!action_count_checked_) { - action_count_checked_ = true; - should_check = true; - } - } - - if (should_check) { - if (!cardinality_specified_) { - // The cardinality was inferred - no need to check the action - // count against it. - return; - } - - // The cardinality was explicitly specified. - const int action_count = static_cast<int>(untyped_actions_.size()); - const int upper_bound = cardinality().ConservativeUpperBound(); - const int lower_bound = cardinality().ConservativeLowerBound(); - bool too_many; // True if there are too many actions, or false - // if there are too few. - if (action_count > upper_bound || - (action_count == upper_bound && repeated_action_specified_)) { - too_many = true; - } else if (0 < action_count && action_count < lower_bound && - !repeated_action_specified_) { - too_many = false; - } else { - return; - } - - ::std::stringstream ss; - DescribeLocationTo(&ss); - ss << "Too " << (too_many ? "many" : "few") << " actions specified in " - << source_text() << "...\n" - << "Expected to be "; - cardinality().DescribeTo(&ss); - ss << ", but has " << (too_many ? "" : "only ") << action_count - << " WillOnce()" << (action_count == 1 ? "" : "s"); - if (repeated_action_specified_) { - ss << " and a WillRepeatedly()"; - } - ss << "."; - Log(kWarning, ss.str(), -1); // -1 means "don't print stack trace". - } -} - -// Implements the .Times() clause. 
-void ExpectationBase::UntypedTimes(const Cardinality& a_cardinality) { - if (last_clause_ == kTimes) { - ExpectSpecProperty(false, - ".Times() cannot appear " - "more than once in an EXPECT_CALL()."); - } else { - ExpectSpecProperty( - last_clause_ < kTimes, - ".Times() may only appear *before* .InSequence(), .WillOnce(), " - ".WillRepeatedly(), or .RetiresOnSaturation(), not after."); - } - last_clause_ = kTimes; - - SpecifyCardinality(a_cardinality); -} - -// Points to the implicit sequence introduced by a living InSequence -// object (if any) in the current thread or NULL. -GTEST_API_ ThreadLocal<Sequence*> g_gmock_implicit_sequence; - -// Reports an uninteresting call (whose description is in msg) in the -// manner specified by 'reaction'. -void ReportUninterestingCall(CallReaction reaction, const std::string& msg) { - // Include a stack trace only if --gmock_verbose=info is specified. - const int stack_frames_to_skip = - GMOCK_FLAG_GET(verbose) == kInfoVerbosity ? 3 : -1; - switch (reaction) { - case kAllow: - Log(kInfo, msg, stack_frames_to_skip); - break; - case kWarn: - Log(kWarning, - msg + - "\nNOTE: You can safely ignore the above warning unless this " - "call should not happen. Do not suppress it by blindly adding " - "an EXPECT_CALL() if you don't mean to enforce the call. " - "See " - "https://github.com/google/googletest/blob/master/docs/" - "gmock_cook_book.md#" - "knowing-when-to-expect for details.\n", - stack_frames_to_skip); - break; - default: // FAIL - Expect(false, nullptr, -1, msg); - } -} - -UntypedFunctionMockerBase::UntypedFunctionMockerBase() - : mock_obj_(nullptr), name_("") {} - -UntypedFunctionMockerBase::~UntypedFunctionMockerBase() {} - -// Sets the mock object this mock method belongs to, and registers -// this information in the global mock registry. Will be called -// whenever an EXPECT_CALL() or ON_CALL() is executed on this mock -// method. -void UntypedFunctionMockerBase::RegisterOwner(const void* mock_obj) - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - { - MutexLock l(&g_gmock_mutex); - mock_obj_ = mock_obj; - } - Mock::Register(mock_obj, this); -} - -// Sets the mock object this mock method belongs to, and sets the name -// of the mock function. Will be called upon each invocation of this -// mock function. -void UntypedFunctionMockerBase::SetOwnerAndName(const void* mock_obj, - const char* name) - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - // We protect name_ under g_gmock_mutex in case this mock function - // is called from two threads concurrently. - MutexLock l(&g_gmock_mutex); - mock_obj_ = mock_obj; - name_ = name; -} - -// Returns the name of the function being mocked. Must be called -// after RegisterOwner() or SetOwnerAndName() has been called. -const void* UntypedFunctionMockerBase::MockObject() const - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - const void* mock_obj; - { - // We protect mock_obj_ under g_gmock_mutex in case this mock - // function is called from two threads concurrently. - MutexLock l(&g_gmock_mutex); - Assert(mock_obj_ != nullptr, __FILE__, __LINE__, - "MockObject() must not be called before RegisterOwner() or " - "SetOwnerAndName() has been called."); - mock_obj = mock_obj_; - } - return mock_obj; -} - -// Returns the name of this mock method. Must be called after -// SetOwnerAndName() has been called. -const char* UntypedFunctionMockerBase::Name() const - GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { - const char* name; - { - // We protect name_ under g_gmock_mutex in case this mock - // function is called from two threads concurrently. 
- MutexLock l(&g_gmock_mutex); - Assert(name_ != nullptr, __FILE__, __LINE__, - "Name() must not be called before SetOwnerAndName() has " - "been called."); - name = name_; - } - return name; -} - -// Returns an Expectation object that references and co-owns exp, -// which must be an expectation on this mock function. -Expectation UntypedFunctionMockerBase::GetHandleOf(ExpectationBase* exp) { - // See the definition of untyped_expectations_ for why access to it - // is unprotected here. - for (UntypedExpectations::const_iterator it = untyped_expectations_.begin(); - it != untyped_expectations_.end(); ++it) { - if (it->get() == exp) { - return Expectation(*it); - } - } - - Assert(false, __FILE__, __LINE__, "Cannot find expectation."); - return Expectation(); - // The above statement is just to make the code compile, and will - // never be executed. -} - -// Verifies that all expectations on this mock function have been -// satisfied. Reports one or more Google Test non-fatal failures -// and returns false if not. -bool UntypedFunctionMockerBase::VerifyAndClearExpectationsLocked() - GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { - g_gmock_mutex.AssertHeld(); - bool expectations_met = true; - for (UntypedExpectations::const_iterator it = untyped_expectations_.begin(); - it != untyped_expectations_.end(); ++it) { - ExpectationBase* const untyped_expectation = it->get(); - if (untyped_expectation->IsOverSaturated()) { - // There was an upper-bound violation. Since the error was - // already reported when it occurred, there is no need to do - // anything here. - expectations_met = false; - } else if (!untyped_expectation->IsSatisfied()) { - expectations_met = false; - ::std::stringstream ss; - ss << "Actual function call count doesn't match " - << untyped_expectation->source_text() << "...\n"; - // No need to show the source file location of the expectation - // in the description, as the Expect() call that follows already - // takes care of it. - untyped_expectation->MaybeDescribeExtraMatcherTo(&ss); - untyped_expectation->DescribeCallCountTo(&ss); - Expect(false, untyped_expectation->file(), untyped_expectation->line(), - ss.str()); - } - } - - // Deleting our expectations may trigger other mock objects to be deleted, for - // example if an action contains a reference counted smart pointer to that - // mock object, and that is the last reference. So if we delete our - // expectations within the context of the global mutex we may deadlock when - // this method is called again. Instead, make a copy of the set of - // expectations to delete, clear our set within the mutex, and then clear the - // copied set outside of it. - UntypedExpectations expectations_to_delete; - untyped_expectations_.swap(expectations_to_delete); - - g_gmock_mutex.Unlock(); - expectations_to_delete.clear(); - g_gmock_mutex.Lock(); - - return expectations_met; -} - -CallReaction intToCallReaction(int mock_behavior) { - if (mock_behavior >= kAllow && mock_behavior <= kFail) { - return static_cast<internal::CallReaction>(mock_behavior); - } - return kWarn; -} - -} // namespace internal - -// Class Mock. - -namespace { - -typedef std::set<internal::UntypedFunctionMockerBase*> FunctionMockers; - -// The current state of a mock object. Such information is needed for -// detecting leaked mock objects and explicitly verifying a mock's -// expectations. 
-struct MockObjectState { - MockObjectState() - : first_used_file(nullptr), first_used_line(-1), leakable(false) {} - - // Where in the source file an ON_CALL or EXPECT_CALL is first - // invoked on this mock object. - const char* first_used_file; - int first_used_line; - ::std::string first_used_test_suite; - ::std::string first_used_test; - bool leakable; // true if and only if it's OK to leak the object. - FunctionMockers function_mockers; // All registered methods of the object. -}; - -// A global registry holding the state of all mock objects that are -// alive. A mock object is added to this registry the first time -// Mock::AllowLeak(), ON_CALL(), or EXPECT_CALL() is called on it. It -// is removed from the registry in the mock object's destructor. -class MockObjectRegistry { - public: - // Maps a mock object (identified by its address) to its state. - typedef std::map<const void*, MockObjectState> StateMap; - - // This destructor will be called when a program exits, after all - // tests in it have been run. By then, there should be no mock - // object alive. Therefore we report any living object as test - // failure, unless the user explicitly asked us to ignore it. - ~MockObjectRegistry() { - if (!GMOCK_FLAG_GET(catch_leaked_mocks)) return; - - int leaked_count = 0; - for (StateMap::const_iterator it = states_.begin(); it != states_.end(); - ++it) { - if (it->second.leakable) // The user said it's fine to leak this object. - continue; - - // FIXME: Print the type of the leaked object. - // This can help the user identify the leaked object. - std::cout << "\n"; - const MockObjectState& state = it->second; - std::cout << internal::FormatFileLocation(state.first_used_file, - state.first_used_line); - std::cout << " ERROR: this mock object"; - if (state.first_used_test != "") { - std::cout << " (used in test " << state.first_used_test_suite << "." - << state.first_used_test << ")"; - } - std::cout << " should be deleted but never is. Its address is @" - << it->first << "."; - leaked_count++; - } - if (leaked_count > 0) { - std::cout << "\nERROR: " << leaked_count << " leaked mock " - << (leaked_count == 1 ? "object" : "objects") - << " found at program exit. Expectations on a mock object are " - "verified when the object is destructed. Leaking a mock " - "means that its expectations aren't verified, which is " - "usually a test bug. If you really intend to leak a mock, " - "you can suppress this error using " - "testing::Mock::AllowLeak(mock_object), or you may use a " - "fake or stub instead of a mock.\n"; - std::cout.flush(); - ::std::cerr.flush(); - // RUN_ALL_TESTS() has already returned when this destructor is - // called. Therefore we cannot use the normal Google Test - // failure reporting mechanism. - _exit(1); // We cannot call exit() as it is not reentrant and - // may already have been called. - } - } - - StateMap& states() { return states_; } - - private: - StateMap states_; -}; - -// Protected by g_gmock_mutex. -MockObjectRegistry g_mock_object_registry; - -// Maps a mock object to the reaction Google Mock should have when an -// uninteresting method is called. Protected by g_gmock_mutex. -std::unordered_map<uintptr_t, internal::CallReaction>& -UninterestingCallReactionMap() { - static auto* map = new std::unordered_map<uintptr_t, internal::CallReaction>; - return *map; -} - -// Sets the reaction Google Mock should have when an uninteresting -// method of the given mock object is called. 
-void SetReactionOnUninterestingCalls(uintptr_t mock_obj, - internal::CallReaction reaction) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - internal::MutexLock l(&internal::g_gmock_mutex); - UninterestingCallReactionMap()[mock_obj] = reaction; -} - -} // namespace - -// Tells Google Mock to allow uninteresting calls on the given mock -// object. -void Mock::AllowUninterestingCalls(uintptr_t mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - SetReactionOnUninterestingCalls(mock_obj, internal::kAllow); -} - -// Tells Google Mock to warn the user about uninteresting calls on the -// given mock object. -void Mock::WarnUninterestingCalls(uintptr_t mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - SetReactionOnUninterestingCalls(mock_obj, internal::kWarn); -} - -// Tells Google Mock to fail uninteresting calls on the given mock -// object. -void Mock::FailUninterestingCalls(uintptr_t mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - SetReactionOnUninterestingCalls(mock_obj, internal::kFail); -} - -// Tells Google Mock the given mock object is being destroyed and its -// entry in the call-reaction table should be removed. -void Mock::UnregisterCallReaction(uintptr_t mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - internal::MutexLock l(&internal::g_gmock_mutex); - UninterestingCallReactionMap().erase(static_cast<uintptr_t>(mock_obj)); -} - -// Returns the reaction Google Mock will have on uninteresting calls -// made on the given mock object. -internal::CallReaction Mock::GetReactionOnUninterestingCalls( - const void* mock_obj) GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - internal::MutexLock l(&internal::g_gmock_mutex); - return (UninterestingCallReactionMap().count( - reinterpret_cast<uintptr_t>(mock_obj)) == 0) - ? internal::intToCallReaction( - GMOCK_FLAG_GET(default_mock_behavior)) - : UninterestingCallReactionMap()[reinterpret_cast<uintptr_t>( - mock_obj)]; -} - -// Tells Google Mock to ignore mock_obj when checking for leaked mock -// objects. -void Mock::AllowLeak(const void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - internal::MutexLock l(&internal::g_gmock_mutex); - g_mock_object_registry.states()[mock_obj].leakable = true; -} - -// Verifies and clears all expectations on the given mock object. If -// the expectations aren't satisfied, generates one or more Google -// Test non-fatal failures and returns false. -bool Mock::VerifyAndClearExpectations(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - internal::MutexLock l(&internal::g_gmock_mutex); - return VerifyAndClearExpectationsLocked(mock_obj); -} - -// Verifies all expectations on the given mock object and clears its -// default actions and expectations. Returns true if and only if the -// verification was successful. -bool Mock::VerifyAndClear(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - internal::MutexLock l(&internal::g_gmock_mutex); - ClearDefaultActionsLocked(mock_obj); - return VerifyAndClearExpectationsLocked(mock_obj); -} - -// Verifies and clears all expectations on the given mock object. If -// the expectations aren't satisfied, generates one or more Google -// Test non-fatal failures and returns false. -bool Mock::VerifyAndClearExpectationsLocked(void* mock_obj) - GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) { - internal::g_gmock_mutex.AssertHeld(); - if (g_mock_object_registry.states().count(mock_obj) == 0) { - // No EXPECT_CALL() was set on the given mock object. 
- return true; - } - - // Verifies and clears the expectations on each mock method in the - // given mock object. - bool expectations_met = true; - FunctionMockers& mockers = - g_mock_object_registry.states()[mock_obj].function_mockers; - for (FunctionMockers::const_iterator it = mockers.begin(); - it != mockers.end(); ++it) { - if (!(*it)->VerifyAndClearExpectationsLocked()) { - expectations_met = false; - } - } - - // We don't clear the content of mockers, as they may still be - // needed by ClearDefaultActionsLocked(). - return expectations_met; -} - -bool Mock::IsNaggy(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - return Mock::GetReactionOnUninterestingCalls(mock_obj) == internal::kWarn; -} -bool Mock::IsNice(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - return Mock::GetReactionOnUninterestingCalls(mock_obj) == internal::kAllow; -} -bool Mock::IsStrict(void* mock_obj) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - return Mock::GetReactionOnUninterestingCalls(mock_obj) == internal::kFail; -} - -// Registers a mock object and a mock method it owns. -void Mock::Register(const void* mock_obj, - internal::UntypedFunctionMockerBase* mocker) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - internal::MutexLock l(&internal::g_gmock_mutex); - g_mock_object_registry.states()[mock_obj].function_mockers.insert(mocker); -} - -// Tells Google Mock where in the source code mock_obj is used in an -// ON_CALL or EXPECT_CALL. In case mock_obj is leaked, this -// information helps the user identify which object it is. -void Mock::RegisterUseByOnCallOrExpectCall(const void* mock_obj, - const char* file, int line) - GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { - internal::MutexLock l(&internal::g_gmock_mutex); - MockObjectState& state = g_mock_object_registry.states()[mock_obj]; - if (state.first_used_file == nullptr) { - state.first_used_file = file; - state.first_used_line = line; - const TestInfo* const test_info = - UnitTest::GetInstance()->current_test_info(); - if (test_info != nullptr) { - state.first_used_test_suite = test_info->test_suite_name(); - state.first_used_test = test_info->name(); - } - } -} - -// Unregisters a mock method; removes the owning mock object from the -// registry when the last mock method associated with it has been -// unregistered. This is called only in the destructor of -// FunctionMockerBase. -void Mock::UnregisterLocked(internal::UntypedFunctionMockerBase* mocker) - GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) { - internal::g_gmock_mutex.AssertHeld(); - for (MockObjectRegistry::StateMap::iterator it = - g_mock_object_registry.states().begin(); - it != g_mock_object_registry.states().end(); ++it) { - FunctionMockers& mockers = it->second.function_mockers; - if (mockers.erase(mocker) > 0) { - // mocker was in mockers and has been just removed. - if (mockers.empty()) { - g_mock_object_registry.states().erase(it); - } - return; - } - } -} - -// Clears all ON_CALL()s set on the given mock object. -void Mock::ClearDefaultActionsLocked(void* mock_obj) - GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) { - internal::g_gmock_mutex.AssertHeld(); - - if (g_mock_object_registry.states().count(mock_obj) == 0) { - // No ON_CALL() was set on the given mock object. - return; - } - - // Clears the default actions for each mock method in the given mock - // object. 
- FunctionMockers& mockers = - g_mock_object_registry.states()[mock_obj].function_mockers; - for (FunctionMockers::const_iterator it = mockers.begin(); - it != mockers.end(); ++it) { - (*it)->ClearDefaultActionsLocked(); - } - - // We don't clear the content of mockers, as they may still be - // needed by VerifyAndClearExpectationsLocked(). -} - -Expectation::Expectation() {} - -Expectation::Expectation( - const std::shared_ptr<internal::ExpectationBase>& an_expectation_base) - : expectation_base_(an_expectation_base) {} - -Expectation::~Expectation() {} - -// Adds an expectation to a sequence. -void Sequence::AddExpectation(const Expectation& expectation) const { - if (*last_expectation_ != expectation) { - if (last_expectation_->expectation_base() != nullptr) { - expectation.expectation_base()->immediate_prerequisites_ += - *last_expectation_; - } - *last_expectation_ = expectation; - } -} - -// Creates the implicit sequence if there isn't one. -InSequence::InSequence() { - if (internal::g_gmock_implicit_sequence.get() == nullptr) { - internal::g_gmock_implicit_sequence.set(new Sequence); - sequence_created_ = true; - } else { - sequence_created_ = false; - } -} - -// Deletes the implicit sequence if it was created by the constructor -// of this object. -InSequence::~InSequence() { - if (sequence_created_) { - delete internal::g_gmock_implicit_sequence.get(); - internal::g_gmock_implicit_sequence.set(nullptr); - } -} - -} // namespace testing - -#ifdef _MSC_VER -#if _MSC_VER == 1900 -#pragma warning(pop) -#endif -#endif diff --git a/third_party/googletest/src/googlemock/src/gmock.cc b/third_party/googletest/src/googlemock/src/gmock.cc deleted file mode 100644 index 5025656a02..0000000000 --- a/third_party/googletest/src/googlemock/src/gmock.cc +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -#include "gmock/gmock.h" - -#include "gmock/internal/gmock-port.h" - -GMOCK_DEFINE_bool_(catch_leaked_mocks, true, - "true if and only if Google Mock should report leaked " - "mock objects as failures."); - -GMOCK_DEFINE_string_(verbose, testing::internal::kWarningVerbosity, - "Controls how verbose Google Mock's output is." - " Valid values:\n" - " info - prints all messages.\n" - " warning - prints warnings and errors.\n" - " error - prints errors only."); - -GMOCK_DEFINE_int32_(default_mock_behavior, 1, - "Controls the default behavior of mocks." - " Valid values:\n" - " 0 - by default, mocks act as NiceMocks.\n" - " 1 - by default, mocks act as NaggyMocks.\n" - " 2 - by default, mocks act as StrictMocks."); - -namespace testing { -namespace internal { - -// Parses a string as a command line flag. The string should have the -// format "--gmock_flag=value". When def_optional is true, the -// "=value" part can be omitted. -// -// Returns the value of the flag, or NULL if the parsing failed. -static const char* ParseGoogleMockFlagValue(const char* str, - const char* flag_name, - bool def_optional) { - // str and flag must not be NULL. - if (str == nullptr || flag_name == nullptr) return nullptr; - - // The flag must start with "--gmock_". - const std::string flag_name_str = std::string("--gmock_") + flag_name; - const size_t flag_name_len = flag_name_str.length(); - if (strncmp(str, flag_name_str.c_str(), flag_name_len) != 0) return nullptr; - - // Skips the flag name. - const char* flag_end = str + flag_name_len; - - // When def_optional is true, it's OK to not have a "=value" part. - if (def_optional && (flag_end[0] == '\0')) { - return flag_end; - } - - // If def_optional is true and there are more characters after the - // flag name, or if def_optional is false, there must be a '=' after - // the flag name. - if (flag_end[0] != '=') return nullptr; - - // Returns the string after "=". - return flag_end + 1; -} - -// Parses a string for a Google Mock bool flag, in the form of -// "--gmock_flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -static bool ParseGoogleMockFlag(const char* str, const char* flag_name, - bool* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseGoogleMockFlagValue(str, flag_name, true); - - // Aborts if the parsing failed. - if (value_str == nullptr) return false; - - // Converts the string value to a bool. - *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); - return true; -} - -// Parses a string for a Google Mock string flag, in the form of -// "--gmock_flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -template <typename String> -static bool ParseGoogleMockFlag(const char* str, const char* flag_name, - String* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseGoogleMockFlagValue(str, flag_name, false); - - // Aborts if the parsing failed. - if (value_str == nullptr) return false; - - // Sets *value to the value of the flag. - *value = value_str; - return true; -} - -static bool ParseGoogleMockFlag(const char* str, const char* flag_name, - int32_t* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseGoogleMockFlagValue(str, flag_name, true); - - // Aborts if the parsing failed. 
- if (value_str == nullptr) return false; - - // Sets *value to the value of the flag. - return ParseInt32(Message() << "The value of flag --" << flag_name, value_str, - value); -} - -// The internal implementation of InitGoogleMock(). -// -// The type parameter CharType can be instantiated to either char or -// wchar_t. -template <typename CharType> -void InitGoogleMockImpl(int* argc, CharType** argv) { - // Makes sure Google Test is initialized. InitGoogleTest() is - // idempotent, so it's fine if the user has already called it. - InitGoogleTest(argc, argv); - if (*argc <= 0) return; - - for (int i = 1; i != *argc; i++) { - const std::string arg_string = StreamableToString(argv[i]); - const char* const arg = arg_string.c_str(); - - // Do we see a Google Mock flag? - bool found_gmock_flag = false; - -#define GMOCK_INTERNAL_PARSE_FLAG(flag_name) \ - if (!found_gmock_flag) { \ - auto value = GMOCK_FLAG_GET(flag_name); \ - if (ParseGoogleMockFlag(arg, #flag_name, &value)) { \ - GMOCK_FLAG_SET(flag_name, value); \ - found_gmock_flag = true; \ - } \ - } - - GMOCK_INTERNAL_PARSE_FLAG(catch_leaked_mocks) - GMOCK_INTERNAL_PARSE_FLAG(verbose) - GMOCK_INTERNAL_PARSE_FLAG(default_mock_behavior) - - if (found_gmock_flag) { - // Yes. Shift the remainder of the argv list left by one. Note - // that argv has (*argc + 1) elements, the last one always being - // NULL. The following loop moves the trailing NULL element as - // well. - for (int j = i; j != *argc; j++) { - argv[j] = argv[j + 1]; - } - - // Decrements the argument count. - (*argc)--; - - // We also need to decrement the iterator as we just removed - // an element. - i--; - } - } -} - -} // namespace internal - -// Initializes Google Mock. This must be called before running the -// tests. In particular, it parses a command line for the flags that -// Google Mock recognizes. Whenever a Google Mock flag is seen, it is -// removed from argv, and *argc is decremented. -// -// No value is returned. Instead, the Google Mock flag variables are -// updated. -// -// Since Google Test is needed for Google Mock to work, this function -// also initializes Google Test and parses its flags, if that hasn't -// been done. -GTEST_API_ void InitGoogleMock(int* argc, char** argv) { - internal::InitGoogleMockImpl(argc, argv); -} - -// This overloaded version can be used in Windows programs compiled in -// UNICODE mode. -GTEST_API_ void InitGoogleMock(int* argc, wchar_t** argv) { - internal::InitGoogleMockImpl(argc, argv); -} - -// This overloaded version can be used on Arduino/embedded platforms where -// there is no argc/argv. -GTEST_API_ void InitGoogleMock() { - // Since Arduino doesn't have a command line, fake out the argc/argv arguments - int argc = 1; - const auto arg0 = "dummy"; - char* argv0 = const_cast<char*>(arg0); - char** argv = &argv0; - - internal::InitGoogleMockImpl(&argc, argv); -} - -} // namespace testing diff --git a/third_party/googletest/src/googlemock/src/gmock_main.cc b/third_party/googletest/src/googlemock/src/gmock_main.cc deleted file mode 100644 index b411c5ecb9..0000000000 --- a/third_party/googletest/src/googlemock/src/gmock_main.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include <iostream> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#if GTEST_OS_ESP8266 || GTEST_OS_ESP32 -#if GTEST_OS_ESP8266 -extern "C" { -#endif -void setup() { - // Since Google Mock depends on Google Test, InitGoogleMock() is - // also responsible for initializing Google Test. Therefore there's - // no need for calling testing::InitGoogleTest() separately. - testing::InitGoogleMock(); -} -void loop() { RUN_ALL_TESTS(); } -#if GTEST_OS_ESP8266 -} -#endif - -#else - -// MS C++ compiler/linker has a bug on Windows (not on Windows CE), which -// causes a link error when _tmain is defined in a static library and UNICODE -// is enabled. For this reason instead of _tmain, main function is used on -// Windows. See the following link to track the current status of this bug: -// https://web.archive.org/web/20170912203238/connect.microsoft.com/VisualStudio/feedback/details/394464/wmain-link-error-in-the-static-library -// // NOLINT -#if GTEST_OS_WINDOWS_MOBILE -#include <tchar.h> // NOLINT - -GTEST_API_ int _tmain(int argc, TCHAR** argv) { -#else -GTEST_API_ int main(int argc, char** argv) { -#endif // GTEST_OS_WINDOWS_MOBILE - std::cout << "Running main() from gmock_main.cc\n"; - // Since Google Mock depends on Google Test, InitGoogleMock() is - // also responsible for initializing Google Test. Therefore there's - // no need for calling testing::InitGoogleTest() separately. - testing::InitGoogleMock(&argc, argv); - return RUN_ALL_TESTS(); -} -#endif -- GitLab From c5bf7f732d0e9651f60eb101cd34dac951a61d20 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 1 Aug 2024 06:56:51 -0700 Subject: [PATCH 300/391] Test AOM_VALGRIND_BUILD with #ifdef or defined() Fix -Wundef warnings. The AOM_VALGRIND_BUILD macro is not defined in aom/aom_config.h, so it should be tested with #ifdef or defined(). See https://aomedia-review.googlesource.com/c/aom/+/137841 and https://aomedia-review.googlesource.com/c/aom/+/137861. 
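As a minimal stand-alone sketch of the warning (a hypothetical file, not part of this change; any macro the build may leave undefined behaves the same way):

  /* wundef_demo.c -- hypothetical demo; compile with: cc -c -Wundef wundef_demo.c */
  #if AOM_VALGRIND_BUILD /* -Wundef fires: the undefined macro is evaluated as 0 */
  int valgrind_build_if = 1;
  #endif
  #ifdef AOM_VALGRIND_BUILD /* clean: #ifdef only asks whether the macro exists */
  int valgrind_build_ifdef = 1;
  #endif
  #if defined(AOM_VALGRIND_BUILD) /* clean: defined() is the operator form of #ifdef */
  int valgrind_build_defined = 1;
  #endif

Note the two forms differ for a macro defined as 0 (#if is false, #ifdef is true), but AOM_VALGRIND_BUILD is expected to be either defined by the build command or absent entirely, so the rewrite preserves behavior.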
Bug: aomedia:356830476 Change-Id: Ia4e2a9fe1d436fcf9ec4cec3fa65e7207f861a54 --- test/ethread_test.cc | 8 ++++---- test/kf_test.cc | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/ethread_test.cc b/test/ethread_test.cc index da3be3fb2d..552ea18b22 100644 --- a/test/ethread_test.cc +++ b/test/ethread_test.cc @@ -470,7 +470,7 @@ AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTest, // The AVxEncoderThreadTestLarge takes up ~14% of total run-time of the // Valgrind long tests. Exclude it; the smaller tests are still run. -#if !AOM_VALGRIND_BUILD +#if !defined(AOM_VALGRIND_BUILD) class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {}; TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) { @@ -486,7 +486,7 @@ AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge, ::testing::Values(0, 1, 3, 5), ::testing::Values(1, 6), ::testing::Values(1, 6), ::testing::Values(0, 1)); -#endif // !AOM_VALGRIND_BUILD +#endif // !defined(AOM_VALGRIND_BUILD) TEST_P(AVxEncoderThreadTest, EncoderResultTest) { cfg_.large_scale_tile = 0; @@ -570,7 +570,7 @@ TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) { // AVxEncoderThreadLSTestLarge takes up about 2% of total run-time of // the Valgrind long tests. Since we already run AVxEncoderThreadLSTest, // skip this one for Valgrind. -#if !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD +#if !CONFIG_REALTIME_ONLY && !defined(AOM_VALGRIND_BUILD) class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {}; TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) { @@ -585,5 +585,5 @@ AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadLSTestLarge, ::libaom_test::kOnePassGood), ::testing::Values(1, 3), ::testing::Values(0, 6), ::testing::Values(0, 6), ::testing::Values(1)); -#endif // !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD +#endif // !CONFIG_REALTIME_ONLY && !defined(AOM_VALGRIND_BUILD) } // namespace diff --git a/test/kf_test.cc b/test/kf_test.cc index 14fc064efe..0ae3c64407 100644 --- a/test/kf_test.cc +++ b/test/kf_test.cc @@ -190,7 +190,7 @@ class KeyFrameIntervalTestLarge // Because valgrind builds take a very long time to run, use a lower // resolution video for valgrind runs. const char *TestFileName() { -#if AOM_VALGRIND_BUILD +#ifdef AOM_VALGRIND_BUILD return "hantro_collage_w176h144.yuv"; #else return "hantro_collage_w352h288.yuv"; @@ -198,7 +198,7 @@ const char *TestFileName() { } int TestFileWidth() { -#if AOM_VALGRIND_BUILD +#ifdef AOM_VALGRIND_BUILD return 176; #else return 352; @@ -206,7 +206,7 @@ int TestFileWidth() { } int TestFileHeight() { -#if AOM_VALGRIND_BUILD +#ifdef AOM_VALGRIND_BUILD return 144; #else return 288; -- GitLab From 5283110d61204e95f4540040fd29c2e058da18a2 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 1 Aug 2024 06:35:35 -0700 Subject: [PATCH 301/391] Test defined(CHROMIUM) to fix the -Wundef warning The CHROMIUM macro is only defined when libaom is compiled as part of the Chromium source tree, so we need to test it with defined(CHROMIUM) to avoid the -Wundef warning. This issue was introduced in https://aomedia-review.googlesource.com/c/aom/+/174001. 
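As a minimal stand-alone sketch (a hypothetical file, not part of this change) of why the fix uses defined() inside the expression rather than #ifdef, which cannot express a compound condition:

  /* chromium_wundef_demo.c -- hypothetical demo; compile with:
     cc -c -Wundef chromium_wundef_demo.c */
  #define CONFIG_SIZE_LIMIT 0 /* stand-in for the value aom_config.h normally provides */
  #if !CHROMIUM && !CONFIG_SIZE_LIMIT /* -Wundef fires: CHROMIUM is evaluated as 0 */
  int bare_macro_version = 1;
  #endif
  #if !defined(CHROMIUM) && !CONFIG_SIZE_LIMIT /* clean: defined() tests existence only */
  int defined_operator_version = 1;
  #endif

CONFIG_SIZE_LIMIT can still be tested by value because aom_config.h defines it in every configuration; only the externally supplied CHROMIUM macro needs the defined() wrapper.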
Bug: aomedia:356830476 Change-Id: Ib53b19aa2e6a2447b201dde8a41abce08c20ce04 --- test/invalid_file_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index ca173959db..f4343b190d 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -132,7 +132,7 @@ const DecodeParam kAV1InvalidFileTests[] = { { 4, "invalid-oss-fuzz-9463.ivf", "invalid-oss-fuzz-9463.ivf.res.2" }, { 1, "invalid-oss-fuzz-9720.ivf", nullptr }, { 1, "invalid-oss-fuzz-10389.ivf", "invalid-oss-fuzz-10389.ivf.res.4" }, -#if !CHROMIUM && !CONFIG_SIZE_LIMIT || \ +#if !defined(CHROMIUM) && !CONFIG_SIZE_LIMIT || \ (CONFIG_SIZE_LIMIT && DECODE_WIDTH_LIMIT >= 5120 && \ DECODE_HEIGHT_LIMIT >= 180) { 1, "invalid-oss-fuzz-11523.ivf", "invalid-oss-fuzz-11523.ivf.res.2" }, -- GitLab From 0a5353aac2a7c05d0ceccade798a29e4ac37cbfe Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 1 Aug 2024 12:53:11 -0700 Subject: [PATCH 302/391] Update third_party/libwebm to commit f4b07ec Bug: aomedia:356830476 Change-Id: I1070ef1683bb6fc5ffe4d7a7f3dbf532ed9c0253 --- third_party/libwebm/README.libaom | 2 +- third_party/libwebm/common/hdr_util.cc | 3 ++- third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 2 +- third_party/libwebm/mkvmuxer/mkvwriter.cc | 8 +++++++- third_party/libwebm/mkvparser/mkvreader.cc | 8 +++++++- 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/third_party/libwebm/README.libaom b/third_party/libwebm/README.libaom index 1eb0ce9a94..a038418acd 100644 --- a/third_party/libwebm/README.libaom +++ b/third_party/libwebm/README.libaom @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: affd7f4d9644aa2b65981fa6c7616400be760e6e +Version: f4b07ec144e61d9089144e8d54b4ecda0219c562 License: BSD License File: LICENSE.TXT diff --git a/third_party/libwebm/common/hdr_util.cc b/third_party/libwebm/common/hdr_util.cc index 916f7170b6..f1320a5361 100644 --- a/third_party/libwebm/common/hdr_util.cc +++ b/third_party/libwebm/common/hdr_util.cc @@ -202,7 +202,8 @@ bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length, features->bit_depth = priv_profile; } else if (id_byte == kVp9ChromaSubsamplingId) { const int priv_profile = static_cast<int>(private_data[offset++]); - if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3) + if (priv_profile != 0 && priv_profile != 1 && priv_profile != 2 && + priv_profile != 3) return false; if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent && features->chroma_subsampling != priv_profile) { diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index f538310e21..d1e835cd00 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -621,7 +621,7 @@ uint64 MakeUID(unsigned int* seed) { #ifdef _WIN32 (void)seed; const int32 nn = rand(); -#elif __ANDROID__ +#elif defined(__ANDROID__) (void)seed; int32 temp_num = 1; int fd = open("/dev/urandom", O_RDONLY); diff --git a/third_party/libwebm/mkvmuxer/mkvwriter.cc b/third_party/libwebm/mkvmuxer/mkvwriter.cc index d668384d85..9b714a5e7c 100644 --- a/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -80,8 +80,14 @@ int32 MkvWriter::Position(int64 position) { return _fseeki64(file_, position, SEEK_SET); #elif defined(_WIN32) return fseeko64(file_, static_cast<off_t>(position), SEEK_SET); -#else +#elif !(defined(__ANDROID__) && __ANDROID_API__ < 24 
&& !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) + // POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + // Android API level 24. See + // https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md return fseeko(file_, static_cast<off_t>(position), SEEK_SET); +#else + return fseek(file_, static_cast<long>(position), SEEK_SET); #endif } diff --git a/third_party/libwebm/mkvparser/mkvreader.cc b/third_party/libwebm/mkvparser/mkvreader.cc index 9d19c1be56..467260402a 100644 --- a/third_party/libwebm/mkvparser/mkvreader.cc +++ b/third_party/libwebm/mkvparser/mkvreader.cc @@ -120,8 +120,14 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { return -1; // error #elif defined(_WIN32) fseeko64(m_file, static_cast<off_t>(offset), SEEK_SET); -#else +#elif !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) + // POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + // Android API level 24. See + // https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md fseeko(m_file, static_cast<off_t>(offset), SEEK_SET); +#else + fseek(m_file, static_cast<long>(offset), SEEK_SET); #endif const size_t size = fread(buffer, 1, len, m_file); -- GitLab From c8b1fc25822cbefc25d703f1334f854c77ab831c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 1 Aug 2024 12:09:28 -0700 Subject: [PATCH 303/391] Include "gtest/gtest.h" using the shorter path Rely on the -I or -isystem compiler option to find "gtest/gtest.h". This makes it easier to build our tests against a copy of gtest outside the libaom source tree. Bug: aomedia:356830476, webm:42330726 Change-Id: I9e2c9302d1ff67471523be24da3d2c047447d7e1 --- test/accounting_test.cc | 2 +- test/acm_random.h | 2 +- test/active_map_test.cc | 2 +- test/allintra_end_to_end_test.cc | 2 +- test/altref_test.cc | 2 +- test/aom_image_test.cc | 2 +- test/aom_integer_test.cc | 2 +- test/aom_mem_test.cc | 2 +- test/aq_segment_test.cc | 2 +- test/arf_freq_test.cc | 2 +- test/av1_common_int_test.cc | 2 +- test/av1_config_test.cc | 2 +- test/av1_convolve_scale_test.cc | 2 +- test/av1_convolve_test.cc | 4 ++-- test/av1_encoder_parms_get_to_decoder.cc | 2 +- test/av1_ext_tile_test.cc | 2 +- test/av1_external_partition_test.cc | 2 +- test/av1_highbd_iht_test.cc | 2 +- test/av1_horz_only_frame_superres_test.cc | 2 +- test/av1_k_means_test.cc | 4 ++-- test/av1_key_value_api_test.cc | 2 +- test/av1_nn_predict_test.cc | 2 +- test/av1_quantize_test.cc | 2 +- test/av1_round_shift_array_test.cc | 2 +- test/av1_softmax_test.cc | 2 +- test/av1_txfm_test.h | 2 +- test/av1_wedge_utils_test.cc | 2 +- test/avg_test.cc | 2 +- test/avif_progressive_test.cc | 2 +- test/binary_codes_test.cc | 2 +- test/blend_a64_mask_1d_test.cc | 2 +- test/blend_a64_mask_test.cc | 2 +- test/block_test.cc | 2 +- test/boolcoder_test.cc | 2 +- test/borders_test.cc | 2 +- test/cdef_test.cc | 2 +- test/cfl_test.cc | 2 +- test/cnn_test.cc | 2 +- test/coding_path_sync.cc | 2 +- test/comp_avg_pred_test.cc | 2 +- test/comp_mask_pred_test.cc | 2 +- test/convolve_test.cc | 2 +- test/corner_match_test.cc | 2 +- test/cpu_speed_test.cc | 2 +- test/datarate_test.cc | 2 +- test/datarate_test.h | 2 +- test/decode_api_test.cc | 2 +- test/decode_multithreaded_test.cc | 2 +- test/decode_scalability_test.cc | 2 +- test/decode_test_driver.cc | 2 +- test/decode_test_driver.h | 2 +- test/deltaq_mode_test.cc | 2 +-
test/disflow_test.cc | 2 +- test/divu_small_test.cc | 2 +- test/dr_prediction_test.cc | 2 +- test/ec_test.cc | 2 +- test/encode_api_test.cc | 2 +- test/encode_perf_test.cc | 2 +- test/encode_small_width_height_test.cc | 2 +- test/encode_test_driver.cc | 2 +- test/encode_test_driver.h | 2 +- test/encodemb_test.cc | 2 +- test/encodetxb_test.cc | 2 +- test/end_to_end_psnr_test.cc | 2 +- test/end_to_end_qmpsnr_test.cc | 2 +- test/end_to_end_ssim_test.cc | 2 +- test/error_block_test.cc | 2 +- test/error_resilience_test.cc | 2 +- test/ethread_test.cc | 2 +- test/fdct4x4_test.cc | 2 +- test/fft_test.cc | 2 +- test/film_grain_table_test.cc | 2 +- test/filterintra_test.cc | 2 +- test/firstpass_test.cc | 2 +- test/force_key_frame_test.cc | 2 +- test/forced_max_frame_width_height_test.cc | 2 +- test/frame_parallel_enc_test.cc | 2 +- test/frame_resize_test.cc | 6 +++--- test/frame_size_tests.cc | 2 +- test/function_equivalence_test.h | 2 +- test/fwht4x4_test.cc | 2 +- test/gf_pyr_height_test.cc | 2 +- test/hadamard_test.cc | 2 +- test/hash_test.cc | 2 +- test/hbd_metrics_test.cc | 2 +- test/hiprec_convolve_test.cc | 2 +- test/hiprec_convolve_test_util.h | 2 +- test/horver_correlation_test.cc | 2 +- test/horz_superres_test.cc | 2 +- test/intra_edge_test.cc | 2 +- test/intrabc_test.cc | 2 +- test/intrapred_test.cc | 2 +- test/invalid_file_test.cc | 2 +- test/kf_test.cc | 2 +- test/level_test.cc | 2 +- test/log2_test.cc | 2 +- test/loopfilter_control_test.cc | 2 +- test/lossless_test.cc | 2 +- test/lpf_test.cc | 2 +- test/masked_sad_test.cc | 2 +- test/masked_variance_test.cc | 2 +- test/metadata_test.cc | 2 +- test/minmax_test.cc | 2 +- test/monochrome_test.cc | 2 +- test/motion_vector_test.cc | 2 +- test/mv_cost_test.cc | 2 +- test/noise_model_test.cc | 2 +- test/obmc_sad_test.cc | 2 +- test/obmc_variance_test.cc | 2 +- test/pickrst_test.cc | 2 +- test/quant_test.cc | 4 ++-- test/quantize_func_test.cc | 2 +- test/ratectrl_rtc_test.cc | 4 ++-- test/ratectrl_test.cc | 2 +- test/rd_test.cc | 2 +- test/reconinter_test.cc | 2 +- test/register_state_check.h | 2 +- test/resize_test.cc | 2 +- test/rt_end_to_end_test.cc | 2 +- test/sad_test.cc | 2 +- test/sb_multipass_test.cc | 2 +- test/sb_qp_sweep_test.cc | 2 +- test/scalability_test.cc | 2 +- test/scan_test.cc | 2 +- test/screen_content_test.cc | 2 +- test/segment_binarization_sync.cc | 2 +- test/selfguided_filter_test.cc | 2 +- test/sharpness_test.cc | 2 +- test/simd_impl.h | 4 ++-- test/sse_sum_test.cc | 2 +- test/still_picture_test.cc | 2 +- test/subtract_test.cc | 2 +- test/sum_squares_test.cc | 2 +- test/svc_datarate_test.cc | 2 +- test/temporal_filter_test.cc | 2 +- test/test_aom_rc.cc | 2 +- test/test_intra_pred_speed.cc | 2 +- test/test_libaom.cc | 2 +- test/test_vector_test.cc | 2 +- test/tile_config_test.cc | 2 +- test/tile_independence_test.cc | 2 +- test/time_stamp_test.cc | 2 +- test/tpl_model_test.cc | 2 +- test/transform_test_base.h | 2 +- test/util.h | 2 +- test/variance_test.cc | 2 +- test/video_source.h | 2 +- test/warp_filter_test.cc | 2 +- test/warp_filter_test_util.h | 2 +- test/webmenc_test.cc | 2 +- test/wiener_test.cc | 2 +- test/y4m_test.cc | 2 +- 152 files changed, 159 insertions(+), 159 deletions(-) diff --git a/test/accounting_test.cc b/test/accounting_test.cc index a861c59026..2c90a67b8a 100644 --- a/test/accounting_test.cc +++ b/test/accounting_test.cc @@ -13,7 +13,7 @@ #include <stdlib.h> #include <string.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include 
"test/acm_random.h" #include "aom/aom_integer.h" diff --git a/test/acm_random.h b/test/acm_random.h index 56b229e0e6..6fb6d566ae 100644 --- a/test/acm_random.h +++ b/test/acm_random.h @@ -12,7 +12,7 @@ #ifndef AOM_TEST_ACM_RANDOM_H_ #define AOM_TEST_ACM_RANDOM_H_ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "aom/aom_integer.h" diff --git a/test/active_map_test.cc b/test/active_map_test.cc index b9c44c13b6..fd5d61916b 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -11,7 +11,7 @@ #include <climits> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/allintra_end_to_end_test.cc b/test/allintra_end_to_end_test.cc index c234cb54c3..e588f610a1 100644 --- a/test/allintra_end_to_end_test.cc +++ b/test/allintra_end_to_end_test.cc @@ -12,7 +12,7 @@ #include <memory> #include <ostream> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" diff --git a/test/altref_test.cc b/test/altref_test.cc index ea5f302a6b..354a5a8b63 100644 --- a/test/altref_test.cc +++ b/test/altref_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/aom_image_test.cc b/test/aom_image_test.cc index d9727911d6..5976061fa9 100644 --- a/test/aom_image_test.cc +++ b/test/aom_image_test.cc @@ -12,7 +12,7 @@ #include <climits> #include "aom/aom_image.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" TEST(AomImageTest, AomImgWrapInvalidAlign) { const int kWidth = 128; diff --git a/test/aom_integer_test.cc b/test/aom_integer_test.cc index 43dbbd03e7..0092a8d821 100644 --- a/test/aom_integer_test.cc +++ b/test/aom_integer_test.cc @@ -10,7 +10,7 @@ */ #include "aom/aom_integer.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { const uint64_t kMaximumLeb128CodedSize = 8; diff --git a/test/aom_mem_test.cc b/test/aom_mem_test.cc index be0be00e73..2496342940 100644 --- a/test/aom_mem_test.cc +++ b/test/aom_mem_test.cc @@ -14,7 +14,7 @@ #include <cstdio> #include <cstddef> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" TEST(AomMemTest, Overflow) { // Allocations are aligned > 1 so SIZE_MAX should always fail. 
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc index f97bad274f..9e8e42d022 100644 --- a/test/aq_segment_test.cc +++ b/test/aq_segment_test.cc @@ -11,7 +11,7 @@ #include "config/aom_config.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/arf_freq_test.cc b/test/arf_freq_test.cc index b38f038c7a..1236ce2577 100644 --- a/test/arf_freq_test.cc +++ b/test/arf_freq_test.cc @@ -11,7 +11,7 @@ #include <memory> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" diff --git a/test/av1_common_int_test.cc b/test/av1_common_int_test.cc index fb8a6b0d23..36be364fb2 100644 --- a/test/av1_common_int_test.cc +++ b/test/av1_common_int_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "av1/common/av1_common_int.h" diff --git a/test/av1_config_test.cc b/test/av1_config_test.cc index 1e89f9d4d6..a198a56f3b 100644 --- a/test/av1_config_test.cc +++ b/test/av1_config_test.cc @@ -11,8 +11,8 @@ #include <string.h> #include "common/av1_config.h" +#include "gtest/gtest.h" #include "test/util.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc index ac723017bb..a8344feb91 100644 --- a/test/av1_convolve_scale_test.cc +++ b/test/av1_convolve_scale_test.cc @@ -12,7 +12,7 @@ #include <tuple> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 8a5c166134..d20a125fe0 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc @@ -16,9 +16,9 @@ #include <vector> #include "config/av1_rtcd.h" #include "config/aom_dsp_rtcd.h" -#include "test/acm_random.h" #include "aom_ports/aom_timer.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/acm_random.h" namespace { diff --git a/test/av1_encoder_parms_get_to_decoder.cc b/test/av1_encoder_parms_get_to_decoder.cc index b690f3b648..2867754152 100644 --- a/test/av1_encoder_parms_get_to_decoder.cc +++ b/test/av1_encoder_parms_get_to_decoder.cc @@ -11,7 +11,7 @@ #include <memory> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" diff --git a/test/av1_ext_tile_test.cc b/test/av1_ext_tile_test.cc index 7048927cd9..1eb6ced758 100644 --- a/test/av1_ext_tile_test.cc +++ b/test/av1_ext_tile_test.cc @@ -12,7 +12,7 @@ #include <assert.h> #include <string> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/av1_external_partition_test.cc b/test/av1_external_partition_test.cc index 10c130683e..0fe2dde488 100644 --- a/test/av1_external_partition_test.cc +++ b/test/av1_external_partition_test.cc @@ -18,7 +18,7 @@ #include "aom/aom_external_partition.h" #include 
"av1/common/blockd.h" #include "av1/encoder/encodeframe_utils.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/y4m_video_source.h" diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc index 649e367751..24cf9b03eb 100644 --- a/test/av1_highbd_iht_test.cc +++ b/test/av1_highbd_iht_test.cc @@ -11,7 +11,7 @@ #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/av1_rtcd.h" diff --git a/test/av1_horz_only_frame_superres_test.cc b/test/av1_horz_only_frame_superres_test.cc index d72ce1ea1b..2430a1a059 100644 --- a/test/av1_horz_only_frame_superres_test.cc +++ b/test/av1_horz_only_frame_superres_test.cc @@ -12,7 +12,7 @@ #include <tuple> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/av1_rtcd.h" diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc index ce08986e5f..db73847210 100644 --- a/test/av1_k_means_test.cc +++ b/test/av1_k_means_test.cc @@ -21,11 +21,11 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" -#include "test/acm_random.h" #include "av1/encoder/palette.h" +#include "gtest/gtest.h" +#include "test/acm_random.h" #include "test/register_state_check.h" #include "test/util.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace AV1Kmeans { typedef void (*av1_calc_indices_dim1_func)(const int16_t *data, diff --git a/test/av1_key_value_api_test.cc b/test/av1_key_value_api_test.cc index a25fbcf050..03cdeee523 100644 --- a/test/av1_key_value_api_test.cc +++ b/test/av1_key_value_api_test.cc @@ -18,7 +18,7 @@ #include "aom/aomcx.h" #include "aom/aomdx.h" #include "config/aom_config.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { typedef std::tuple<const char *, const char *> KeyValParam; diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc index dc714920b7..9f6e8fac9c 100644 --- a/test/av1_nn_predict_test.cc +++ b/test/av1_nn_predict_test.cc @@ -11,7 +11,7 @@ #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "aom/aom_integer.h" #include "aom_ports/aom_timer.h" diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc index 3e6697186b..101186bc3e 100644 --- a/test/av1_quantize_test.cc +++ b/test/av1_quantize_test.cc @@ -10,7 +10,7 @@ */ #include <stdlib.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" diff --git a/test/av1_round_shift_array_test.cc b/test/av1_round_shift_array_test.cc index 67cacaf67d..e9731b1d37 100644 --- a/test/av1_round_shift_array_test.cc +++ b/test/av1_round_shift_array_test.cc @@ -19,9 +19,9 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/util.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace AV1CompRoundShift { diff --git a/test/av1_softmax_test.cc b/test/av1_softmax_test.cc index 928b5857e5..18af945eb9 100644 --- a/test/av1_softmax_test.cc +++ b/test/av1_softmax_test.cc @@ -19,10 +19,10 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" 
#include "config/av1_rtcd.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/register_state_check.h" #include "test/util.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { using FastSoftmaxFn = void (*)(const float *const input, float *output); diff --git a/test/av1_txfm_test.h b/test/av1_txfm_test.h index 337304d7ed..0a78ca34aa 100644 --- a/test/av1_txfm_test.h +++ b/test/av1_txfm_test.h @@ -21,7 +21,7 @@ #include "config/av1_rtcd.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "av1/common/av1_txfm.h" diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc index 4967e9cb48..af11494d59 100644 --- a/test/av1_wedge_utils_test.cc +++ b/test/av1_wedge_utils_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/avg_test.cc b/test/avg_test.cc index 21c34ccc36..d1698fc1fa 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -14,7 +14,7 @@ #include <string> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/avif_progressive_test.cc b/test/avif_progressive_test.cc index d75f7063d8..ba0b159741 100644 --- a/test/avif_progressive_test.cc +++ b/test/avif_progressive_test.cc @@ -16,7 +16,7 @@ #include "aom/aom_codec.h" #include "aom/aom_encoder.h" #include "aom/aom_image.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/binary_codes_test.cc b/test/binary_codes_test.cc index e9dc85e88f..c62e320ec2 100644 --- a/test/binary_codes_test.cc +++ b/test/binary_codes_test.cc @@ -13,7 +13,7 @@ #include <stdlib.h> #include <string.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc index e6e4de1ca2..feee2d4add 100644 --- a/test/blend_a64_mask_1d_test.cc +++ b/test/blend_a64_mask_1d_test.cc @@ -13,7 +13,7 @@ #include <stdlib.h> #include <string.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/register_state_check.h" #include "test/function_equivalence_test.h" diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc index 3538677b43..43d0162c58 100644 --- a/test/blend_a64_mask_test.cc +++ b/test/blend_a64_mask_test.cc @@ -13,7 +13,7 @@ #include <stdlib.h> #include <string.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/register_state_check.h" #include "test/function_equivalence_test.h" diff --git a/test/block_test.cc b/test/block_test.cc index 6714393ec3..9600a2bb66 100644 --- a/test/block_test.cc +++ b/test/block_test.cc @@ -11,7 +11,7 @@ #include "aom/aom_codec.h" #include "av1/common/blockd.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/y4m_video_source.h" diff --git a/test/boolcoder_test.cc b/test/boolcoder_test.cc index c36980cc7b..de2ba659b3 100644 --- 
a/test/boolcoder_test.cc +++ b/test/boolcoder_test.cc @@ -13,7 +13,7 @@ #include <stdlib.h> #include <string.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "aom/aom_integer.h" diff --git a/test/borders_test.cc b/test/borders_test.cc index df793b6c95..a97d33ec4d 100644 --- a/test/borders_test.cc +++ b/test/borders_test.cc @@ -11,7 +11,7 @@ #include <climits> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/cdef_test.cc b/test/cdef_test.cc index 601b0384c6..7ce278c05c 100644 --- a/test/cdef_test.cc +++ b/test/cdef_test.cc @@ -15,7 +15,7 @@ #include <string> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" diff --git a/test/cfl_test.cc b/test/cfl_test.cc index bde346a378..e093c4e354 100644 --- a/test/cfl_test.cc +++ b/test/cfl_test.cc @@ -11,7 +11,7 @@ #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/av1_rtcd.h" diff --git a/test/cnn_test.cc b/test/cnn_test.cc index f57e26d0f8..3012451de8 100644 --- a/test/cnn_test.cc +++ b/test/cnn_test.cc @@ -13,7 +13,7 @@ #include <math.h> #include <stdio.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/av1_rtcd.h" diff --git a/test/coding_path_sync.cc b/test/coding_path_sync.cc index 79098376d4..a4e95e3165 100644 --- a/test/coding_path_sync.cc +++ b/test/coding_path_sync.cc @@ -10,7 +10,7 @@ */ #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "config/aom_config.h" diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index b9375d14b6..18c077b1ae 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -14,7 +14,7 @@ #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/util.h" #include "test/register_state_check.h" diff --git a/test/comp_mask_pred_test.cc b/test/comp_mask_pred_test.cc index 6b70ca23ff..953e48180f 100644 --- a/test/comp_mask_pred_test.cc +++ b/test/comp_mask_pred_test.cc @@ -24,10 +24,10 @@ #include "aom_ports/mem.h" #include "av1/common/reconinter.h" #include "av1/encoder/reconinter_enc.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/register_state_check.h" #include "test/util.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred, diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 9d3fa452b2..09e5f64faf 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -12,7 +12,7 @@ #include <string.h> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/corner_match_test.cc b/test/corner_match_test.cc index 76f5345950..a805329ec7 100644 --- a/test/corner_match_test.cc +++ b/test/corner_match_test.cc @@ -14,7 +14,7 @@ 
#include "config/aom_dsp_rtcd.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/util.h" #include "test/register_state_check.h" diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index a6ab219ae0..422bb74ad7 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/datarate_test.cc b/test/datarate_test.cc index facd86e36b..a66c90e517 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -11,7 +11,7 @@ #include "config/aom_config.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/codec_factory.h" #include "test/datarate_test.h" diff --git a/test/datarate_test.h b/test/datarate_test.h index 5b5c45e1a5..9c88ef528c 100644 --- a/test/datarate_test.h +++ b/test/datarate_test.h @@ -11,7 +11,7 @@ #include "config/aom_config.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc index e07b3a30d7..f504be053b 100644 --- a/test/decode_api_test.cc +++ b/test/decode_api_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/decode_multithreaded_test.cc b/test/decode_multithreaded_test.cc index 18086a2d6a..b0bd9d4cae 100644 --- a/test/decode_multithreaded_test.cc +++ b/test/decode_multithreaded_test.cc @@ -14,12 +14,12 @@ #include <string> #include "aom_mem/aom_mem.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/md5_helper.h" #include "test/util.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/decode_scalability_test.cc b/test/decode_scalability_test.cc index 2102ddb8c2..85cea91193 100644 --- a/test/decode_scalability_test.cc +++ b/test/decode_scalability_test.cc @@ -11,12 +11,12 @@ #include <ostream> +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/decode_test_driver.h" #include "test/ivf_video_source.h" #include "test/util.h" #include "test/video_source.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc index acfc882ce7..22a87ef434 100644 --- a/test/decode_test_driver.cc +++ b/test/decode_test_driver.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/decode_test_driver.h" diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h index 7812d708ec..4fa6c29fca 100644 --- a/test/decode_test_driver.h +++ b/test/decode_test_driver.h @@ -12,7 +12,7 @@ #ifndef AOM_TEST_DECODE_TEST_DRIVER_H_ #define AOM_TEST_DECODE_TEST_DRIVER_H_ #include <cstring> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/deltaq_mode_test.cc b/test/deltaq_mode_test.cc index 424af81405..35b697d550 100644 --- a/test/deltaq_mode_test.cc +++ b/test/deltaq_mode_test.cc @@ -18,7 +18,7 @@ #include "aom/aom_encoder.h" #include "aom/aom_image.h" #include "config/aom_config.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/disflow_test.cc b/test/disflow_test.cc index 5c07b0758e..d881f7baa4 100644 --- a/test/disflow_test.cc +++ b/test/disflow_test.cc @@ -11,7 +11,7 @@ #include "aom_dsp/flow_estimation/disflow.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_dsp_rtcd.h" #include "test/acm_random.h" diff --git a/test/divu_small_test.cc b/test/divu_small_test.cc index 7ddb22d8fd..4587803c8a 100644 --- a/test/divu_small_test.cc +++ b/test/divu_small_test.cc @@ -11,7 +11,7 @@ #include <stdlib.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "aom_dsp/odintrin.h" diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc index 0938a3db11..de90ec7bd6 100644 --- a/test/dr_prediction_test.cc +++ b/test/dr_prediction_test.cc @@ -12,7 +12,7 @@ #include <tuple> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/ec_test.cc b/test/ec_test.cc index b60ccbf8b6..5c80c9822c 100644 --- a/test/ec_test.cc +++ b/test/ec_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include <cstdlib> #include <memory> diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 36e79145f6..4a714584b3 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -16,7 +16,7 @@ #include <cstring> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index ba78a98c74..4f3806fdf1 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -10,7 +10,7 @@ */ #include <string> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "aom/aom_codec.h" #include "aom_ports/aom_timer.h" diff --git a/test/encode_small_width_height_test.cc b/test/encode_small_width_height_test.cc index 893e5ede35..2d705a75de 100644 --- a/test/encode_small_width_height_test.cc +++ b/test/encode_small_width_height_test.cc @@ -20,7 +20,7 @@ #include "aom/aomcx.h" #include "aom/aom_encoder.h" #include "config/aom_config.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index fb5ddcae7a..7bd956d133 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -12,7 +12,7 @@ #include <memory> #include <string> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 9a66e4424f..d665af9950 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -14,7 +14,7 @@ #include <string> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/encodemb_test.cc b/test/encodemb_test.cc index f84b23efb7..c148a715de 100644 --- a/test/encodemb_test.cc +++ b/test/encodemb_test.cc @@ -12,7 +12,7 @@ #include <stdint.h> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "av1/encoder/block.h" #include "av1/encoder/encodemb.h" diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc index 30cf053d5b..55047e74c1 100644 --- a/test/encodetxb_test.cc +++ b/test/encodetxb_test.cc @@ -14,7 +14,7 @@ #include <string.h> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" diff --git a/test/end_to_end_psnr_test.cc b/test/end_to_end_psnr_test.cc index 722ea54dd5..0daa44df93 100644 --- a/test/end_to_end_psnr_test.cc +++ b/test/end_to_end_psnr_test.cc @@ -12,7 +12,7 @@ #include <memory> #include <ostream> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" diff --git a/test/end_to_end_qmpsnr_test.cc b/test/end_to_end_qmpsnr_test.cc index e5d8afd88b..c5911ffe2b 100644 --- a/test/end_to_end_qmpsnr_test.cc +++ b/test/end_to_end_qmpsnr_test.cc @@ -14,11 +14,11 @@ #include "aom_ports/mem.h" #include "aom_dsp/ssim.h" #include "av1/common/blockd.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" #include 
"test/y4m_video_source.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/end_to_end_ssim_test.cc b/test/end_to_end_ssim_test.cc index ee13e0fc24..2b3fb87ef7 100644 --- a/test/end_to_end_ssim_test.cc +++ b/test/end_to_end_ssim_test.cc @@ -12,11 +12,11 @@ #include "aom_ports/mem.h" #include "aom_dsp/ssim.h" #include "av1/common/blockd.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" #include "test/y4m_video_source.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/error_block_test.cc b/test/error_block_test.cc index 45d363e16e..2139537238 100644 --- a/test/error_block_test.cc +++ b/test/error_block_test.cc @@ -14,7 +14,7 @@ #include <string> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index ef10ee85fc..efe4c49630 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/ethread_test.cc b/test/ethread_test.cc index 552ea18b22..b366422e2a 100644 --- a/test/ethread_test.cc +++ b/test/ethread_test.cc @@ -11,7 +11,7 @@ #include <string> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/md5_helper.h" diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index 1fb3a9a214..eceaf27562 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -15,7 +15,7 @@ #include <tuple> #include "aom_dsp/aom_dsp_common.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/av1_rtcd.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/fft_test.cc b/test/fft_test.cc index b110601af7..1d3d59adec 100644 --- a/test/fft_test.cc +++ b/test/fft_test.cc @@ -20,8 +20,8 @@ #include "aom_mem/aom_mem.h" #include "av1/common/common.h" #include "config/aom_dsp_rtcd.h" +#include "gtest/gtest.h" #include "test/acm_random.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/film_grain_table_test.cc b/test/film_grain_table_test.cc index ca6f1be9e0..33a709785f 100644 --- a/test/film_grain_table_test.cc +++ b/test/film_grain_table_test.cc @@ -10,7 +10,7 @@ */ #include <string> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "aom_dsp/grain_table.h" #include "aom/internal/aom_codec_internal.h" #include "av1/encoder/grain_test_vectors.h" diff --git a/test/filterintra_test.cc b/test/filterintra_test.cc index 1d6c7f7989..d425287b35 100644 --- a/test/filterintra_test.cc +++ b/test/filterintra_test.cc @@ -11,7 +11,7 @@ #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/av1_rtcd.h" diff --git a/test/firstpass_test.cc b/test/firstpass_test.cc index 020f6b3b03..33aa8acd48 100644 --- a/test/firstpass_test.cc +++ 
b/test/firstpass_test.cc @@ -13,7 +13,7 @@ #include "av1/common/common.h" #include "av1/encoder/firstpass.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/force_key_frame_test.cc b/test/force_key_frame_test.cc index f00b80307d..48ebcb22f8 100644 --- a/test/force_key_frame_test.cc +++ b/test/force_key_frame_test.cc @@ -19,7 +19,7 @@ #include "aom/aomcx.h" #include "aom/aom_encoder.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/forced_max_frame_width_height_test.cc b/test/forced_max_frame_width_height_test.cc index ccde456d1b..5211d665b9 100644 --- a/test/forced_max_frame_width_height_test.cc +++ b/test/forced_max_frame_width_height_test.cc @@ -22,7 +22,7 @@ #include "aom/aomcx.h" #include "aom/aom_encoder.h" #include "config/aom_config.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/frame_parallel_enc_test.cc b/test/frame_parallel_enc_test.cc index cb694296af..68c8271aa1 100644 --- a/test/frame_parallel_enc_test.cc +++ b/test/frame_parallel_enc_test.cc @@ -11,7 +11,7 @@ #include <string> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/md5_helper.h" diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc index 029a823c1f..a9acb247db 100644 --- a/test/frame_resize_test.cc +++ b/test/frame_resize_test.cc @@ -13,11 +13,11 @@ #include <new> #include "config/av1_rtcd.h" -#include "test/acm_random.h" -#include "test/util.h" #include "aom_ports/aom_timer.h" #include "aom_ports/bitops.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/acm_random.h" +#include "test/util.h" namespace { diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index 32d9a45514..65269aaa8b 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -12,7 +12,7 @@ #include <array> #include <memory> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/video_source.h" #include "test/util.h" diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h index e8f9835253..6d15fa0465 100644 --- a/test/function_equivalence_test.h +++ b/test/function_equivalence_test.h @@ -14,7 +14,7 @@ #include <ostream> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/util.h" diff --git a/test/fwht4x4_test.cc b/test/fwht4x4_test.cc index 630bd141c7..8e9600a26e 100644 --- a/test/fwht4x4_test.cc +++ b/test/fwht4x4_test.cc @@ -15,7 +15,7 @@ #include <tuple> #include "aom_dsp/aom_dsp_common.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/av1_rtcd.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/gf_pyr_height_test.cc b/test/gf_pyr_height_test.cc index a38330bbbf..0ce5d33ac6 100644 --- a/test/gf_pyr_height_test.cc +++ b/test/gf_pyr_height_test.cc @@ -11,7 +11,7 @@ #include <ostream> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff 
--git a/test/hadamard_test.cc b/test/hadamard_test.cc index aa9ceb08ad..b867d0a91b 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -12,7 +12,7 @@ #include <algorithm> #include <ostream> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/hash_test.cc b/test/hash_test.cc index 5fd68a52ae..c82b8cdbc7 100644 --- a/test/hash_test.cc +++ b/test/hash_test.cc @@ -18,9 +18,9 @@ #include "aom_ports/aom_timer.h" #include "av1/encoder/hash.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/util.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc index 7650f7f44d..af050a0646 100644 --- a/test/hbd_metrics_test.cc +++ b/test/hbd_metrics_test.cc @@ -14,7 +14,7 @@ #include <new> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/util.h" diff --git a/test/hiprec_convolve_test.cc b/test/hiprec_convolve_test.cc index 1eba7cd6ae..2dde9da9a3 100644 --- a/test/hiprec_convolve_test.cc +++ b/test/hiprec_convolve_test.cc @@ -11,7 +11,7 @@ #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/hiprec_convolve_test_util.h" using libaom_test::ACMRandom; diff --git a/test/hiprec_convolve_test_util.h b/test/hiprec_convolve_test_util.h index d0ce96ec5d..52a43e9da4 100644 --- a/test/hiprec_convolve_test_util.h +++ b/test/hiprec_convolve_test_util.h @@ -16,10 +16,10 @@ #include "config/av1_rtcd.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/util.h" #include "test/register_state_check.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" #include "aom_ports/aom_timer.h" #include "av1/common/convolve.h" diff --git a/test/horver_correlation_test.cc b/test/horver_correlation_test.cc index 760461e3f0..bab3912a86 100644 --- a/test/horver_correlation_test.cc +++ b/test/horver_correlation_test.cc @@ -11,7 +11,7 @@ #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/register_state_check.h" diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc index 963df87c7d..fee0ccac94 100644 --- a/test/horz_superres_test.cc +++ b/test/horz_superres_test.cc @@ -13,7 +13,7 @@ #include <ostream> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "av1/encoder/encoder.h" diff --git a/test/intra_edge_test.cc b/test/intra_edge_test.cc index eb56b627d8..d5cc3d5a45 100644 --- a/test/intra_edge_test.cc +++ b/test/intra_edge_test.cc @@ -13,7 +13,7 @@ #include <stdlib.h> #include <string.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/register_state_check.h" #include "test/function_equivalence_test.h" diff --git a/test/intrabc_test.cc b/test/intrabc_test.cc index 301cc6dd05..5293f01d27 100644 --- a/test/intrabc_test.cc +++ b/test/intrabc_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc index 157ad26e8a..1371c7d79e 100644 --- a/test/intrapred_test.cc +++ b/test/intrapred_test.cc @@ -11,7 +11,7 @@ #include <string> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index f4343b190d..9a052b4f16 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -13,7 +13,7 @@ #include <ostream> #include <string> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/ivf_video_source.h" #include "test/util.h" diff --git a/test/kf_test.cc b/test/kf_test.cc index 0ae3c64407..3d76873b02 100644 --- a/test/kf_test.cc +++ b/test/kf_test.cc @@ -17,7 +17,7 @@ #include "aom/aom_encoder.h" #include "aom/aom_image.h" #include "aom/aomcx.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/level_test.cc b/test/level_test.cc index 637e3e0ef5..dd8981c5d7 100644 --- a/test/level_test.cc +++ b/test/level_test.cc @@ -11,7 +11,7 @@ #include <memory> #include <string> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" diff --git a/test/log2_test.cc b/test/log2_test.cc index 3b1c979231..59b0423a3d 100644 --- a/test/log2_test.cc +++ b/test/log2_test.cc @@ -14,7 +14,7 @@ #include "aom_ports/bitops.h" #include "av1/common/entropymode.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" TEST(Log2Test, GetMsb) { // Test small numbers exhaustively. diff --git a/test/loopfilter_control_test.cc b/test/loopfilter_control_test.cc index d120aaa6fe..04afa5bccc 100644 --- a/test/loopfilter_control_test.cc +++ b/test/loopfilter_control_test.cc @@ -13,7 +13,7 @@ #include <string> #include <unordered_map> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" diff --git a/test/lossless_test.cc b/test/lossless_test.cc index fff874ca86..5e126b6096 100644 --- a/test/lossless_test.cc +++ b/test/lossless_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 998ebb8962..edb107b29b 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -14,7 +14,7 @@ #include <string> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc index 0d5986cdb8..8f825c6429 100644 --- a/test/masked_sad_test.cc +++ b/test/masked_sad_test.cc @@ -13,7 +13,7 @@ #include <string.h> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/register_state_check.h" #include "test/util.h" diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc index 14d36d8b25..4d9ebe9ac4 100644 --- a/test/masked_variance_test.cc +++ b/test/masked_variance_test.cc @@ -14,7 +14,7 @@ #include <string.h> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/register_state_check.h" #include "test/util.h" diff --git a/test/metadata_test.cc b/test/metadata_test.cc index e1538f6962..537909f96e 100644 --- a/test/metadata_test.cc +++ b/test/metadata_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "aom/aom_codec.h" #include "aom/aom_image.h" diff --git a/test/minmax_test.cc b/test/minmax_test.cc index dd88541d0c..b87ee9728d 100644 --- a/test/minmax_test.cc +++ b/test/minmax_test.cc @@ -13,7 +13,7 @@ #include <stdlib.h> #include <string.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/monochrome_test.cc b/test/monochrome_test.cc index 1989131095..c157275d46 100644 --- a/test/monochrome_test.cc +++ b/test/monochrome_test.cc @@ -11,7 +11,7 @@ #include <climits> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/motion_vector_test.cc b/test/motion_vector_test.cc index 2d897caf7a..0709eeb6ea 100644 --- a/test/motion_vector_test.cc +++ b/test/motion_vector_test.cc @@ -11,7 +11,7 @@ #include <memory> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" diff --git a/test/mv_cost_test.cc b/test/mv_cost_test.cc index a0aed58495..d8ab5e05ef 100644 --- a/test/mv_cost_test.cc +++ b/test/mv_cost_test.cc @@ -11,7 +11,7 @@ #include "av1/encoder/cost.h" #include "av1/encoder/encodemv.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc index 2ef5dc2897..9f05a44de9 100644 --- a/test/noise_model_test.cc +++ b/test/noise_model_test.cc @@ -17,8 +17,8 @@ #include "aom_dsp/noise_model.h" #include "aom_dsp/noise_util.h" #include "config/aom_dsp_rtcd.h" +#include "gtest/gtest.h" #include "test/acm_random.h" -#include 
"third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc index 0f01e90702..dd6484fa83 100644 --- a/test/obmc_sad_test.cc +++ b/test/obmc_sad_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/function_equivalence_test.h" #include "test/register_state_check.h" diff --git a/test/obmc_variance_test.cc b/test/obmc_variance_test.cc index a629d3c7dd..39b816e2e8 100644 --- a/test/obmc_variance_test.cc +++ b/test/obmc_variance_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/function_equivalence_test.h" diff --git a/test/pickrst_test.cc b/test/pickrst_test.cc index 3ab22e1706..fd25b0ccf0 100644 --- a/test/pickrst_test.cc +++ b/test/pickrst_test.cc @@ -11,7 +11,7 @@ #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/register_state_check.h" #include "test/acm_random.h" diff --git a/test/quant_test.cc b/test/quant_test.cc index 40ca470e66..120eae7655 100644 --- a/test/quant_test.cc +++ b/test/quant_test.cc @@ -10,12 +10,12 @@ */ #include "config/aom_config.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "av1/encoder/av1_quantize.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" -#include "av1/encoder/av1_quantize.h" #include "test/y4m_video_source.h" namespace { diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc index bebf72fc32..ac41714424 100644 --- a/test/quantize_func_test.cc +++ b/test/quantize_func_test.cc @@ -12,7 +12,7 @@ #include <algorithm> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 66911152e6..31ae4509a4 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -13,11 +13,11 @@ #include <memory> +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" -#include "test/util.h" #include "test/i420_video_source.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/util.h" namespace { diff --git a/test/ratectrl_test.cc b/test/ratectrl_test.cc index 259f12300e..329ec89c85 100644 --- a/test/ratectrl_test.cc +++ b/test/ratectrl_test.cc @@ -12,7 +12,7 @@ #include "av1/encoder/firstpass.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/tpl_model.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/rd_test.cc b/test/rd_test.cc index 8c9a54c476..58049ed85f 100644 --- a/test/rd_test.cc +++ b/test/rd_test.cc @@ -15,7 +15,7 @@ #include "av1/common/quant_common.h" #include "av1/encoder/rd.h" #include "aom/aom_codec.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/reconinter_test.cc b/test/reconinter_test.cc index 772cb9b123..91bf7200ae 100644 --- a/test/reconinter_test.cc +++ 
b/test/reconinter_test.cc @@ -20,10 +20,10 @@ #include "aom_ports/mem.h" #include "av1/common/scan.h" #include "av1/common/txb_common.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/register_state_check.h" #include "test/util.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { using libaom_test::ACMRandom; diff --git a/test/register_state_check.h b/test/register_state_check.h index b70fb503d3..20fe88ac89 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -12,7 +12,7 @@ #ifndef AOM_TEST_REGISTER_STATE_CHECK_H_ #define AOM_TEST_REGISTER_STATE_CHECK_H_ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/resize_test.cc b/test/resize_test.cc index 1682f9fd83..6eacb412ce 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -15,7 +15,7 @@ #include "aom/aomcx.h" #include "aom_dsp/aom_dsp_common.h" #include "av1/encoder/encoder.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc index 2d07f0fd7b..1e9238c06f 100644 --- a/test/rt_end_to_end_test.cc +++ b/test/rt_end_to_end_test.cc @@ -14,7 +14,7 @@ #include <string> #include <unordered_map> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" diff --git a/test/sad_test.cc b/test/sad_test.cc index cd29a6f1f0..fd10999f50 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -14,7 +14,7 @@ #include <stdio.h> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/sb_multipass_test.cc b/test/sb_multipass_test.cc index f70ebd2ec0..3982dafdfd 100644 --- a/test/sb_multipass_test.cc +++ b/test/sb_multipass_test.cc @@ -12,7 +12,7 @@ #include <initializer_list> #include <string> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/md5_helper.h" diff --git a/test/sb_qp_sweep_test.cc b/test/sb_qp_sweep_test.cc index 5555d20e26..2e303b88e9 100644 --- a/test/sb_qp_sweep_test.cc +++ b/test/sb_qp_sweep_test.cc @@ -14,12 +14,12 @@ #include <string> #include <vector> +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/md5_helper.h" #include "test/util.h" #include "test/yuv_video_source.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/scalability_test.cc b/test/scalability_test.cc index 6196b3e177..781823c02a 100644 --- a/test/scalability_test.cc +++ b/test/scalability_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/scan_test.cc b/test/scan_test.cc index 54a05ead53..17178ab93b 100644 --- a/test/scan_test.cc +++ b/test/scan_test.cc @@ -9,9 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" #include "av1/common/scan.h" #include "av1/common/txb_common.h" +#include "gtest/gtest.h" #include "test/av1_txfm_test.h" static int scan_test(const int16_t *scan, const int16_t *iscan, int si, int r, diff --git a/test/screen_content_test.cc b/test/screen_content_test.cc index 681843f692..837bdf70c8 100644 --- a/test/screen_content_test.cc +++ b/test/screen_content_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom/aom_codec.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/y4m_video_source.h" diff --git a/test/segment_binarization_sync.cc b/test/segment_binarization_sync.cc index db866cfc65..2d4b6beeac 100644 --- a/test/segment_binarization_sync.cc +++ b/test/segment_binarization_sync.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "av1/common/seg_common.h" #include "av1/decoder/decodemv.h" diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc index 9f793af573..33e992057c 100644 --- a/test/selfguided_filter_test.cc +++ b/test/selfguided_filter_test.cc @@ -12,7 +12,7 @@ #include <ctime> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/av1_rtcd.h" diff --git a/test/sharpness_test.cc b/test/sharpness_test.cc index afd64450ea..d3815fd8b3 100644 --- a/test/sharpness_test.cc +++ b/test/sharpness_test.cc @@ -11,7 +11,7 @@ #include <unordered_map> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" diff --git a/test/simd_impl.h b/test/simd_impl.h index d055ebae02..f11c903b57 100644 --- a/test/simd_impl.h +++ b/test/simd_impl.h @@ -12,10 +12,10 @@ #include <tuple> #define SIMD_CHECK 1 -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" -#include "test/register_state_check.h" #include "aom_dsp/aom_simd_inline.h" #include "aom_dsp/simd/v256_intrinsics_c.h" +#include "gtest/gtest.h" +#include "test/register_state_check.h" namespace SIMD_NAMESPACE { diff --git a/test/sse_sum_test.cc b/test/sse_sum_test.cc index 0dba47d452..54fbeaef8e 100644 --- a/test/sse_sum_test.cc +++ b/test/sse_sum_test.cc @@ -14,7 +14,7 @@ #include <string> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/still_picture_test.cc b/test/still_picture_test.cc index 2908a5d45a..7420b32f45 100644 --- a/test/still_picture_test.cc +++ b/test/still_picture_test.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/subtract_test.cc b/test/subtract_test.cc index 1f454b6bf7..ca2b2a9b5c 100644 --- a/test/subtract_test.cc +++ b/test/subtract_test.cc @@ -12,7 +12,7 @@ #include <cstdint> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index e2c455653d..f9174c1896 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -14,7 +14,7 @@ #include <string> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 633c279f6d..7da21608c1 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -12,7 +12,7 @@ #include <climits> #include <vector> #include "config/aom_config.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/datarate_test.h" #include "test/encode_test_driver.h" diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc index 52e2366aa8..cda06c51a6 100644 --- a/test/temporal_filter_test.cc +++ b/test/temporal_filter_test.cc @@ -16,7 +16,7 @@ #include <string> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/test_aom_rc.cc b/test/test_aom_rc.cc index 2252443e1a..a4ea8f9574 100644 --- a/test/test_aom_rc.cc +++ b/test/test_aom_rc.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index b5dbadd391..b4849edcba 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -14,7 +14,7 @@ #include <stdio.h> #include <string> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/test_libaom.cc b/test/test_libaom.cc index 2a29b45ca0..b8a91e06c6 100644 --- a/test/test_libaom.cc +++ b/test/test_libaom.cc @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc index fd9300200f..b521e932c4 100644 --- a/test/test_vector_test.cc +++ b/test/test_vector_test.cc @@ -15,9 +15,9 @@ #include <set> #include <string> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" #include "common/tools_common.h" #include "config/aom_config.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/decode_test_driver.h" #include "test/ivf_video_source.h" diff --git a/test/tile_config_test.cc b/test/tile_config_test.cc index 4bd4a67b4f..7604f237df 100644 --- a/test/tile_config_test.cc +++ b/test/tile_config_test.cc @@ -11,7 +11,7 @@ #include "aom/aom_codec.h" #include "aom_dsp/aom_dsp_common.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/y4m_video_source.h" diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index fcfcbce783..a2ea5c3d40 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -12,7 +12,7 @@ #include <cstdio> #include <cstdlib> #include <string> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" diff --git a/test/time_stamp_test.cc b/test/time_stamp_test.cc index 17320415a2..c3ef1eceae 100644 --- a/test/time_stamp_test.cc +++ b/test/time_stamp_test.cc @@ -11,11 +11,11 @@ // Test AOM timestamp handling +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" #include "test/video_source.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { diff --git a/test/tpl_model_test.cc b/test/tpl_model_test.cc index 4f15fb571d..3657009b9c 100644 --- a/test/tpl_model_test.cc +++ b/test/tpl_model_test.cc @@ -17,7 +17,7 @@ #include "av1/encoder/cost.h" #include "av1/encoder/tpl_model.h" #include "av1/encoder/encoder.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/transform_test_base.h b/test/transform_test_base.h index 92385c3e68..4c037da855 100644 --- a/test/transform_test_base.h +++ b/test/transform_test_base.h @@ -12,7 +12,7 @@ #ifndef AOM_TEST_TRANSFORM_TEST_BASE_H_ #define AOM_TEST_TRANSFORM_TEST_BASE_H_ -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "aom/aom_codec.h" #include "aom_dsp/txfm_common.h" diff --git a/test/util.h b/test/util.h index e043d87097..c60e961404 100644 --- a/test/util.h +++ b/test/util.h @@ -15,7 +15,7 @@ #include <math.h> #include <stdio.h> #include <string.h> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "aom/aom_image.h" #include "aom_ports/aom_timer.h" diff --git a/test/variance_test.cc b/test/variance_test.cc index 283c174308..6f98ae4adf 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -14,7 +14,7 @@ #include <ostream> #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/test/video_source.h b/test/video_source.h 
index da51f6a2c4..d537d55964 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -27,7 +27,7 @@ #include "aom/aom_encoder.h" #include "test/acm_random.h" #if !defined(_WIN32) -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #endif namespace libaom_test { diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc index bade6799b9..3a35075bbe 100644 --- a/test/warp_filter_test.cc +++ b/test/warp_filter_test.cc @@ -10,7 +10,7 @@ */ #include <tuple> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/warp_filter_test_util.h" using libaom_test::ACMRandom; #if CONFIG_AV1_HIGHBITDEPTH diff --git a/test/warp_filter_test_util.h b/test/warp_filter_test_util.h index b2f370e614..7e401d8348 100644 --- a/test/warp_filter_test_util.h +++ b/test/warp_filter_test_util.h @@ -17,7 +17,7 @@ #include "config/av1_rtcd.h" #include "config/aom_dsp_rtcd.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/util.h" #include "test/register_state_check.h" diff --git a/test/webmenc_test.cc b/test/webmenc_test.cc index eba0760ee2..d6245f5d98 100644 --- a/test/webmenc_test.cc +++ b/test/webmenc_test.cc @@ -11,7 +11,7 @@ #include <string> #include "common/webmenc.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" namespace { diff --git a/test/wiener_test.cc b/test/wiener_test.cc index d0bd045e3d..2cac268d91 100644 --- a/test/wiener_test.cc +++ b/test/wiener_test.cc @@ -13,7 +13,7 @@ #include <utility> #include <vector> -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/register_state_check.h" #include "test/acm_random.h" diff --git a/test/y4m_test.cc b/test/y4m_test.cc index beec69c589..a96ab1261e 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -14,10 +14,10 @@ #include "config/aom_config.h" #include "common/y4menc.h" +#include "gtest/gtest.h" #include "test/md5_helper.h" #include "test/util.h" #include "test/y4m_video_source.h" -#include "third_party/googletest/src/googletest/include/gtest/gtest.h" namespace { -- GitLab From 2f5dfe826dd70f7aceda04afe163ff21238d697a Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Wed, 31 Jul 2024 14:08:23 +0100 Subject: [PATCH 304/391] Refactor Arm Neon I8MM implementation of convolve_x_sr Move the 8-tap filter path into its own inline function and factor out some common constants. This is a preparatory change for adding a 6-tap path accelerated by I8MM matrix multiply instructions. Change-Id: Id1fa8056072582230e1e9a5fe4a8fc5861204daa --- av1/common/arm/convolve_neon_i8mm.c | 165 ++++++++++++++-------------- 1 file changed, 82 insertions(+), 83 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index 8d7eb51772..f89545a509 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -149,20 +149,71 @@ static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src, } } +static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; + + int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filter, 0); + sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filter, 1); + + int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filter, 0); + sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filter, 1); + + int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); + // We halved the convolution filter values so - 1 from the right shift. + return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); +} + +static INLINE void convolve_x_sr_8tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x, + const int32x4_t horiz_const) { + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int w = width; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, const int8x8_t filters, - const uint8x16_t permute_tbl) { + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); - // Dot product constants: - // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding - // right shift by FILTER_BITS - instead of a first rounding right shift by - // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. Halve the total because we halved the filter values. - int32x4_t acc = vdupq_n_s32((1 << (ROUND0_BITS - 1)) / 2); - int32x4_t sum = vusdotq_lane_s32(acc, perm_samples, filters, 0); + int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples, filters, 0); // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); @@ -170,20 +221,15 @@ static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, const int8x8_t filters, - const uint8x16x2_t permute_tbl) { + const uint8x16x2_t permute_tbl, + const int32x4_t horiz_const) { // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]) }; - // Dot product constants: - // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding - // right shift by FILTER_BITS - instead of a first rounding right shift by - // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. Halve the total because we halved the filter values. - int32x4_t acc = vdupq_n_s32((1 << (ROUND0_BITS - 1)) / 2); - + int32x4_t acc = horiz_const; int32x4_t sum0123 = vusdotq_lane_s32(acc, perm_samples[0], filters, 0); int32x4_t sum4567 = vusdotq_lane_s32(acc, perm_samples[1], filters, 0); @@ -195,7 +241,8 @@ static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, static INLINE void convolve_x_sr_4tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) { + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x, + const int32x4_t horiz_const) { const int16x4_t x_filter = vld1_s16(filter_x + 2); // All 4-tap and bilinear filter values are even, so halve them to reduce // intermediate precision requirements. @@ -207,10 +254,10 @@ static INLINE void convolve_x_sr_4tap_neon_i8mm( uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - int16x4_t t0 = convolve4_4_x(s0, filter, perm_tbl); - int16x4_t t1 = convolve4_4_x(s1, filter, perm_tbl); - int16x4_t t2 = convolve4_4_x(s2, filter, perm_tbl); - int16x4_t t3 = convolve4_4_x(s3, filter, perm_tbl); + int16x4_t t0 = convolve4_4_x(s0, filter, perm_tbl, horiz_const); + int16x4_t t1 = convolve4_4_x(s1, filter, perm_tbl, horiz_const); + int16x4_t t2 = convolve4_4_x(s2, filter, perm_tbl, horiz_const); + int16x4_t t3 = convolve4_4_x(s3, filter, perm_tbl, horiz_const); // We halved the filter values so -1 from right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); @@ -233,10 +280,10 @@ static INLINE void convolve_x_sr_4tap_neon_i8mm( uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - uint8x8_t d0 = convolve4_8_x(s0, filter, perm_tbl); - uint8x8_t d1 = convolve4_8_x(s1, filter, perm_tbl); - uint8x8_t d2 = convolve4_8_x(s2, filter, perm_tbl); - uint8x8_t d3 = convolve4_8_x(s3, filter, perm_tbl); + uint8x8_t d0 = convolve4_8_x(s0, filter, perm_tbl, horiz_const); + uint8x8_t d1 = convolve4_8_x(s1, filter, perm_tbl, horiz_const); + uint8x8_t d2 = convolve4_8_x(s2, filter, perm_tbl, horiz_const); + uint8x8_t d3 = convolve4_8_x(s3, filter, perm_tbl, horiz_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -251,28 +298,6 @@ static INLINE void convolve_x_sr_4tap_neon_i8mm( } } -static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, - const uint8x16x3_t permute_tbl, - const int32x4_t horiz_const) { - // Permute samples ready for dot product. 
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), - vqtbl1q_u8(samples, permute_tbl.val[1]), - vqtbl1q_u8(samples, permute_tbl.val[2]) }; - - int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filter, 0); - sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filter, 1); - - int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filter, 0); - sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filter, 1); - - int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); - // We halved the convolution filter values so - 1 from the right shift. - return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); -} - void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -298,47 +323,21 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, return; } + // A shim of 1 << (ROUND0_BITS - 1) enables us to simplify computation in the + // convolution kernels: Adding this shim enables us to use a single rounding + // right shift by FILTER_BITS instead of two rounding right shifts: first by + // ROUND0_BITS, and then subsequently by FILTER_BITS - ROUND0_BITS. + // Halve the total because we will halve the filter values. + const int32x4_t horiz_const = vdupq_n_s32((1 << ((ROUND0_BITS - 1)) / 2)); + if (filter_taps <= 4) { convolve_x_sr_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride, w, h, - x_filter_ptr); + x_filter_ptr, horiz_const); return; } - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - - // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single - // rounding right shift by FILTER_BITS - instead of a first rounding right - // shift by ROUND0_BITS, followed by second rounding right shift by - // FILTER_BITS - ROUND0_BITS. - // The outermost -1 is needed because we halved the filter values. - const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1)); - - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - do { - const uint8_t *s = src; - uint8_t *d = dst; - int width = w; - - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); - uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); - uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); - uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); + convolve_x_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr, horiz_const); } static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, -- GitLab From 44bded946d17a1b41acbee71de29809125f7c694 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Wed, 31 Jul 2024 14:46:40 +0100 Subject: [PATCH 305/391] Add Arm Neon USMMLA implementation of 6-tap convolve_x_sr By permuting the input samples and the 6-tap filter we can use the Armv8.6 I8MM USMMLA matrix multiply instructions to accelerate horizontal 6-tap convolutions. 
The 2x8 by 8x2 matrix multiply instruction does twice the work of a USDOT dot product instruction. Change-Id: Ibf43327226f5066c5e67d70a3d44a219f492f7a2 --- av1/common/arm/convolve_neon_i8mm.c | 83 ++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 6 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index f89545a509..d73891e666 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -31,6 +31,13 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; +DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { + // clang-format off + 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, + 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + // clang-format on +}; + static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, const int8x16_t filter, const uint8x16x3_t permute_tbl, @@ -205,6 +212,64 @@ static INLINE void convolve_x_sr_8tap_neon_i8mm( } while (height != 0); } +static INLINE uint8x8_t convolve6_8_x(uint8x16_t samples, + const int8x16_t filter, + const uint8x16x2_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter); + int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter); + + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); + // We halved the convolution filter values so - 1 from the right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_x_sr_6tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x, + const int32x4_t horiz_const) { + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(filter_x), 1); + // Stagger the filter for use with the matrix multiply instructions. 
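+  // (The two 8-byte halves hold the same six taps offset by one sample, so a
+  // single USMMLA yields four adjacent output pixels: two per sample row.)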
+ // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t x_filter = + vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); + + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); + do { + const uint8_t *s = src; + uint8_t *d = dst; + int w = width; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, horiz_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, const int8x8_t filters, const uint8x16_t permute_tbl, @@ -317,12 +382,6 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); - if (filter_taps > 8) { - convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, - x_filter_ptr); - return; - } - // A shim of 1 << (ROUND0_BITS - 1) enables us to simplify computation in the // convolution kernels: Adding this shim enables us to use a single rounding // right shift by FILTER_BITS instead of two rounding right shifts: first by @@ -330,6 +389,18 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, // Halve the total because we will halve the filter values. const int32x4_t horiz_const = vdupq_n_s32((1 << ((ROUND0_BITS - 1)) / 2)); + if (filter_taps == 6) { + convolve_x_sr_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride, w, h, + x_filter_ptr, horiz_const); + return; + } + + if (filter_taps > 8) { + convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + if (filter_taps <= 4) { convolve_x_sr_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride, w, h, x_filter_ptr, horiz_const); -- GitLab From fd68c87daa85ac3f9e82537dead16b4158318311 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Wed, 31 Jul 2024 18:15:44 +0100 Subject: [PATCH 306/391] Add Arm Neon USMMLA implementation of 12-tap convolve_x_sr Split the 12-tap filter into two 6-tap filters to enable a convolution kernel implementation using the Armv8.6 USMMLA matrix multiply instructions. These 2x8 by 8x2 matrix multiply instructions do twice the work of a USDOT dot product instruction. Change-Id: Idc8648443fe5e27b78cacfbbd457b52754f1e53f --- av1/common/arm/convolve_neon_i8mm.c | 98 ++++++++++++++++------------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index d73891e666..464c190780 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -38,48 +38,44 @@ DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { // clang-format on }; -static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, - const int8x16_t filter, - const uint8x16x3_t permute_tbl, +static INLINE int16x4_t convolve12_4_x(uint8x16_t samples[2], + const int8x16_t filter[2], + const uint8x16_t permute_tbl, const int32x4_t horiz_const) { - // Permute samples ready for dot product. 
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), - vqtbl1q_u8(samples, permute_tbl.val[1]), - vqtbl1q_u8(samples, permute_tbl.val[2]) }; + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples[0], permute_tbl), + vqtbl1q_u8(samples[1], permute_tbl) }; - int32x4_t sum = vusdotq_laneq_s32(horiz_const, perm_samples[0], filter, 0); - sum = vusdotq_laneq_s32(sum, perm_samples[1], filter, 1); - sum = vusdotq_laneq_s32(sum, perm_samples[2], filter, 2); + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); + sum = vusmmlaq_s32(sum, perm_samples[1], filter[1]); return vqrshrn_n_s32(sum, FILTER_BITS); } static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], - const int8x16_t filter, - const uint8x16x3_t permute_tbl, + const int8x16_t filter[2], + const uint8x16x2_t permute_tbl, const int32x4_t horiz_const) { - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + // { 6, 7, 8, 9, 10, 11, 12, 13, 8, 9, 10, 11, 12, 13, 14, 15 } + // { 10, 11, 12, 13, 14, 15, 16, 17, 12, 13, 14, 15, 16, 17, 18, 19 } uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), vqtbl1q_u8(samples[0], permute_tbl.val[1]), - vqtbl1q_u8(samples[0], permute_tbl.val[2]), - vqtbl1q_u8(samples[1], permute_tbl.val[2]) }; - - int32x4_t sum0123 = - vusdotq_laneq_s32(horiz_const, perm_samples[0], filter, 0); - sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[1], filter, 1); - sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[2], filter, 2); + vqtbl1q_u8(samples[1], permute_tbl.val[0]), + vqtbl1q_u8(samples[1], permute_tbl.val[1]) }; - int32x4_t sum4567 = - vusdotq_laneq_s32(horiz_const, perm_samples[1], filter, 0); - sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[2], filter, 1); - sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[3], filter, 2); + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); + int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter[0]); + sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]); + sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]); // Narrow and re-pack. int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS), @@ -94,23 +90,33 @@ static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src, // The no-op filter should never be used here. 
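  // (A no-op filter would have its single tap of 128 at index 5, which the
  // assert below rejects.)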
  assert(x_filter_ptr[5] != 128);
-  const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
-  const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
-  const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
-  const int8x16_t filter =
-      vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
+  // Split 12-tap filter into two 6-tap filters, masking the top two elements.
+  // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }
+  const int8x8_t mask = vcreate_s8(0x0000ffffffffffff);
+  const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(x_filter_ptr)), mask);
+  const int8x8_t filter_1 =
+      vext_s8(vmovn_s16(vld1q_s16(x_filter_ptr + 4)), vdup_n_s8(0), 2);
-  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
-  // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
-  // right shift by FILTER_BITS - instead of a first rounding right shift by
-  // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
-  // ROUND0_BITS.
+  // Stagger each 6-tap filter to enable use of matrix multiply instructions.
+  // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 }
+  const int8x16_t filter[2] = {
+    vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)),
+    vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7))
+  };
+
+  // A shim of 1 << (ROUND0_BITS - 1) enables us to simplify computation in the
+  // convolution kernels: Adding this shim enables us to use a single rounding
+  // right shift by FILTER_BITS instead of two rounding right shifts: first by
+  // ROUND0_BITS, and then subsequently by FILTER_BITS - ROUND0_BITS.
   const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
 
   if (w <= 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl);
+
     do {
-      uint8x16_t s0, s1, s2, s3;
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+      uint8x16_t s0[2], s1[2], s2[2], s3[2];
+      load_u8_16x4(src, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+      load_u8_16x4(src + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
 
       int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const);
       int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const);
@@ -128,6 +134,8 @@ static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src,
       h -= 4;
     } while (h != 0);
   } else {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
+
     do {
       const uint8_t *s = src;
       uint8_t *d = dst;
@@ -136,14 +144,14 @@
       do {
         uint8x16_t s0[2], s1[2], s2[2], s3[2];
         load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
-        load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+        load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
 
         uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const);
         uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const);
         uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const);
         uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const);
 
-        store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s += 8;
         d += 8;
-- 
GitLab


From d8ef6563e0ac6c1e29b0013dec7b6ce6f302e54d Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Thu, 1 Aug 2024 13:08:46 -0700
Subject: [PATCH 307/391] Mark aom_gtest include dirs as system include dirs

On Linux, the effect of this change is that -isystem instead of -I is
used with the gtest include dir.
Before: -I/home/wtc/aom.3/aom/third_party/googletest/src/googletest/include After: -isystem /home/wtc/aom.3/aom/third_party/googletest/src/googletest/include Bug: aomedia:356830476 Change-Id: I17682c27e1135083f5ba56b46023f744e3fe00c0 --- test/test.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/test.cmake b/test/test.cmake index f697db1c9c..68caf0c22c 100644 --- a/test/test.cmake +++ b/test/test.cmake @@ -405,8 +405,11 @@ if(ENABLE_TESTS) aom_gtest STATIC "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc") set_property(TARGET aom_gtest PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER}) + # There are -Wundef warnings in the gtest headers. Tell the compiler to treat + # the gtest include directories as system include directories and suppress + # compiler warnings in the gtest headers. target_include_directories( - aom_gtest + aom_gtest SYSTEM PUBLIC "${AOM_ROOT}/third_party/googletest/src/googletest/include" PRIVATE "${AOM_ROOT}/third_party/googletest/src/googletest") -- GitLab From 41a954d9872c13b8d499429480f78c1aadd11294 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Fri, 2 Aug 2024 07:47:12 -0700 Subject: [PATCH 308/391] Fix -Wundef warnings in libyuv's row_common.cc See https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5758156. Bug: aomedia:356830476 Change-Id: I402bce296930bdb7fb6994cdf4270353e7aeb301 --- third_party/libyuv/README.libaom | 85 +++++++++++++++++++++++++ third_party/libyuv/source/row_common.cc | 18 +++--- 2 files changed, 94 insertions(+), 9 deletions(-) diff --git a/third_party/libyuv/README.libaom b/third_party/libyuv/README.libaom index 6e66f858e2..fcdf921c57 100644 --- a/third_party/libyuv/README.libaom +++ b/third_party/libyuv/README.libaom @@ -35,3 +35,88 @@ index fe89452b7..72a7fb82f 100644 +#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) #pragma optimize("g", on) #endif +diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc +index 79aed5c787..b0add63a6e 100644 +--- a/third_party/libyuv/source/row_common.cc ++++ b/third_party/libyuv/source/row_common.cc +@@ -37,7 +37,7 @@ extern "C" { + // llvm x86 is poor at ternary operator, so use branchless min/max. 
+ + #define USE_BRANCHLESS 1 +-#if USE_BRANCHLESS ++#if defined(USE_BRANCHLESS) + static __inline int32_t clamp0(int32_t v) { + return -(v >= 0) & v; + } +@@ -460,7 +460,7 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { + + // ARGBToY_C and ARGBToUV_C + // Intel version mimic SSE/AVX which does 2 pavgb +-#if LIBYUV_ARGBTOUV_PAVGB ++#if defined(LIBYUV_ARGBTOUV_PAVGB) + + #define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ +@@ -602,7 +602,7 @@ static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { + + // ARGBToYJ_C and ARGBToUVJ_C + // Intel version mimic SSE/AVX which does 2 pavgb +-#if LIBYUV_ARGBTOUV_PAVGB ++#if defined(LIBYUV_ARGBTOUV_PAVGB) + #define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ +@@ -766,7 +766,7 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, + g3 = (g3 << 2) | (g3 >> 4); + r3 = (r3 << 3) | (r3 >> 2); + +-#if LIBYUV_ARGBTOUV_PAVGB ++#if defined(LIBYUV_ARGBTOUV_PAVGB) + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); +@@ -800,7 +800,7 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, + g2 = (g2 << 2) | (g2 >> 4); + r2 = (r2 << 3) | (r2 >> 2); + +-#if LIBYUV_ARGBTOUV_PAVGB ++#if defined(LIBYUV_ARGBTOUV_PAVGB) + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); +@@ -850,7 +850,7 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + g3 = (g3 << 3) | (g3 >> 2); + r3 = (r3 << 3) | (r3 >> 2); + +-#if LIBYUV_ARGBTOUV_PAVGB ++#if defined(LIBYUV_ARGBTOUV_PAVGB) + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); +@@ -884,7 +884,7 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + g2 = (g2 << 3) | (g2 >> 2); + r2 = (r2 << 3) | (r2 >> 2); + +-#if LIBYUV_ARGBTOUV_PAVGB ++#if defined(LIBYUV_ARGBTOUV_PAVGB) + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); +@@ -934,7 +934,7 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + g3 = (g3 << 4) | g3; + r3 = (r3 << 4) | r3; + +-#if LIBYUV_ARGBTOUV_PAVGB ++#if defined(LIBYUV_ARGBTOUV_PAVGB) + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); +@@ -968,7 +968,7 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + g2 = (g2 << 4) | g2; + r2 = (r2 << 4) | r2; + +-#if LIBYUV_ARGBTOUV_PAVGB ++#if defined(LIBYUV_ARGBTOUV_PAVGB) + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc index 79aed5c787..b0add63a6e 100644 --- a/third_party/libyuv/source/row_common.cc +++ b/third_party/libyuv/source/row_common.cc @@ -37,7 +37,7 @@ extern "C" { // llvm x86 is poor at ternary operator, so use branchless min/max. 
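  // (clamp0 below relies on -(v >= 0) evaluating to an all-ones mask for
  // non-negative v, so the AND returns either v or 0 without a branch.)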
#define USE_BRANCHLESS 1 -#if USE_BRANCHLESS +#if defined(USE_BRANCHLESS) static __inline int32_t clamp0(int32_t v) { return -(v >= 0) & v; } @@ -460,7 +460,7 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { // ARGBToY_C and ARGBToUV_C // Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB +#if defined(LIBYUV_ARGBTOUV_PAVGB) #define MAKEROWY(NAME, R, G, B, BPP) \ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ @@ -602,7 +602,7 @@ static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { // ARGBToYJ_C and ARGBToUVJ_C // Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB +#if defined(LIBYUV_ARGBTOUV_PAVGB) #define MAKEROWYJ(NAME, R, G, B, BPP) \ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ int x; \ @@ -766,7 +766,7 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, g3 = (g3 << 2) | (g3 >> 4); r3 = (r3 << 3) | (r3 >> 2); -#if LIBYUV_ARGBTOUV_PAVGB +#if defined(LIBYUV_ARGBTOUV_PAVGB) uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); @@ -800,7 +800,7 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, g2 = (g2 << 2) | (g2 >> 4); r2 = (r2 << 3) | (r2 >> 2); -#if LIBYUV_ARGBTOUV_PAVGB +#if defined(LIBYUV_ARGBTOUV_PAVGB) uint8_t ab = AVGB(b0, b2); uint8_t ag = AVGB(g0, g2); uint8_t ar = AVGB(r0, r2); @@ -850,7 +850,7 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, g3 = (g3 << 3) | (g3 >> 2); r3 = (r3 << 3) | (r3 >> 2); -#if LIBYUV_ARGBTOUV_PAVGB +#if defined(LIBYUV_ARGBTOUV_PAVGB) uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); @@ -884,7 +884,7 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, g2 = (g2 << 3) | (g2 >> 2); r2 = (r2 << 3) | (r2 >> 2); -#if LIBYUV_ARGBTOUV_PAVGB +#if defined(LIBYUV_ARGBTOUV_PAVGB) uint8_t ab = AVGB(b0, b2); uint8_t ag = AVGB(g0, g2); uint8_t ar = AVGB(r0, r2); @@ -934,7 +934,7 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, g3 = (g3 << 4) | g3; r3 = (r3 << 4) | r3; -#if LIBYUV_ARGBTOUV_PAVGB +#if defined(LIBYUV_ARGBTOUV_PAVGB) uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); @@ -968,7 +968,7 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, g2 = (g2 << 4) | g2; r2 = (r2 << 4) | r2; -#if LIBYUV_ARGBTOUV_PAVGB +#if defined(LIBYUV_ARGBTOUV_PAVGB) uint8_t ab = AVGB(b0, b2); uint8_t ag = AVGB(g0, g2); uint8_t ar = AVGB(r0, r2); -- GitLab From fd9161960d60fc6ef7c69ce52f76941bb0ebc36a Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 1 Aug 2024 15:59:09 -0700 Subject: [PATCH 309/391] Compile C++ files with -Wundef Bug: aomedia:356830476 Change-Id: I044beb802362dde359b628ccd8891ca621c04f4b --- build/cmake/aom_configure.cmake | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake index 20859469c7..e204eda656 100644 --- a/build/cmake/aom_configure.cmake +++ b/build/cmake/aom_configure.cmake @@ -352,6 +352,7 @@ else() add_compiler_flag_if_supported("-Wsign-compare") add_compiler_flag_if_supported("-Wstring-conversion") add_compiler_flag_if_supported("-Wtype-limits") + add_compiler_flag_if_supported("-Wundef") add_compiler_flag_if_supported("-Wuninitialized") add_compiler_flag_if_supported("-Wunreachable-code-aggressive") 
add_compiler_flag_if_supported("-Wunused") @@ -379,9 +380,6 @@ else() add_compiler_flag_if_supported("-Wno-disabled-optimization") endif() - # Add -Wundef only for C files to avoid massive gtest warning spam. - add_c_flag_if_supported("-Wundef") - # Quiet gcc 6 vs 7 abi warnings: # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 if(AOM_TARGET_CPU MATCHES "arm") -- GitLab From a62c1833e0e0683069b1924f4e2efc11c94aad0f Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Fri, 2 Aug 2024 09:06:28 +0100 Subject: [PATCH 310/391] Add Arm Neon USMMLA impl. for horiz. 12-tap convolve_2d_sr Split the 12-tap filter into two 6-tap filters to enable a convolution kernel implementation using the Armv8.6 USMMLA matrix multiply instructions. These 2x8 by 8x2 matrix multiply instructions do twice the work of a USDOT dot product instruction. Change-Id: Ibdc612f2543b8c3ca93c1fd2c23cb84285e233e8 --- av1/common/arm/convolve_neon_i8mm.c | 12 +-- av1/common/arm/convolve_neon_i8mm.h | 136 +++++++++++++++------------- av1/common/arm/convolve_sve2.c | 5 +- 3 files changed, 76 insertions(+), 77 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index 464c190780..796d3f709d 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -31,13 +31,6 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { - // clang-format off - 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, - 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 - // clang-format on -}; - static INLINE int16x4_t convolve12_4_x(uint8x16_t samples[2], const int8x16_t filter[2], const uint8x16_t permute_tbl, @@ -1289,14 +1282,11 @@ void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); - const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block, - im_stride, w, im_h, x_filter_0_7, - x_filter_8_11); + im_stride, w, im_h, x_filter_ptr); convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_0_7, y_filter_8_11); diff --git a/av1/common/arm/convolve_neon_i8mm.h b/av1/common/arm/convolve_neon_i8mm.h index ddd8364ea5..fcbdd2bf5c 100644 --- a/av1/common/arm/convolve_neon_i8mm.h +++ b/av1/common/arm/convolve_neon_i8mm.h @@ -29,49 +29,52 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, - const int8x16_t filters, - const uint8x16x3_t permute_tbl, +DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { + // clang-format off + 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, + 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + // clang-format on +}; + +static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples[2], + const int8x16_t filter[2], + const uint8x16_t permute_tbl, int32x4_t horiz_const) { - // Permute samples ready for dot product. 
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), - vqtbl1q_u8(samples, permute_tbl.val[1]), - vqtbl1q_u8(samples, permute_tbl.val[2]) }; - - int32x4_t sum = vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0); - sum = vusdotq_laneq_s32(sum, perm_samples[1], filters, 1); - sum = vusdotq_laneq_s32(sum, perm_samples[2], filters, 2); + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples[0], permute_tbl), + vqtbl1q_u8(samples[1], permute_tbl) }; + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); + sum = vusmmlaq_s32(sum, perm_samples[1], filter[1]); // Narrow and re-pack. return vshrn_n_s32(sum, ROUND0_BITS); } static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], - const int8x16_t filters, - const uint8x16x3_t permute_tbl, + const int8x16_t filter[2], + const uint8x16x2_t permute_tbl, const int32x4_t horiz_const) { - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + /// Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + // { 6, 7, 8, 9, 10, 11, 12, 13, 8, 9, 10, 11, 12, 13, 14, 15 } + // { 10, 11, 12, 13, 14, 15, 16, 17, 12, 13, 14, 15, 16, 17, 18, 19 } uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), vqtbl1q_u8(samples[0], permute_tbl.val[1]), - vqtbl1q_u8(samples[0], permute_tbl.val[2]), - vqtbl1q_u8(samples[1], permute_tbl.val[2]) }; - - int32x4_t sum0123 = - vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0); - sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[1], filters, 1); - sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[2], filters, 2); + vqtbl1q_u8(samples[1], permute_tbl.val[0]), + vqtbl1q_u8(samples[1], permute_tbl.val[1]) }; - int32x4_t sum4567 = - vusdotq_laneq_s32(horiz_const, perm_samples[1], filters, 0); - sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[2], filters, 1); - sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[3], filters, 2); + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); + int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter[0]); + sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]); + sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]); // Narrow and re-pack. 
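  // (vshrn is a non-rounding shift; horiz_const already carries the
  // 1 << (ROUND0_BITS - 1) rounding bias for this first-stage shift.)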
return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS), @@ -80,34 +83,44 @@ static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, - const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, - const int16x4_t x_filter_8_11) { + const int dst_stride, int w, int h, const int16_t *x_filter_ptr) { // The no-op filter should never be used here. - assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); + assert(x_filter_ptr[5] != 128); const int bd = 8; - // Narrow filter values to 8-bit. - const int16x8x2_t x_filter_s16 = { - { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + // Split 12-tap filter into two 6-tap filters, masking the top two elements. + // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 } + const int8x8_t mask = vcreate_s8(0x0000ffffffffffff); + const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(x_filter_ptr)), mask); + const int8x8_t filter_1 = + vext_s8(vmovn_s16(vld1q_s16(x_filter_ptr + 4)), vdup_n_s8(0), 2); + + // Stagger each 6-tap filter to enable use of matrix multiply instructions. + // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t filter[2] = { + vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)), + vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7)) }; - const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), - vmovn_s16(x_filter_s16.val[1])); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. + // in convolution kernels - which are generally faster than rounding shifts on + // modern CPUs. const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); if (w <= 4) { + const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); + do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(src_ptr, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(src_ptr + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + int16x4_t d0 = convolve12_4_2d_h(s0, filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_2d_h(s1, filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_2d_h(s2, filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_2d_h(s3, filter, permute_tbl, horiz_const); store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); @@ -117,8 +130,10 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( } while (h > 4); do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + uint8x16_t s0[2]; + s0[0] = vld1q_u8(src_ptr); + s0[1] = vld1q_u8(src_ptr + 6); + int16x4_t d0 = convolve12_4_2d_h(s0, filter, permute_tbl, horiz_const); vst1_s16(dst_ptr, d0); src_ptr += src_stride; @@ -126,6 +141,8 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( } while (--h != 0); } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); + do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; @@ -134,16 
+151,12 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( do { uint8x16_t s0[2], s1[2], s2[2], s3[2]; load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x8_t d1 = - convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x8_t d2 = - convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x8_t d3 = - convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + int16x8_t d0 = convolve12_8_2d_h(s0, filter, permute_tbl, horiz_const); + int16x8_t d1 = convolve12_8_2d_h(s1, filter, permute_tbl, horiz_const); + int16x8_t d2 = convolve12_8_2d_h(s2, filter, permute_tbl, horiz_const); + int16x8_t d3 = convolve12_8_2d_h(s3, filter, permute_tbl, horiz_const); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -165,9 +178,8 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( do { uint8x16_t s0[2]; s0[0] = vld1q_u8(s); - s0[1] = vld1q_u8(s + 4); - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + s0[1] = vld1q_u8(s + 6); + int16x8_t d0 = convolve12_8_2d_h(s0, filter, permute_tbl, horiz_const); vst1q_s16(d, d0); s += 8; diff --git a/av1/common/arm/convolve_sve2.c b/av1/common/arm/convolve_sve2.c index 832a3b4e5e..8875f86581 100644 --- a/av1/common/arm/convolve_sve2.c +++ b/av1/common/arm/convolve_sve2.c @@ -184,14 +184,11 @@ void av1_convolve_2d_sr_sve2(const uint8_t *src, int src_stride, uint8_t *dst, DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); - const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block, - im_stride, w, im_h, x_filter_0_7, - x_filter_8_11); + im_stride, w, im_h, x_filter_ptr); convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, w, h, y_filter_0_7, y_filter_4_11); -- GitLab From 90defa923d158539c837810db0f1fa39e2838fd2 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 2 Aug 2024 14:13:41 +0100 Subject: [PATCH 311/391] Fix accumulator value for Neon implementation of compute_stats Fix the value of the maximum number of pixels that can be computed in a signed 32-bit integer. That value assumed accumulating in an unsigned 32-bit integer, which is wrong and could trigger an integer overflow for 10-bit and 12-bit. This was not detected by the unit tests, but they will be amended in a subsequent commit. Change-Id: I0214792293ad03e3aa0d331c8320f416e5df8d81 --- av1/encoder/arm/highbd_pickrst_neon.c | 16 ++++++++-------- av1/encoder/arm/pickrst_neon.c | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/av1/encoder/arm/highbd_pickrst_neon.c b/av1/encoder/arm/highbd_pickrst_neon.c index 60beca2dc0..cfc5e0c7e8 100644 --- a/av1/encoder/arm/highbd_pickrst_neon.c +++ b/av1/encoder/arm/highbd_pickrst_neon.c @@ -454,10 +454,10 @@ static INLINE void highbd_compute_stats_win7_neon( const uint8x16_t lut10 = vld1q_u8(shuffle_stats7_highbd + 160); const uint8x16_t lut11 = vld1q_u8(shuffle_stats7_highbd + 176); - // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results - // in 32-bit. 
We are processing 2 pixels at a time, so the accumulator max can - // be as high as 32768/2048/128 for the compute stats. - const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1; + // We can accumulate up to 32768/2048/128 8/10/12-bit multiplication results + // in a signed 32-bit integer. We are processing 2 pixels at a time, so the + // accumulator max can be as high as 16384/1024/64 for the compute stats. + const int acc_cnt_max = (1 << (31 - 2 * bit_depth)) >> 1; int acc_cnt = acc_cnt_max; const int src_next = src_stride - width; const int dgd_next = dgd_stride - width; @@ -736,10 +736,10 @@ static void highbd_compute_stats_win5_neon(const uint16_t *dgd, const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64); const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80); - // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results - // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can - // be as high as 32768/2048/128 for the compute stats. - const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1; + // We can accumulate up to 32768/2048/128 8/10/12-bit multiplication results + // in a signed 32-bit integer. We are processing 2 pixels at a time, so the + // accumulator max can be as high as 16384/1024/64 for the compute stats. + const int acc_cnt_max = (1 << (31 - 2 * bit_depth)) >> 1; int acc_cnt = acc_cnt_max; const int src_next = src_stride - width; const int dgd_next = dgd_stride - width; diff --git a/av1/encoder/arm/pickrst_neon.c b/av1/encoder/arm/pickrst_neon.c index 015378ac98..b6fba99d8d 100644 --- a/av1/encoder/arm/pickrst_neon.c +++ b/av1/encoder/arm/pickrst_neon.c @@ -176,10 +176,10 @@ int64_t av1_lowbd_pixel_proj_error_neon( return sse; } -// We can accumulate up to 65536 8-bit multiplication results in 32-bit. We are -// processing 2 pixels at a time, so the accumulator max can be as high as 32768 -// for the compute stats. -#define STAT_ACCUMULATOR_MAX 32768 +// We can accumulate up to 32768 8-bit multiplication results in a signed +// 32-bit integer. We are processing 2 pixels at a time, so the accumulator max +// can be as high as 16384 for the compute stats. +#define STAT_ACCUMULATOR_MAX 16384 static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { #if AOM_ARCH_AARCH64 -- GitLab From aae62553873fd3ff8c77f8e1dc7a698435a439b6 Mon Sep 17 00:00:00 2001 From: Salome Thirot <salome.thirot@arm.com> Date: Fri, 2 Aug 2024 14:17:23 +0100 Subject: [PATCH 312/391] Add overflow check in HBD compute_stats unit tests The current test cases don't make values for M and H go beyond the unsigned 32-bit limit, which led to an overflow issue in the Neon implementation going undetected for 10-bit and 12-bit. Add a test case in the high bitdepth unit tests with the input buffers filled such that the values of M and H will at some point overflow if kept in an unsigned 32-bit variable. 
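At 12-bit, for example, each product of two maximal samples is
4095 * 4095, just under 2^24, so only a few hundred such terms are
needed to exceed the signed (2^31) and unsigned (2^32) 32-bit limits.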
Change-Id: I99b6350e452898e7f6a42ffc2b8b3fa9ea70c2db --- test/wiener_test.cc | 73 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/test/wiener_test.cc b/test/wiener_test.cc index 2cac268d91..d3995b901f 100644 --- a/test/wiener_test.cc +++ b/test/wiener_test.cc @@ -572,6 +572,9 @@ class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> { void RunWienerTest_ExtremeValues(const int32_t wiener_win, aom_bit_depth_t bit_depth); + void RunWienerTest_Overflow32bTest(const int32_t wiener_win, + aom_bit_depth_t bit_depth); + private: compute_stats_Func target_func_; libaom_test::ACMRandom rng_; @@ -722,6 +725,68 @@ void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win, } } +void WienerTestHighbd::RunWienerTest_Overflow32bTest( + const int32_t wiener_win, aom_bit_depth_t bit_depth) { + const int32_t wiener_halfwin = wiener_win >> 1; + const int32_t wiener_win2 = wiener_win * wiener_win; + DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]); + DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]); + DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]); + DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]); + const int h_start = 16; + const int h_end = MAX_WIENER_BLOCK; + const int v_start = 16; + const int v_end = MAX_WIENER_BLOCK; + const int dgd_stride = h_end; + const int src_stride = MAX_DATA_BLOCK; + const int iters = 1; + int16_t *dgd_avg = buf; + int16_t *src_avg = + buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX); + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + // Fill src and dgd such that the intermediate values for M and H will at + // some point overflow a signed 32-bit value. + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_buf[i] = ((uint16_t)1 << bit_depth) - 1; + src_buf[i] = 0; + } + + memset(dgd_buf, 0, MAX_DATA_BLOCK * 30 * sizeof(dgd_buf)); + const uint8_t *dgd8 = CONVERT_TO_BYTEPTR( + dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin); + const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf); + + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M_ref, H_ref, bit_depth); + + target_func_(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M_test, H_test, + bit_depth); + + int failed = 0; + for (int i = 0; i < wiener_win2; ++i) { + if (M_ref[i] != M_test[i]) { + failed = 1; + printf("win %d bd %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 + " \n", + wiener_win, bit_depth, iter, i, M_ref[i], M_test[i]); + break; + } + } + for (int i = 0; i < wiener_win2 * wiener_win2; ++i) { + if (H_ref[i] != H_test[i]) { + failed = 1; + printf("win %d bd %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 + " \n", + wiener_win, bit_depth, iter, i, H_ref[i], H_test[i]); + break; + } + } + ASSERT_EQ(failed, 0); + } +} + TEST_P(WienerTestHighbd, RandomValues) { RunWienerTest(WIENER_WIN, 1, AOM_BITS_8); RunWienerTest(WIENER_WIN_CHROMA, 1, AOM_BITS_8); @@ -740,6 +805,14 @@ TEST_P(WienerTestHighbd, ExtremeValues) { RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA, AOM_BITS_12); } +TEST_P(WienerTestHighbd, Overflow32bTest) { + RunWienerTest_Overflow32bTest(WIENER_WIN, AOM_BITS_8); + RunWienerTest_Overflow32bTest(WIENER_WIN_CHROMA, AOM_BITS_8); + RunWienerTest_Overflow32bTest(WIENER_WIN, AOM_BITS_10); + RunWienerTest_Overflow32bTest(WIENER_WIN_CHROMA, AOM_BITS_10); + RunWienerTest_Overflow32bTest(WIENER_WIN, AOM_BITS_12); + 
RunWienerTest_Overflow32bTest(WIENER_WIN_CHROMA, AOM_BITS_12); +} TEST_P(WienerTestHighbd, DISABLED_Speed) { RunWienerTest(WIENER_WIN, 200, AOM_BITS_8); RunWienerTest(WIENER_WIN_CHROMA, 200, AOM_BITS_8); -- GitLab From acc1e5929ddb8ba070116373bd992ede1d79cb97 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 1 Aug 2024 14:01:19 -0700 Subject: [PATCH 313/391] update AV1E_SET_POSTENCODE_DROP_RTC comment Add a link to rc_dropframe_thresh and break postencode into two words. Change-Id: Ie208329342664ae442bfa9b72ca56f2790292074 --- aom/aomcx.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/aom/aomcx.h b/aom/aomcx.h index 9466757ee1..3f3eb643aa 100644 --- a/aom/aomcx.h +++ b/aom/aomcx.h @@ -1554,10 +1554,12 @@ enum aome_enc_control_id { */ AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC = 167, - /*!\brief Codec control to enable postencode frame drop for RTC encoding, - * int parameter. Value of 1 means encoder will enable postencode - * drop, Default is 0 (not enabled). Postencode drop is only allowed - * when frame dropping is enabled (rc_dropframe_thresh > 0). + /*!\brief Codec control to enable post encode frame drop for RTC encoding, + * int parameter. + * + * Value of 1 means encoder will enable post encode drop. Default is 0 (not + * enabled). Post encode drop is only allowed when frame dropping is enabled + * (aom_codec_enc_cfg::rc_dropframe_thresh > 0). */ AV1E_SET_POSTENCODE_DROP_RTC = 168, -- GitLab From 88e4df06ca24c73b79c84a15fc230074062a5584 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Fri, 2 Aug 2024 12:13:34 +0100 Subject: [PATCH 314/391] Use Arm Neon USMMLA for horiz. 6-tap path for convolve_2d_sr By permuting the input samples and the 6-tap filter we can use the Armv8.6 I8MM USMMLA matrix multiply instructions to accelerate horizontal 6-tap convolutions. The 2x8 by 8x2 matrix multiply instruction does twice the work of a USDOT dot product instruction. Change-Id: I0f6b3969925a4d7190e277c3cd0221f7f4c98018 --- av1/common/arm/convolve_neon_i8mm.c | 58 +++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index 796d3f709d..c0957aa29e 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -1054,6 +1054,27 @@ static INLINE void convolve_2d_sr_horiz_4tap_neon_i8mm( } } +static INLINE int16x8_t convolve6_8_2d_h(uint8x16_t samples, + const int8x16_t filter, + const uint8x16x2_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter); + int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. 
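+  // (Halving every tap halves the sums, so shifting right by one bit fewer
+  // restores the intended scale.)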
+ return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + static INLINE void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, @@ -1061,16 +1082,21 @@ static INLINE void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src, const int16_t *y_filter_ptr) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + // Stagger the filter for use with the matrix multiply instructions. + // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t x_filter = + vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); const int bd = 8; // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - // The outermost -1 is needed because we halved the filter values. + // shifts in convolution kernels - which are generally faster than rounding + // shifts on modern CPUs. The outermost -1 is needed because we halved the + // filter values. const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src; @@ -1081,24 +1107,24 @@ static INLINE void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src, load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4); s += 5 * src_stride; - int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); - int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); - int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); - int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); - int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); + int16x8_t v_s0 = convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); + int16x8_t v_s1 = convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); + int16x8_t v_s2 = convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); + int16x8_t v_s3 = convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); + int16x8_t v_s4 = convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); do { uint8x16_t h_s5, h_s6, h_s7, h_s8; load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8); int16x8_t v_s5 = - convolve8_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); int16x8_t v_s6 = - convolve8_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); int16x8_t v_s7 = - convolve8_8_2d_h(h_s7, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s7, x_filter, permute_tbl, horiz_const); int16x8_t v_s8 = - convolve8_8_2d_h(h_s8, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s8, x_filter, permute_tbl, horiz_const); uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); @@ -1294,9 +1320,9 @@ void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - if (y_filter_taps == 6 && x_filter_taps >= 6) 
{ - convolve_2d_sr_6tap_neon_i8mm(src_ptr, src_stride, dst, dst_stride, w, h, - x_filter_ptr, y_filter_ptr); + if (x_filter_taps == 6 && y_filter_taps == 6) { + convolve_2d_sr_6tap_neon_i8mm(src_ptr + 1, src_stride, dst, dst_stride, w, + h, x_filter_ptr, y_filter_ptr); return; } -- GitLab From a1e3c8c721c7523f12b57307d9b5fed4b35ae221 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Mon, 5 Aug 2024 09:24:24 +0100 Subject: [PATCH 315/391] Refactor 4-tap convolve_2d_sr Neon I8MM path A 6-tap USMMLA (horizontal) convolution kernel requires the same number of instructions as a 4-tap USDOT convolution kernel; therefore we can use the USMMLA 6-tap path for both 6- and 4-tap cases. This patch uses the above information to expand the utility of the Neon I8MM 4-tap combined 2D convolution to support up to 6-tap horizontal filters. This is useful because 6-tap horiz, 4-tap vert filter combinations are the third most common type for convolve_2d_sr in --rt encodings after 6, 6 and 4, 4. Change-Id: Iefaf3a4f759bbcfd61ff60c829225d4aff93556c --- av1/common/arm/convolve_neon_i8mm.c | 81 +++++++++++++++++------------ 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index c0957aa29e..2d31054f89 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -1054,6 +1054,22 @@ static INLINE void convolve_2d_sr_horiz_4tap_neon_i8mm( } } +static INLINE int16x4_t convolve6_4_2d_h(uint8x16_t samples, + const int8x16_t filter, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter); + + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + static INLINE int16x8_t convolve6_8_2d_h(uint8x16_t samples, const int8x16_t filter, const uint8x16x2_t permute_tbl, @@ -1153,36 +1169,33 @@ static INLINE void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src, } while (w != 0); } -static INLINE void convolve_2d_sr_4tap_neon_i8mm(const uint8_t *src, - int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const int16_t *x_filter_ptr, - const int16_t *y_filter_ptr) { - const int bd = 8; - const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); - +static INLINE void convolve_2d_sr_6tap_4tap_neon_i8mm( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); - const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2); - // All 4-tap and bilinear filter values are even, so halve them to reduce - // intermediate precision requirements. - const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + // Stagger the filter for use with the matrix multiply instructions. 
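+  // (A 4-tap filter occupies the middle of the 8-tap array with zero outer
+  // taps, so the same staggered layout serves the 4-tap case too.)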
+ // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t x_filter = + vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); + const int bd = 8; // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // Halve the total because we halved the filter values. const int32x4_t horiz_const = vdupq_n_s32( ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2); + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); if (w == 4) { - const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); - + const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); uint8x16_t h_s0, h_s1, h_s2; load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); - int16x4_t v_s0 = convolve4_4_2d_h(h_s0, x_filter, permute_tbl, horiz_const); - int16x4_t v_s1 = convolve4_4_2d_h(h_s1, x_filter, permute_tbl, horiz_const); - int16x4_t v_s2 = convolve4_4_2d_h(h_s2, x_filter, permute_tbl, horiz_const); + int16x4_t v_s0 = convolve6_4_2d_h(h_s0, x_filter, permute_tbl, horiz_const); + int16x4_t v_s1 = convolve6_4_2d_h(h_s1, x_filter, permute_tbl, horiz_const); + int16x4_t v_s2 = convolve6_4_2d_h(h_s2, x_filter, permute_tbl, horiz_const); src += 3 * src_stride; @@ -1191,13 +1204,13 @@ static INLINE void convolve_2d_sr_4tap_neon_i8mm(const uint8_t *src, load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); int16x4_t v_s3 = - convolve4_4_2d_h(h_s3, x_filter, permute_tbl, horiz_const); + convolve6_4_2d_h(h_s3, x_filter, permute_tbl, horiz_const); int16x4_t v_s4 = - convolve4_4_2d_h(h_s4, x_filter, permute_tbl, horiz_const); + convolve6_4_2d_h(h_s4, x_filter, permute_tbl, horiz_const); int16x4_t v_s5 = - convolve4_4_2d_h(h_s5, x_filter, permute_tbl, horiz_const); + convolve6_4_2d_h(h_s5, x_filter, permute_tbl, horiz_const); int16x4_t v_s6 = - convolve4_4_2d_h(h_s6, x_filter, permute_tbl, horiz_const); + convolve6_4_2d_h(h_s6, x_filter, permute_tbl, horiz_const); int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter); int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter); @@ -1219,7 +1232,7 @@ static INLINE void convolve_2d_sr_4tap_neon_i8mm(const uint8_t *src, h -= 4; } while (h != 0); } else { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { int height = h; @@ -1230,11 +1243,11 @@ static INLINE void convolve_2d_sr_4tap_neon_i8mm(const uint8_t *src, load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); int16x8_t v_s0 = - convolve4_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); int16x8_t v_s1 = - convolve4_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); int16x8_t v_s2 = - convolve4_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); s += 3 * src_stride; @@ -1243,13 +1256,13 @@ static INLINE void convolve_2d_sr_4tap_neon_i8mm(const uint8_t *src, load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); int16x8_t v_s3 = - convolve4_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); int16x8_t v_s4 = - convolve4_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); int16x8_t v_s5 = - convolve4_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s5, x_filter, permute_tbl, 
horiz_const); int16x8_t v_s6 = - convolve4_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); + convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); uint8x8_t d0 = convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const); @@ -1326,9 +1339,11 @@ void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, return; } - if (y_filter_taps <= 4 && x_filter_taps <= 4) { - convolve_2d_sr_4tap_neon_i8mm(src_ptr + 2, src_stride, dst, dst_stride, w, - h, x_filter_ptr, y_filter_ptr); + // Used for both 6, 4 and 4, 4 horiz, vert filter tap combinations. + if (x_filter_taps <= 6 && y_filter_taps <= 4) { + convolve_2d_sr_6tap_4tap_neon_i8mm(src_ptr + 1, src_stride, dst, + dst_stride, w, h, x_filter_ptr, + y_filter_ptr); return; } -- GitLab From e448dcd13a9f534c7291903e9ab18dfc4fea54ad Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 6 Aug 2024 10:38:00 -0700 Subject: [PATCH 316/391] update libwebm to libwebm-1.0.0.31-10-g3b63004 This fixes a build error seen in gcc 15: 3b63004 mkvparser/mkvparser.cc: add missing <cstdint> include Bug: aomedia:357622679 Change-Id: I6c4a1795d189f9993d4f2c5c9f0375912bc58f0c --- third_party/libwebm/README.libaom | 2 +- third_party/libwebm/mkvmuxer/mkvmuxerutil.h | 4 ++-- third_party/libwebm/mkvparser/mkvparser.cc | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/third_party/libwebm/README.libaom b/third_party/libwebm/README.libaom index a038418acd..6e43487540 100644 --- a/third_party/libwebm/README.libaom +++ b/third_party/libwebm/README.libaom @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: f4b07ec144e61d9089144e8d54b4ecda0219c562 +Version: 3b630045052e1e4d563207ab9e3be8d137c26067 License: BSD License File: LICENSE.TXT diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/third_party/libwebm/mkvmuxer/mkvmuxerutil.h index 3355428bd1..85fc2a209e 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.h +++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.h @@ -8,9 +8,9 @@ #ifndef MKVMUXER_MKVMUXERUTIL_H_ #define MKVMUXER_MKVMUXERUTIL_H_ -#include "mkvmuxertypes.h" +#include <stdint.h> -#include "stdint.h" +#include "mkvmuxertypes.h" namespace mkvmuxer { class Cluster; diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc index eddbc7eb50..042a0c56cd 100644 --- a/third_party/libwebm/mkvparser/mkvparser.cc +++ b/third_party/libwebm/mkvparser/mkvparser.cc @@ -16,6 +16,7 @@ #include <cfloat> #include <climits> #include <cmath> +#include <cstdint> #include <cstring> #include <memory> #include <new> -- GitLab From f75b7c763faaae6f681cff07c22e4e619d78a7a2 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 6 Aug 2024 13:17:07 -0700 Subject: [PATCH 317/391] add missing CONFIG_AV1_HIGHBITDEPTH checks This fixes some -Wmissing-prototypes warnings. 
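For illustration, a minimal sketch of the guard pattern applied here;
the function name is hypothetical and not one of the files touched:

    #include <stdint.h>
    #include "config/aom_config.h"

    #if CONFIG_AV1_HIGHBITDEPTH
    // The RTCD headers only declare high-bitdepth kernels when
    // CONFIG_AV1_HIGHBITDEPTH is enabled, so guarding each definition the
    // same way keeps it paired with a visible prototype.
    void aom_highbd_example_sse2(uint16_t *dst, int stride) {
      (void)dst;
      (void)stride;
    }
    #endif  // CONFIG_AV1_HIGHBITDEPTH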
Bug: aomedia:42302428 Change-Id: I5d80f30f984dfb300b2302e9883d2f1dc4f1d190 --- aom_dsp/x86/aom_convolve_copy_sse2.c | 2 ++ aom_dsp/x86/intrapred_avx2.c | 4 ++++ aom_dsp/x86/masked_sad_intrin_avx2.c | 2 ++ aom_dsp/x86/masked_sad_intrin_ssse3.c | 2 ++ aom_dsp/x86/obmc_sad_avx2.c | 2 ++ aom_dsp/x86/obmc_sad_sse4.c | 2 ++ aom_dsp/x86/variance_avx2.c | 4 ++++ aom_dsp/x86/variance_sse2.c | 2 ++ av1/common/x86/av1_convolve_horiz_rs_sse4.c | 2 ++ av1/common/x86/av1_convolve_scale_sse4.c | 2 ++ av1/encoder/x86/pickrst_avx2.c | 2 +- av1/encoder/x86/pickrst_sse4.c | 2 +- 12 files changed, 26 insertions(+), 2 deletions(-) diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.c b/aom_dsp/x86/aom_convolve_copy_sse2.c index 674a37fa49..c4121705c2 100644 --- a/aom_dsp/x86/aom_convolve_copy_sse2.c +++ b/aom_dsp/x86/aom_convolve_copy_sse2.c @@ -143,6 +143,7 @@ void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, } } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) { __m128i s[8]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); @@ -311,3 +312,4 @@ void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, } while (h); } } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c index 4a7b862f32..d9d0a1c377 100644 --- a/aom_dsp/x86/intrapred_avx2.c +++ b/aom_dsp/x86/intrapred_avx2.c @@ -66,6 +66,7 @@ static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, } } +#if CONFIG_AV1_HIGHBITDEPTH static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, @@ -314,6 +315,7 @@ static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { _mm256_extracti128_si256(dd[i], 1), 0); } } +#endif // CONFIG_AV1_HIGHBITDEPTH void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -1040,6 +1042,7 @@ void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, } } +#if CONFIG_AV1_HIGHBITDEPTH #define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6) #define PERM2x128(c0, c1) c0 + (c1 << 4) @@ -3424,6 +3427,7 @@ void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, } return; } +#endif // CONFIG_AV1_HIGHBITDEPTH // Low bit depth functions static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { diff --git a/aom_dsp/x86/masked_sad_intrin_avx2.c b/aom_dsp/x86/masked_sad_intrin_avx2.c index d157d7d625..9bc79d8022 100644 --- a/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -196,6 +196,7 @@ MASKSADMXN_AVX2(16, 64) MASKSADMXN_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY +#if CONFIG_AV1_HIGHBITDEPTH static INLINE unsigned int highbd_masked_sad8xh_avx2( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, @@ -387,3 +388,4 @@ HIGHBD_MASKSADMXN_AVX2(32, 8) HIGHBD_MASKSADMXN_AVX2(16, 64) HIGHBD_MASKSADMXN_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c index ee56d33272..9fa5b58d10 100644 --- a/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -228,6 +228,7 @@ unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, return (unsigned int)_mm_cvtsi128_si32(res); } +#if 
CONFIG_AV1_HIGHBITDEPTH // For width a multiple of 8 static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, @@ -404,3 +405,4 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, int sad = _mm_cvtsi128_si32(res); return sad; } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/aom_dsp/x86/obmc_sad_avx2.c b/aom_dsp/x86/obmc_sad_avx2.c index 471afd28f4..fcedecf5e2 100644 --- a/aom_dsp/x86/obmc_sad_avx2.c +++ b/aom_dsp/x86/obmc_sad_avx2.c @@ -148,6 +148,7 @@ OBMCSADWXH(64, 16) // High bit-depth //////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, @@ -269,3 +270,4 @@ HBD_OBMCSADWXH(8, 32) HBD_OBMCSADWXH(32, 8) HBD_OBMCSADWXH(16, 64) HBD_OBMCSADWXH(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/aom_dsp/x86/obmc_sad_sse4.c b/aom_dsp/x86/obmc_sad_sse4.c index ba0f1a0b73..bacc8ec72e 100644 --- a/aom_dsp/x86/obmc_sad_sse4.c +++ b/aom_dsp/x86/obmc_sad_sse4.c @@ -148,6 +148,7 @@ OBMCSADWXH(64, 16) // High bit-depth //////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, @@ -267,3 +268,4 @@ HBD_OBMCSADWXH(8, 32) HBD_OBMCSADWXH(32, 8) HBD_OBMCSADWXH(16, 64) HBD_OBMCSADWXH(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c index 06f11f3c9b..b470135d99 100644 --- a/aom_dsp/x86/variance_avx2.c +++ b/aom_dsp/x86/variance_avx2.c @@ -240,11 +240,13 @@ static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { const __m256i d = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a, @@ -408,6 +410,7 @@ void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, } } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a) { @@ -517,6 +520,7 @@ void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, } while (i < height); } } +#endif // CONFIG_AV1_HIGHBITDEPTH static uint64_t mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, int sstride, int h) { diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c index 25143a23d9..25f240abca 100644 --- a/aom_dsp/x86/variance_sse2.c +++ b/aom_dsp/x86/variance_sse2.c @@ -403,6 +403,7 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, return *sse; } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, const __m128i s1, const __m128i a) { @@ -515,6 +516,7 @@ void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, } while (i < height); } } +#endif // CONFIG_AV1_HIGHBITDEPTH static uint64_t mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, int sstride, int h) { diff --git a/av1/common/x86/av1_convolve_horiz_rs_sse4.c 
b/av1/common/x86/av1_convolve_horiz_rs_sse4.c index 4d98a4fb23..f7b0c2281b 100644 --- a/av1/common/x86/av1_convolve_horiz_rs_sse4.c +++ b/av1/common/x86/av1_convolve_horiz_rs_sse4.c @@ -126,6 +126,7 @@ void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, } } +#if CONFIG_AV1_HIGHBITDEPTH // Note: If the crop width is not a multiple of 4, then, unlike the C version, // this function will overwrite some of the padding on the right hand side of // the frame. This padding appears to be trashed anyway, so this should not @@ -226,3 +227,4 @@ void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, } } } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c index 1f33ccaea8..48627a2f69 100644 --- a/av1/common/x86/av1_convolve_scale_sse4.c +++ b/av1/common/x86/av1_convolve_scale_sse4.c @@ -255,6 +255,7 @@ void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, filter_params_y, conv_params, 8); } +#if CONFIG_AV1_HIGHBITDEPTH // A specialised version of hfilter, the horizontal filter for // av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap // filters. @@ -496,3 +497,4 @@ void av1_highbd_convolve_2d_scale_sse4_1( highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, filter_params_y, conv_params, bd); } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c index 1aefc2bac6..1c5439fac0 100644 --- a/av1/encoder/x86/pickrst_avx2.c +++ b/av1/encoder/x86/pickrst_avx2.c @@ -1908,6 +1908,7 @@ void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height, } } +#if CONFIG_AV1_HIGHBITDEPTH static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, @@ -2131,7 +2132,6 @@ void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width, } } -#if CONFIG_AV1_HIGHBITDEPTH int64_t av1_highbd_pixel_proj_error_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c index 2ec8d12ced..f52d803358 100644 --- a/av1/encoder/x86/pickrst_sse4.c +++ b/av1/encoder/x86/pickrst_sse4.c @@ -1070,6 +1070,7 @@ void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height, } } +#if CONFIG_AV1_HIGHBITDEPTH static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, @@ -1281,7 +1282,6 @@ void av1_calc_proj_params_high_bd_sse4_1(const uint8_t *src8, int width, } } -#if CONFIG_AV1_HIGHBITDEPTH int64_t av1_highbd_pixel_proj_error_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, -- GitLab From be1085c7a278886df5f7384c9b0fadfd5c7078e1 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 6 Aug 2024 13:18:28 -0700 Subject: [PATCH 318/391] intrapred_avx2.c: delete 2 unused defines PERM4x64 / PERM2x128 Change-Id: I058cae6e7756e52d1dfcb7f90364a346b185d73a --- aom_dsp/x86/intrapred_avx2.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c index d9d0a1c377..e045eab616 100644 --- a/aom_dsp/x86/intrapred_avx2.c +++ 
b/aom_dsp/x86/intrapred_avx2.c @@ -1043,8 +1043,6 @@ void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, } #if CONFIG_AV1_HIGHBITDEPTH -#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6) -#define PERM2x128(c0, c1) c0 + (c1 << 4) static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2( int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { -- GitLab From fd08394ed485257a0d75737545e986b6092d90e1 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 6 Aug 2024 13:17:28 -0700 Subject: [PATCH 319/391] cmake: exclude highbd files w/CONFIG_AV1_HIGHBITDEPTH=0 This fixes some -Wmissing-prototypes warnings. Bug: aomedia:42302428 Change-Id: I10c63f9d16ed6fcffab49e6d32330f6dc2149a55 --- aom_dsp/aom_dsp.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index ef50d4d38f..907f874c56 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -57,7 +57,6 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c" "${AOM_ROOT}/aom_dsp/x86/convolve.h" "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h" "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c" @@ -122,6 +121,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM if(CONFIG_AV1_HIGHBITDEPTH) list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 @@ -242,13 +242,10 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c" "${AOM_ROOT}/aom_dsp/x86/quantize_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c" @@ -319,6 +316,9 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 -- GitLab From 17fcb3ffd142a88dfd9656ee20bc6ec798ed7b52 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 6 Aug 2024 14:33:05 -0700 Subject: [PATCH 320/391] Fix aom_codec_enc_cfg_t field names in comments Change-Id: I5f8e0e2b9b9ae0169f60cd3e6afc29301d55b489 --- av1/av1_cx_iface.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 069091179e..92cd7f40da 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -4676,13 +4676,13 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 25, // rc_undershoot_pct 25, // rc_overshoot_pct - 6000, // rc_max_buffer_size - 4000, // rc_buffer_initial_size - 5000, // rc_buffer_optimal_size + 6000, // rc_buf_sz + 4000, // rc_buf_initial_sz + 5000, // 
rc_buf_optimal_sz - 50, // rc_two_pass_vbrbias - 0, // rc_two_pass_vbrmin_section - 2000, // rc_two_pass_vbrmax_section + 50, // rc_2pass_vbr_bias_pct + 0, // rc_2pass_vbr_minsection_pct + 2000, // rc_2pass_vbr_maxsection_pct // keyframing settings (kf) 0, // fwd_kf_enabled @@ -4702,7 +4702,7 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 0, // use_fixed_qp_offsets { -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // encoder_cfg }, #endif // !CONFIG_REALTIME_ONLY { @@ -4747,13 +4747,13 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 25, // rc_undershoot_pct 25, // rc_overshoot_pct - 6000, // rc_max_buffer_size - 4000, // rc_buffer_initial_size - 5000, // rc_buffer_optimal_size + 6000, // rc_buf_sz + 4000, // rc_buf_initial_sz + 5000, // rc_buf_optimal_sz - 50, // rc_two_pass_vbrbias - 0, // rc_two_pass_vbrmin_section - 2000, // rc_two_pass_vbrmax_section + 50, // rc_2pass_vbr_bias_pct + 0, // rc_2pass_vbr_minsection_pct + 2000, // rc_2pass_vbr_maxsection_pct // keyframing settings (kf) 0, // fwd_kf_enabled @@ -4773,7 +4773,7 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 0, // use_fixed_qp_offsets { -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // encoder_cfg }, #if !CONFIG_REALTIME_ONLY { @@ -4818,13 +4818,13 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 25, // rc_undershoot_pct 25, // rc_overshoot_pct - 6000, // rc_max_buffer_size - 4000, // rc_buffer_initial_size - 5000, // rc_buffer_optimal_size + 6000, // rc_buf_sz + 4000, // rc_buf_initial_sz + 5000, // rc_buf_optimal_sz - 50, // rc_two_pass_vbrbias - 0, // rc_two_pass_vbrmin_section - 2000, // rc_two_pass_vbrmax_section + 50, // rc_2pass_vbr_bias_pct + 0, // rc_2pass_vbr_minsection_pct + 2000, // rc_2pass_vbr_maxsection_pct // keyframing settings (kf) 0, // fwd_kf_enabled @@ -4844,7 +4844,7 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 0, // use_fixed_qp_offsets { -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // encoder_cfg }, #endif // !CONFIG_REALTIME_ONLY }; -- GitLab From 7b24d44e6d32dd8c27089956bf9094378caf51d8 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 6 Aug 2024 14:38:59 -0700 Subject: [PATCH 321/391] Document sframe_mode setting is not implemented Change-Id: Iafad26f05e969b8940103e26e9cd1f91505f1c00 --- aom/aom_encoder.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h index bdedf48581..f4c653b928 100644 --- a/aom/aom_encoder.h +++ b/aom/aom_encoder.h @@ -802,6 +802,8 @@ typedef struct aom_codec_enc_cfg { * S-Frame. * * Otherwise: the considered frame will be made into an S-Frame. + * + * \attention Not implemented. */ unsigned int sframe_mode; -- GitLab From cb913e11551fab807a539f5fb63dd779bb47d3ea Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 6 Aug 2024 15:07:20 -0700 Subject: [PATCH 322/391] cfl_neon.c: make a function static This fixes a -Wmissing-prototypes warning. 
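The helper in question is an AArch32 compatibility shim for vpaddq_u16
that is only used inside cfl_neon.c, so internal linkage is the right
fix. A minimal sketch of the pattern, with a hypothetical helper:

    // External linkage would require a prototype in some header:
    //   int pairwise_add(int a, int b);
    // Internal linkage keeps the symbol local to this translation unit,
    // so no prototype is needed and -Wmissing-prototypes is satisfied.
    static int pairwise_add(int a, int b) { return a + b; }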
Bug: aomedia:42302428 Change-Id: I9f148507c35061d8e8e2de231551c262065ccea9 --- av1/common/arm/cfl_neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c index e872038d85..1d44aeaec4 100644 --- a/av1/common/arm/cfl_neon.c +++ b/av1/common/arm/cfl_neon.c @@ -134,7 +134,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, #if CONFIG_AV1_HIGHBITDEPTH #if !AOM_ARCH_AARCH64 -uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { +static uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), vpadd_u16(vget_low_u16(b), vget_high_u16(b))); } -- GitLab From 914609699ddb349f54d7ceea4425d8d0b061f5a1 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 6 Aug 2024 15:16:35 -0700 Subject: [PATCH 323/391] cmake: mv arm enc only srcs to AOM_DSP_ENCODER_INTRIN_NEON This fixes some -Wmissing-prototypes warnings. Bug: aomedia:42302428 Change-Id: I2fca9394e7ba5f61650464feb0ccde51d3271e25 --- aom_dsp/aom_dsp.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index 907f874c56..a5cc5b9d3c 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -104,11 +104,9 @@ list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c" "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" "${AOM_ROOT}/aom_dsp/arm/aom_scaled_convolve8_neon.c" - "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" - "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c" - "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c") + "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c") list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c" @@ -276,6 +274,8 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") list(APPEND AOM_DSP_ENCODER_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" "${AOM_ROOT}/aom_dsp/arm/sadxd_neon.c" "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/masked_sad_neon.c" -- GitLab From 79da697a64eb0d9645cf15e02ededc6b30c41ac2 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 6 Aug 2024 16:18:55 -0700 Subject: [PATCH 324/391] variance_neon.c: add missing !CONFIG_REALTIME_ONLY check For 1:4/4:1 sizes. This fixes some -Wmissing-prototypes warnings. 
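A sketch of why the guard is needed; the expansion shown is an assumed,
simplified shape of the macro:

    // Each VARIANCE_WXH_NEON(w, h, shift) expands to an externally
    // visible kernel definition along the lines of:
    //   unsigned int aom_variance4x16_neon(const uint8_t *src,
    //                                      int src_stride,
    //                                      const uint8_t *ref,
    //                                      int ref_stride,
    //                                      unsigned int *sse) { ... }
    // With CONFIG_REALTIME_ONLY=1 the generated RTCD header omits the
    // declarations for the 1:4/4:1 block sizes, so unguarded expansions
    // defined functions that had no visible prototype.
    #if !CONFIG_REALTIME_ONLY
    VARIANCE_WXH_NEON(4, 16, 6)
    #endif  // !CONFIG_REALTIME_ONLY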
Bug: aomedia:42302428 Change-Id: I969917913938db8540256f1aaf1bbe503c89e3da --- aom_dsp/arm/variance_neon.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c index ae1ad423ce..b37e94d495 100644 --- a/aom_dsp/arm/variance_neon.c +++ b/aom_dsp/arm/variance_neon.c @@ -203,25 +203,19 @@ static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride, VARIANCE_WXH_NEON(4, 4, 4) VARIANCE_WXH_NEON(4, 8, 5) -VARIANCE_WXH_NEON(4, 16, 6) VARIANCE_WXH_NEON(8, 4, 5) VARIANCE_WXH_NEON(8, 8, 6) VARIANCE_WXH_NEON(8, 16, 7) -VARIANCE_WXH_NEON(8, 32, 8) -VARIANCE_WXH_NEON(16, 4, 6) VARIANCE_WXH_NEON(16, 8, 7) VARIANCE_WXH_NEON(16, 16, 8) VARIANCE_WXH_NEON(16, 32, 9) -VARIANCE_WXH_NEON(16, 64, 10) -VARIANCE_WXH_NEON(32, 8, 8) VARIANCE_WXH_NEON(32, 16, 9) VARIANCE_WXH_NEON(32, 32, 10) VARIANCE_WXH_NEON(32, 64, 11) -VARIANCE_WXH_NEON(64, 16, 10) VARIANCE_WXH_NEON(64, 32, 11) VARIANCE_WXH_NEON(64, 64, 12) VARIANCE_WXH_NEON(64, 128, 13) @@ -229,6 +223,15 @@ VARIANCE_WXH_NEON(64, 128, 13) VARIANCE_WXH_NEON(128, 64, 13) VARIANCE_WXH_NEON(128, 128, 14) +#if !CONFIG_REALTIME_ONLY +VARIANCE_WXH_NEON(4, 16, 6) +VARIANCE_WXH_NEON(8, 32, 8) +VARIANCE_WXH_NEON(16, 4, 6) +VARIANCE_WXH_NEON(16, 64, 10) +VARIANCE_WXH_NEON(32, 8, 8) +VARIANCE_WXH_NEON(64, 16, 10) +#endif + #undef VARIANCE_WXH_NEON // TODO(yunqingwang): Perform variance of two/four 8x8 blocks similar to that of -- GitLab From 022c0fd402dc0bd985258a180442c2518843ab11 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 6 Aug 2024 16:19:52 -0700 Subject: [PATCH 325/391] cmake: rm some arm srcs w/CONFIG_REALTIME_ONLY=1 obmc sad and pickrst are unused with this configuration. This fixes some -Wmissing-prototypes warnings. Bug: aomedia:42302428 Change-Id: I6370877de1292e0d0104f6c3df8e57237a587a9d --- aom_dsp/aom_dsp.cmake | 2 ++ av1/av1.cmake | 3 +++ 2 files changed, 5 insertions(+) diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index a5cc5b9d3c..b021b0824f 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -377,7 +377,9 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c") list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c") endif() endif() diff --git a/av1/av1.cmake b/av1/av1.cmake index bed6ab9220..9e0b5a380a 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -545,7 +545,10 @@ if(CONFIG_REALTIME_ONLY) list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c" + "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c" + "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.c" + "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.h" "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c") list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD -- GitLab From 914bce0a62965c328fc73eea811d197ecb6be1a4 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 7 Aug 2024 13:08:13 -0700 Subject: [PATCH 326/391] av1.cmake: rm arm src w/CONFIG_AV1_HIGHBITDEPTH=0 av1_highbd_quantize_neon.c. This fixes a -Wmissing-prototypes warning. 
Bug: aomedia:42302428 Change-Id: Ifcc4fbb49e2e0c5c02aa3b14331bdc7eca54f22e --- av1/av1.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/av1.cmake b/av1/av1.cmake index 9e0b5a380a..836281d494 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -357,7 +357,6 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/av1_error_neon.c" "${AOM_ROOT}/av1/encoder/arm/av1_fwd_txfm2d_neon.c" - "${AOM_ROOT}/av1/encoder/arm/av1_highbd_quantize_neon.c" "${AOM_ROOT}/av1/encoder/arm/av1_k_means_neon.c" "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c" "${AOM_ROOT}/av1/encoder/arm/encodetxb_neon.c" @@ -507,6 +506,7 @@ if(CONFIG_AV1_HIGHBITDEPTH) "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/av1_highbd_quantize_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_rdopt_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c") -- GitLab From d9408e2964b81870133fa3346414f48938bb8dbc Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 7 Aug 2024 13:13:46 -0700 Subject: [PATCH 327/391] variance_neon_dotprod.c: add missing !CONFIG_REALTIME_ONLY check For 1:4/4:1 sizes. This fixes some -Wmissing-prototypes warnings. Bug: aomedia:42302428 Change-Id: Ib67b3f94bac441cbf45a1fd3cd90f5f4238ea411 --- aom_dsp/arm/variance_neon_dotprod.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/aom_dsp/arm/variance_neon_dotprod.c b/aom_dsp/arm/variance_neon_dotprod.c index fcb80ad6de..8addf0efe1 100644 --- a/aom_dsp/arm/variance_neon_dotprod.c +++ b/aom_dsp/arm/variance_neon_dotprod.c @@ -180,25 +180,19 @@ static INLINE void variance_128xh_neon_dotprod(const uint8_t *src, VARIANCE_WXH_NEON_DOTPROD(4, 4, 4) VARIANCE_WXH_NEON_DOTPROD(4, 8, 5) -VARIANCE_WXH_NEON_DOTPROD(4, 16, 6) VARIANCE_WXH_NEON_DOTPROD(8, 4, 5) VARIANCE_WXH_NEON_DOTPROD(8, 8, 6) VARIANCE_WXH_NEON_DOTPROD(8, 16, 7) -VARIANCE_WXH_NEON_DOTPROD(8, 32, 8) -VARIANCE_WXH_NEON_DOTPROD(16, 4, 6) VARIANCE_WXH_NEON_DOTPROD(16, 8, 7) VARIANCE_WXH_NEON_DOTPROD(16, 16, 8) VARIANCE_WXH_NEON_DOTPROD(16, 32, 9) -VARIANCE_WXH_NEON_DOTPROD(16, 64, 10) -VARIANCE_WXH_NEON_DOTPROD(32, 8, 8) VARIANCE_WXH_NEON_DOTPROD(32, 16, 9) VARIANCE_WXH_NEON_DOTPROD(32, 32, 10) VARIANCE_WXH_NEON_DOTPROD(32, 64, 11) -VARIANCE_WXH_NEON_DOTPROD(64, 16, 10) VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) VARIANCE_WXH_NEON_DOTPROD(64, 128, 13) @@ -206,6 +200,15 @@ VARIANCE_WXH_NEON_DOTPROD(64, 128, 13) VARIANCE_WXH_NEON_DOTPROD(128, 64, 13) VARIANCE_WXH_NEON_DOTPROD(128, 128, 14) +#if !CONFIG_REALTIME_ONLY +VARIANCE_WXH_NEON_DOTPROD(4, 16, 6) +VARIANCE_WXH_NEON_DOTPROD(8, 32, 8) +VARIANCE_WXH_NEON_DOTPROD(16, 4, 6) +VARIANCE_WXH_NEON_DOTPROD(16, 64, 10) +VARIANCE_WXH_NEON_DOTPROD(32, 8, 8) +VARIANCE_WXH_NEON_DOTPROD(64, 16, 10) +#endif + #undef VARIANCE_WXH_NEON_DOTPROD void aom_get_var_sse_sum_8x8_quad_neon_dotprod( -- GitLab From 1e3f557d658c14b1923c160fcce64e7fe9b0ef6b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 8 Aug 2024 11:08:00 -0700 Subject: [PATCH 328/391] Add the saturate_cast_double_to_int() function This was originally added to libvpx in https://chromium-review.googlesource.com/c/webm/libvpx/+/5673396. 
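A usage sketch, with illustrative values and a hypothetical caller:

    #include <limits.h>
    #include "aom_dsp/aom_dsp_common.h"

    void example_update(void) {
      // Casting a double above INT_MAX to int is undefined behavior in
      // C, so clamp before converting; 3.5e9 stands in for a large
      // target_bandwidth / framerate result.
      const double avg_bits = 3.5e9;
      const int safe = saturate_cast_double_to_int(avg_bits);  // INT_MAX
      (void)safe;
    }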
Change-Id: I13f61f688fc4992f24bf6b08f8313261276154d0 --- aom_dsp/aom_dsp_common.h | 8 ++++++++ av1/encoder/ratectrl.c | 5 ++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h index ed82e56129..7667704df3 100644 --- a/aom_dsp/aom_dsp_common.h +++ b/aom_dsp/aom_dsp_common.h @@ -12,6 +12,8 @@ #ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_ #define AOM_AOM_DSP_AOM_DSP_COMMON_H_ +#include <limits.h> + #include "config/aom_config.h" #include "aom/aom_integer.h" @@ -92,6 +94,12 @@ static INLINE unsigned int negative_to_zero(int value) { return value & ~(value >> (sizeof(value) * 8 - 1)); } +// Returns the saturating cast of a double value to int. +static INLINE int saturate_cast_double_to_int(double d) { + if (d > INT_MAX) return INT_MAX; + return (int)d; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index b9b7e28561..ea402201f1 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -2607,9 +2607,8 @@ void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) { RATE_CONTROL *const rc = &cpi->rc; const int MBs = av1_get_MBs(width, height); - const double avg_frame_bandwidth = - round(oxcf->rc_cfg.target_bandwidth / cpi->framerate); - rc->avg_frame_bandwidth = (int)AOMMIN(avg_frame_bandwidth, INT_MAX); + rc->avg_frame_bandwidth = saturate_cast_double_to_int( + round(oxcf->rc_cfg.target_bandwidth / cpi->framerate)); int64_t vbr_min_bits = (int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100; -- GitLab From 9d756a1117005c064862d53134bfc6d78663c5b4 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Tue, 6 Aug 2024 11:09:49 -0700 Subject: [PATCH 329/391] rtc: Convert the max_consec_drop to time units This is a more natural unit to use for rtc applications. At the start of av1_get_one_pass_rt_params() the time units is converted to frame number (via framerate), which is the unit used internally in the frame dropper. Change-Id: I27f5573e6bcb3831f9159798919eab3cc9ab6991 --- aom/aomcx.h | 16 +++++++++++++--- av1/av1_cx_iface.c | 16 ++++++++++++++++ av1/encoder/encoder.h | 7 +++++++ av1/encoder/ratectrl.c | 5 +++++ av1/ratectrl_rtc.cc | 14 +++++++++++--- av1/ratectrl_rtc.h | 2 +- examples/svc_encoder_rtc.cc | 2 +- test/ratectrl_rtc_test.cc | 11 ++++++----- 8 files changed, 60 insertions(+), 13 deletions(-) diff --git a/aom/aomcx.h b/aom/aomcx.h index 3f3eb643aa..3cf3991213 100644 --- a/aom/aomcx.h +++ b/aom/aomcx.h @@ -1527,9 +1527,10 @@ enum aome_enc_control_id { */ AV1E_SET_BITRATE_ONE_PASS_CBR = 163, - /*!\brief Codec control to set the maximum number of consecutive frame drops - * allowed for the frame dropper in 1 pass CBR mode, int parameter. Value of - * zero has no effect. + /*!\brief Codec control to set the maximum number of consecutive frame drops, + * in units of frames, allowed for the frame dropper in 1 pass + * CBR mode, int parameter. Value of zero has no effect. + * Deprecated: use the new control AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR. */ AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164, @@ -1563,6 +1564,12 @@ enum aome_enc_control_id { */ AV1E_SET_POSTENCODE_DROP_RTC = 168, + /*!\brief Codec control to set the maximum number of consecutive frame drops, + * in units of time (milliseconds), allowed for the frame dropper in 1 pass + * CBR mode, int parameter. Value of zero has no effect. + */ + AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR = 169, + // Any new encoder control IDs should be added above. 
// Maximum allowed encoder control ID is 229. // No encoder control ID should be added below. @@ -2229,6 +2236,9 @@ AOM_CTRL_USE_TYPE(AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC, int *) AOM_CTRL_USE_TYPE(AV1E_SET_POSTENCODE_DROP_RTC, int) #define AOM_CTRL_AV1E_SET_POSTENCODE_DROP_RTC +AOM_CTRL_USE_TYPE(AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR, int) +#define AOM_CTRL_AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR + /*!\endcond */ /*! @} - end defgroup aom_encoder */ #ifdef __cplusplus diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 92cd7f40da..ce6bbe9618 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -2680,6 +2680,20 @@ static aom_codec_err_t ctrl_set_max_consec_frame_drop_cbr( return AOM_CODEC_OK; } +static aom_codec_err_t ctrl_set_max_consec_frame_drop_ms_cbr( + aom_codec_alg_priv_t *ctx, va_list args) { + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; + const int max_consec_drop_ms = + CAST(AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR, args); + if (max_consec_drop_ms < 0) return AOM_CODEC_INVALID_PARAM; + // max_consec_drop_ms will be converted to frame units inside encoder + // based on framerate (which can change dynamically). + ctx->oxcf.rc_cfg.max_consec_drop_ms = max_consec_drop_ms; + cpi->rc.drop_count_consec = 0; + return AOM_CODEC_OK; +} + static aom_codec_err_t ctrl_set_svc_frame_drop_mode(aom_codec_alg_priv_t *ctx, va_list args) { AV1_PRIMARY *const ppi = ctx->ppi; @@ -4610,6 +4624,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_SVC_FRAME_DROP_MODE, ctrl_set_svc_frame_drop_mode }, { AV1E_SET_AUTO_TILES, ctrl_set_auto_tiles }, { AV1E_SET_POSTENCODE_DROP_RTC, ctrl_set_postencode_drop_rtc }, + { AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR, + ctrl_set_max_consec_frame_drop_ms_cbr }, // Getters { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h index 8b3ddb5776..e071b0496b 100644 --- a/av1/encoder/encoder.h +++ b/av1/encoder/encoder.h @@ -620,6 +620,13 @@ typedef struct { * of the target bitrate. */ int vbrmax_section; + + /*! + * Indicates the maximum consecutive amount of frame drops, in units of time + * (milliseconds). This is converted to frame units internally. Only used in + * CBR mode. 
+ */ + int max_consec_drop_ms; } RateControlCfg; /*!\cond */ diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index ea402201f1..320d984832 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -3679,6 +3679,11 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type, const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, svc->number_temporal_layers); + if (cpi->oxcf.rc_cfg.max_consec_drop_ms > 0) { + rc->max_consec_drop = (int)AOMMIN( + ceil(cpi->oxcf.rc_cfg.max_consec_drop_ms * cpi->framerate / 1000), + INT_MAX); + } if (cpi->ppi->use_svc) { av1_update_temporal_layer_framerate(cpi); av1_restore_layer_context(cpi); diff --git a/av1/ratectrl_rtc.cc b/av1/ratectrl_rtc.cc index 7f1640a77c..e0d896c81b 100644 --- a/av1/ratectrl_rtc.cc +++ b/av1/ratectrl_rtc.cc @@ -11,12 +11,14 @@ #include "av1/ratectrl_rtc.h" +#include <climits> #include <memory> #include <new> #include "aom/aomcx.h" #include "aom/aom_encoder.h" #include "aom_mem/aom_mem.h" +#include "aom_dsp/aom_dsp_common.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/pickcdef.h" @@ -40,7 +42,7 @@ AV1RateControlRtcConfig::AV1RateControlRtcConfig() { max_intra_bitrate_pct = 50; max_inter_bitrate_pct = 0; frame_drop_thresh = 0; - max_consec_drop = 0; + max_consec_drop_ms = 0; framerate = 30.0; ss_number_layers = 1; ts_number_layers = 1; @@ -127,7 +129,10 @@ bool AV1RateControlRTC::InitRateControl(const AV1RateControlRtcConfig &rc_cfg) { oxcf->q_cfg.aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ; oxcf->tune_cfg.content = AOM_CONTENT_DEFAULT; oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; - rc->max_consec_drop = rc_cfg.max_consec_drop; + if (rc_cfg.max_consec_drop_ms > 0) { + rc->max_consec_drop = (int)AOMMIN( + ceil(cpi_->framerate * rc_cfg.max_consec_drop_ms / 1000), INT_MAX); + } cpi_->svc.framedrop_mode = AOM_FULL_SUPERFRAME_DROP; oxcf->tool_cfg.bit_depth = AOM_BITS_8; oxcf->tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC; @@ -190,7 +195,10 @@ bool AV1RateControlRTC::UpdateRateControl( oxcf->rc_cfg.under_shoot_pct = rc_cfg.undershoot_pct; oxcf->rc_cfg.over_shoot_pct = rc_cfg.overshoot_pct; oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; - rc->max_consec_drop = rc_cfg.max_consec_drop; + if (rc_cfg.max_consec_drop_ms > 0) { + rc->max_consec_drop = (int)AOMMIN( + ceil(cpi_->framerate * rc_cfg.max_consec_drop_ms / 1000), INT_MAX); + } oxcf->rc_cfg.max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; oxcf->rc_cfg.max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct; cpi_->framerate = rc_cfg.framerate; diff --git a/av1/ratectrl_rtc.h b/av1/ratectrl_rtc.h index a4b33039cd..e4da5d3b35 100644 --- a/av1/ratectrl_rtc.h +++ b/av1/ratectrl_rtc.h @@ -46,7 +46,7 @@ struct AV1RateControlRtcConfig { int max_intra_bitrate_pct; int max_inter_bitrate_pct; int frame_drop_thresh; - int max_consec_drop; + int max_consec_drop_ms; double framerate; int layer_target_bitrate[kAV1MaxLayers]; int ts_rate_decimator[kAV1MaxTemporalLayers]; diff --git a/examples/svc_encoder_rtc.cc b/examples/svc_encoder_rtc.cc index d034b1806b..4fb160e6e0 100644 --- a/examples/svc_encoder_rtc.cc +++ b/examples/svc_encoder_rtc.cc @@ -1703,7 +1703,7 @@ int main(int argc, const char **argv) { aom_codec_control(&codec, AV1E_SET_RTC_EXTERNAL_RC, 1); } - aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, INT_MAX); + aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR, INT_MAX); aom_codec_control(&codec, 
                     AV1E_SET_SVC_FRAME_DROP_MODE, AOM_FULL_SUPERFRAME_DROP);

diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc
index 31ae4509a4..20c4fbb178 100644
--- a/test/ratectrl_rtc_test.cc
+++ b/test/ratectrl_rtc_test.cc
@@ -37,7 +37,7 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
       : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
         encoder_exit_(false), layer_frame_cnt_(0), superframe_cnt_(0),
         frame_cnt_(0), dynamic_temporal_layers_(false),
-        dynamic_spatial_layers_(false), num_drops_(0), max_consec_drop_(0),
+        dynamic_spatial_layers_(false), num_drops_(0), max_consec_drop_ms_(0),
         frame_drop_thresh_(0) {
     memset(&svc_params_, 0, sizeof(svc_params_));
     memset(&layer_id_, 0, sizeof(layer_id_));
@@ -67,7 +67,8 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
       encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT,
                        rc_cfg_.max_intra_bitrate_pct);
       if (use_svc) encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
-      encoder->Control(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, max_consec_drop_);
+      encoder->Control(AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR,
+                       max_consec_drop_ms_);
     }
     // SVC specific settings
     if (use_svc) {
@@ -210,7 +211,7 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,

   void RunOneLayerDropFramesCBR() {
     key_interval_ = 10000;
-    max_consec_drop_ = 8;
+    max_consec_drop_ms_ = 250;
     frame_drop_thresh_ = 30;
     SetConfig();
     rc_cfg_.target_bandwidth = 100;
@@ -319,7 +320,7 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
     rc_cfg_.min_quantizers[0] = 2;
     rc_cfg_.aq_mode = aq_mode_;
     rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
-    rc_cfg_.max_consec_drop = max_consec_drop_;
+    rc_cfg_.max_consec_drop_ms = max_consec_drop_ms_;

     // Encoder settings for ground truth.
     cfg_.g_w = 640;
@@ -480,7 +481,7 @@ class RcInterfaceTest : public ::libaom_test::EncoderTest,
   bool dynamic_temporal_layers_;
   bool dynamic_spatial_layers_;
   int num_drops_;
-  int max_consec_drop_;
+  int max_consec_drop_ms_;
   int frame_drop_thresh_;
 };
-- 
GitLab


From a3962ef9a6f46f5b322d66c65461e3feb23526b0 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 8 Aug 2024 12:29:54 -0700
Subject: [PATCH 330/391] debug_util.h: fix prototype typo

aom_bitstream_queue_get_frame_writee ->
aom_bitstream_queue_get_frame_write

This fixes a -Wmissing-prototypes warning.

Bug: aomedia:42302428
Change-Id: I9a5b7d6b2405269c9597ba877ee132821675d6a3
---
 aom_util/debug_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aom_util/debug_util.h b/aom_util/debug_util.h
index 6a2d318a7d..2166f1215a 100644
--- a/aom_util/debug_util.h
+++ b/aom_util/debug_util.h
@@ -21,7 +21,7 @@ extern "C" {
 #endif

 void aom_bitstream_queue_set_frame_write(int frame_idx);
-int aom_bitstream_queue_get_frame_writee(void);
+int aom_bitstream_queue_get_frame_write(void);
 void aom_bitstream_queue_set_frame_read(int frame_idx);
 int aom_bitstream_queue_get_frame_read(void);
-- 
GitLab


From 16c7552dd16cd52314e5e9764206da9846dba7ed Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Thu, 8 Aug 2024 12:24:02 -0700
Subject: [PATCH 331/391] Define AOM_INLINE and INLINE as inline

Recent versions of Visual Studio support the C99 inline keyword. For
example, the "Inline functions" page for Visual Studio 2015 documents
the inline keyword and says "The __inline keyword is a synonym for
inline."
https://learn.microsoft.com/en-us/cpp/c-language/inline-functions?view=msvc-140

So we can define the AOM_INLINE and INLINE macros as inline without
detecting compiler support.
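As an illustration (the function itself is hypothetical), code like
this now compiles as C on every supported compiler without a fallback
macro:

    static inline int clamp_int(int value, int low, int high) {
      return value < low ? low : (value > high ? high : value);
    }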
Bug: aomedia:358402891 Change-Id: I512704acd63a3cfd6cd115ab9ce448ffc04e192b --- aom/aom_integer.h | 3 +-- build/cmake/aom_config_defaults.cmake | 2 +- build/cmake/aom_configure.cmake | 3 --- build/cmake/compiler_tests.cmake | 20 -------------------- 4 files changed, 2 insertions(+), 26 deletions(-) diff --git a/aom/aom_integer.h b/aom/aom_integer.h index d0ef9280bb..c284947e4d 100644 --- a/aom/aom_integer.h +++ b/aom/aom_integer.h @@ -16,11 +16,10 @@ #if defined(_MSC_VER) #define AOM_FORCE_INLINE __forceinline -#define AOM_INLINE __inline #else #define AOM_FORCE_INLINE __inline__ __attribute__((always_inline)) -#define AOM_INLINE inline #endif +#define AOM_INLINE inline /* Assume platforms have the C99 standard integer types. */ diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index cb478d0b83..02a5f9f1c4 100644 --- a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake @@ -20,7 +20,7 @@ include("${AOM_ROOT}/build/cmake/util.cmake") # in this file. # -set_aom_detect_var(INLINE "" "Sets INLINE value for current target.") +set_aom_detect_var(INLINE "inline" "Sets INLINE value for current target.") # CPUs. set_aom_detect_var(AOM_ARCH_AARCH64 0 "Enables AArch64 architecture.") diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake index e204eda656..2382974d7b 100644 --- a/build/cmake/aom_configure.cmake +++ b/build/cmake/aom_configure.cmake @@ -250,9 +250,6 @@ endif() # ensure RTCD_CONFIG_* are properly set. fix_experiment_configs() -# Test compiler support. -aom_get_inline("INLINE") - # Don't just check for pthread.h, but use the result of the full pthreads # including a linking check in FindThreads above. set(HAVE_PTHREAD_H ${CMAKE_USE_PTHREADS_INIT}) diff --git a/build/cmake/compiler_tests.cmake b/build/cmake/compiler_tests.cmake index 76a2445aba..b96281e415 100644 --- a/build/cmake/compiler_tests.cmake +++ b/build/cmake/compiler_tests.cmake @@ -157,23 +157,3 @@ function(aom_check_source_compiles test_name test_source result_var) set(${result_var} 0 PARENT_SCOPE) endif() endfunction() - -# When inline support is detected for the current compiler the supported -# inlining keyword is written to $result in caller scope. -function(aom_get_inline result) - aom_check_source_compiles("inline_check_1" - "static inline void function(void) {}" - HAVE_INLINE_1) - if(HAVE_INLINE_1 EQUAL 1) - set(${result} "inline" PARENT_SCOPE) - return() - endif() - - # Check __inline. - aom_check_source_compiles("inline_check_2" - "static __inline void function(void) {}" - HAVE_INLINE_2) - if(HAVE_INLINE_2 EQUAL 1) - set(${result} "__inline" PARENT_SCOPE) - endif() -endfunction() -- GitLab From 94c721b0024ee8d2bc580af891f5eef714b5f32c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 8 Aug 2024 14:17:35 -0700 Subject: [PATCH 332/391] Use the saturate_cast_double_to_int() function Just added in https://aomedia-review.googlesource.com/c/aom/+/192483. 
Change-Id: I2c95a71402d2356a07cff17bbf2cd6ab8eff6c05 --- av1/encoder/ratectrl.c | 5 ++--- av1/ratectrl_rtc.cc | 9 ++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 320d984832..09129425d0 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -3680,9 +3680,8 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type, LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, svc->number_temporal_layers); if (cpi->oxcf.rc_cfg.max_consec_drop_ms > 0) { - rc->max_consec_drop = (int)AOMMIN( - ceil(cpi->oxcf.rc_cfg.max_consec_drop_ms * cpi->framerate / 1000), - INT_MAX); + rc->max_consec_drop = saturate_cast_double_to_int( + ceil(cpi->oxcf.rc_cfg.max_consec_drop_ms * cpi->framerate / 1000)); } if (cpi->ppi->use_svc) { av1_update_temporal_layer_framerate(cpi); diff --git a/av1/ratectrl_rtc.cc b/av1/ratectrl_rtc.cc index e0d896c81b..7957d240f1 100644 --- a/av1/ratectrl_rtc.cc +++ b/av1/ratectrl_rtc.cc @@ -11,7 +11,6 @@ #include "av1/ratectrl_rtc.h" -#include <climits> #include <memory> #include <new> @@ -130,8 +129,8 @@ bool AV1RateControlRTC::InitRateControl(const AV1RateControlRtcConfig &rc_cfg) { oxcf->tune_cfg.content = AOM_CONTENT_DEFAULT; oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; if (rc_cfg.max_consec_drop_ms > 0) { - rc->max_consec_drop = (int)AOMMIN( - ceil(cpi_->framerate * rc_cfg.max_consec_drop_ms / 1000), INT_MAX); + rc->max_consec_drop = saturate_cast_double_to_int( + ceil(cpi_->framerate * rc_cfg.max_consec_drop_ms / 1000)); } cpi_->svc.framedrop_mode = AOM_FULL_SUPERFRAME_DROP; oxcf->tool_cfg.bit_depth = AOM_BITS_8; @@ -196,8 +195,8 @@ bool AV1RateControlRTC::UpdateRateControl( oxcf->rc_cfg.over_shoot_pct = rc_cfg.overshoot_pct; oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; if (rc_cfg.max_consec_drop_ms > 0) { - rc->max_consec_drop = (int)AOMMIN( - ceil(cpi_->framerate * rc_cfg.max_consec_drop_ms / 1000), INT_MAX); + rc->max_consec_drop = saturate_cast_double_to_int( + ceil(cpi_->framerate * rc_cfg.max_consec_drop_ms / 1000)); } oxcf->rc_cfg.max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; oxcf->rc_cfg.max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct; -- GitLab From 12c64e8fe114a98c65a716368dd075d48dbd5b4c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 8 Aug 2024 16:02:19 -0700 Subject: [PATCH 333/391] Remove the INLINE macro Just use the inline keyword. Note the changes to build/cmake/aom_config_defaults.cmake and build/cmake/generate_aom_config_templates.cmake. The rest of the CL was generated by a series of find/sed commands like these: find . -name "*.h" | xargs sed -i -e "s/ INLINE / inline /g" find . -name "*.c" | xargs sed -i -e "s/ INLINE / inline /g" find . -name "*.cc" | xargs sed -i -e "s/ INLINE / inline /g" find . -name "*.h" | xargs sed -i -e "s/ INLINE$/ inline/g" ... 
Bug: aomedia:358402891 Change-Id: I4c735408ffce6c5f3281a53f0082e2e68d9d495d --- aom/src/aom_image.c | 2 +- aom_dsp/aom_convolve.c | 8 +- aom_dsp/aom_dsp_common.h | 14 +- aom_dsp/arm/aom_convolve8_neon.c | 6 +- aom_dsp/arm/aom_convolve8_neon.h | 12 +- aom_dsp/arm/aom_convolve8_neon_dotprod.c | 22 +- aom_dsp/arm/aom_convolve8_neon_i8mm.c | 22 +- aom_dsp/arm/aom_filter.h | 2 +- aom_dsp/arm/aom_neon_sve2_bridge.h | 2 +- aom_dsp/arm/aom_neon_sve_bridge.h | 8 +- aom_dsp/arm/aom_scaled_convolve8_neon.c | 4 +- .../arm/aom_scaled_convolve8_neon_dotprod.c | 12 +- aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c | 12 +- aom_dsp/arm/blend_neon.h | 20 +- aom_dsp/arm/blk_sse_sum_neon.c | 6 +- aom_dsp/arm/blk_sse_sum_sve.c | 6 +- aom_dsp/arm/dist_wtd_avg_neon.h | 8 +- aom_dsp/arm/hadamard_neon.c | 2 +- aom_dsp/arm/highbd_blend_a64_mask_neon.c | 4 +- aom_dsp/arm/highbd_convolve8_neon.c | 20 +- aom_dsp/arm/highbd_convolve8_neon.h | 10 +- aom_dsp/arm/highbd_convolve8_sve.c | 26 +- aom_dsp/arm/highbd_hadamard_neon.c | 4 +- aom_dsp/arm/highbd_intrapred_neon.c | 90 ++--- aom_dsp/arm/highbd_loopfilter_neon.c | 36 +- aom_dsp/arm/highbd_masked_sad_neon.c | 22 +- aom_dsp/arm/highbd_obmc_sad_neon.c | 16 +- aom_dsp/arm/highbd_obmc_variance_neon.c | 30 +- aom_dsp/arm/highbd_quantize_neon.c | 20 +- aom_dsp/arm/highbd_sad_neon.c | 30 +- aom_dsp/arm/highbd_sadxd_neon.c | 34 +- aom_dsp/arm/highbd_sse_neon.c | 18 +- aom_dsp/arm/highbd_sse_sve.c | 16 +- aom_dsp/arm/highbd_variance_neon.c | 26 +- aom_dsp/arm/highbd_variance_neon_dotprod.c | 4 +- aom_dsp/arm/highbd_variance_sve.c | 18 +- aom_dsp/arm/intrapred_neon.c | 76 ++--- aom_dsp/arm/loopfilter_neon.c | 10 +- aom_dsp/arm/masked_sad4d_neon.c | 32 +- aom_dsp/arm/masked_sad_neon.c | 14 +- aom_dsp/arm/mem_neon.h | 194 +++++------ aom_dsp/arm/obmc_sad_neon.c | 20 +- aom_dsp/arm/obmc_variance_neon.c | 20 +- aom_dsp/arm/sad_neon.c | 36 +- aom_dsp/arm/sad_neon_dotprod.c | 28 +- aom_dsp/arm/sadxd_neon.c | 30 +- aom_dsp/arm/sadxd_neon_dotprod.c | 22 +- aom_dsp/arm/sse_neon.c | 20 +- aom_dsp/arm/sse_neon_dotprod.c | 20 +- aom_dsp/arm/sum_neon.h | 48 +-- aom_dsp/arm/sum_squares_neon.c | 26 +- aom_dsp/arm/sum_squares_neon_dotprod.c | 6 +- aom_dsp/arm/sum_squares_sve.c | 22 +- aom_dsp/arm/transpose_neon.h | 64 ++-- aom_dsp/arm/variance_neon.c | 20 +- aom_dsp/arm/variance_neon_dotprod.c | 18 +- aom_dsp/bitreader.h | 14 +- aom_dsp/bitwriter.h | 12 +- aom_dsp/fft.c | 12 +- aom_dsp/flow_estimation/arm/disflow_neon.c | 6 +- aom_dsp/flow_estimation/arm/disflow_neon.h | 8 +- aom_dsp/flow_estimation/arm/disflow_sve.c | 6 +- aom_dsp/flow_estimation/disflow.c | 18 +- aom_dsp/flow_estimation/x86/disflow_avx2.c | 10 +- aom_dsp/flow_estimation/x86/disflow_sse4.c | 10 +- aom_dsp/grain_params.h | 2 +- aom_dsp/intrapred.c | 50 +-- aom_dsp/loopfilter.c | 44 +-- aom_dsp/mathutils.h | 10 +- aom_dsp/noise_model.c | 4 +- aom_dsp/prob.h | 4 +- aom_dsp/pyramid.c | 4 +- aom_dsp/recenter.h | 8 +- aom_dsp/sad.c | 6 +- aom_dsp/sad_av1.c | 23 +- aom_dsp/txfm_common.h | 2 +- aom_dsp/variance.c | 10 +- aom_dsp/x86/adaptive_quantize_avx2.c | 14 +- aom_dsp/x86/aom_convolve_copy_avx2.c | 6 +- aom_dsp/x86/aom_convolve_copy_sse2.c | 6 +- aom_dsp/x86/aom_quantize_avx.c | 2 +- aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 8 +- aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 2 +- aom_dsp/x86/avg_intrin_avx2.c | 8 +- aom_dsp/x86/avg_intrin_sse2.c | 14 +- aom_dsp/x86/bitdepth_conversion_avx2.h | 4 +- aom_dsp/x86/bitdepth_conversion_sse2.h | 8 +- aom_dsp/x86/blend_a64_mask_avx2.c | 64 ++-- aom_dsp/x86/blend_a64_mask_sse4.c | 44 +-- 
aom_dsp/x86/blend_a64_vmask_sse4.c | 4 +- aom_dsp/x86/blend_mask_sse4.h | 20 +- aom_dsp/x86/blend_sse4.h | 18 +- aom_dsp/x86/blk_sse_sum_avx2.c | 8 +- aom_dsp/x86/blk_sse_sum_sse2.c | 4 +- aom_dsp/x86/common_avx2.h | 2 +- aom_dsp/x86/convolve_avx2.h | 40 +-- aom_dsp/x86/convolve_common_intrin.h | 12 +- aom_dsp/x86/convolve_sse2.h | 16 +- aom_dsp/x86/convolve_sse4_1.h | 4 +- aom_dsp/x86/convolve_ssse3.h | 4 +- aom_dsp/x86/fft_avx2.c | 12 +- aom_dsp/x86/fft_sse2.c | 18 +- aom_dsp/x86/fwd_txfm_sse2.h | 20 +- aom_dsp/x86/highbd_adaptive_quantize_avx2.c | 16 +- aom_dsp/x86/highbd_adaptive_quantize_sse2.c | 12 +- aom_dsp/x86/highbd_convolve_avx2.c | 58 ++-- aom_dsp/x86/highbd_intrapred_sse2.c | 34 +- aom_dsp/x86/highbd_loopfilter_sse2.c | 20 +- aom_dsp/x86/highbd_quantize_intrin_avx2.c | 8 +- aom_dsp/x86/highbd_sad_avx2.c | 8 +- aom_dsp/x86/highbd_variance_sse2.c | 2 +- aom_dsp/x86/highbd_variance_sse4.c | 2 +- aom_dsp/x86/intrapred_avx2.c | 32 +- aom_dsp/x86/intrapred_sse2.c | 46 +-- aom_dsp/x86/intrapred_ssse3.c | 16 +- aom_dsp/x86/intrapred_utils.h | 4 +- aom_dsp/x86/intrapred_x86.h | 4 +- aom_dsp/x86/jnt_variance_ssse3.c | 2 +- aom_dsp/x86/loopfilter_avx2.c | 2 +- aom_dsp/x86/loopfilter_sse2.c | 14 +- aom_dsp/x86/lpf_common_sse2.h | 34 +- aom_dsp/x86/masked_sad4d_ssse3.c | 2 +- aom_dsp/x86/masked_sad_intrin_avx2.c | 12 +- aom_dsp/x86/masked_sad_intrin_ssse3.c | 8 +- aom_dsp/x86/masked_variance_intrin_ssse3.c | 10 +- aom_dsp/x86/masked_variance_intrin_ssse3.h | 4 +- aom_dsp/x86/mem_sse2.h | 38 +-- aom_dsp/x86/obmc_intrinsic_sse4.h | 2 +- aom_dsp/x86/obmc_intrinsic_ssse3.h | 8 +- aom_dsp/x86/obmc_sad_avx2.c | 8 +- aom_dsp/x86/obmc_variance_avx2.c | 4 +- aom_dsp/x86/obmc_variance_sse4.c | 12 +- aom_dsp/x86/quantize_avx2.c | 10 +- aom_dsp/x86/quantize_ssse3.c | 4 +- aom_dsp/x86/quantize_x86.h | 28 +- aom_dsp/x86/sad_avx2.c | 4 +- aom_dsp/x86/sse_avx2.c | 18 +- aom_dsp/x86/sse_sse4.c | 14 +- aom_dsp/x86/subtract_avx2.c | 10 +- aom_dsp/x86/sum_squares_sse2.c | 6 +- aom_dsp/x86/synonyms.h | 32 +- aom_dsp/x86/synonyms_avx2.h | 20 +- aom_dsp/x86/transpose_sse2.h | 20 +- aom_dsp/x86/txfm_common_avx2.h | 48 +-- aom_dsp/x86/txfm_common_sse2.h | 2 +- aom_dsp/x86/variance_avx2.c | 44 +-- aom_dsp/x86/variance_sse2.c | 36 +- aom_mem/aom_mem.h | 2 +- aom_ports/aarch64_cpudetect.c | 2 +- aom_ports/aom_timer.h | 12 +- aom_ports/bitops.h | 12 +- aom_ports/x86.h | 18 +- aom_util/aom_pthread.h | 32 +- aom_util/endian_inl.h | 6 +- apps/aomdec.c | 2 +- av1/av1_cx_iface.c | 4 +- av1/av1_dx_iface.c | 2 +- av1/common/alloccommon.c | 14 +- av1/common/arm/av1_convolve_horiz_rs_neon.c | 4 +- av1/common/arm/av1_convolve_scale_neon.c | 16 +- .../arm/av1_convolve_scale_neon_dotprod.c | 12 +- av1/common/arm/av1_convolve_scale_neon_i8mm.c | 12 +- av1/common/arm/av1_inv_txfm_neon.c | 82 ++--- av1/common/arm/av1_inv_txfm_neon.h | 6 +- av1/common/arm/cdef_block_neon.c | 12 +- av1/common/arm/cfl_neon.c | 32 +- av1/common/arm/compound_convolve_neon.c | 42 +-- av1/common/arm/compound_convolve_neon.h | 64 ++-- .../arm/compound_convolve_neon_dotprod.c | 16 +- av1/common/arm/compound_convolve_neon_i8mm.c | 16 +- av1/common/arm/convolve_neon.c | 56 +-- av1/common/arm/convolve_neon.h | 46 +-- av1/common/arm/convolve_neon_dotprod.c | 50 +-- av1/common/arm/convolve_neon_i8mm.c | 54 +-- av1/common/arm/convolve_neon_i8mm.h | 6 +- av1/common/arm/convolve_scale_neon.h | 32 +- av1/common/arm/convolve_sve2.c | 4 +- .../arm/highbd_compound_convolve_neon.c | 98 +++--- .../arm/highbd_compound_convolve_neon.h | 8 +- 
.../arm/highbd_compound_convolve_sve2.c | 52 +-- av1/common/arm/highbd_convolve_neon.c | 96 +++--- av1/common/arm/highbd_convolve_neon.h | 14 +- av1/common/arm/highbd_convolve_scale_neon.c | 10 +- av1/common/arm/highbd_convolve_sve2.c | 56 +-- av1/common/arm/highbd_convolve_sve2.h | 8 +- av1/common/arm/highbd_inv_txfm_neon.c | 66 ++-- av1/common/arm/highbd_reconinter_neon.c | 2 +- av1/common/arm/highbd_wiener_convolve_neon.c | 16 +- av1/common/arm/resize_neon.c | 16 +- av1/common/arm/selfguided_neon.c | 38 +-- av1/common/arm/wiener_convolve_neon.c | 16 +- av1/common/av1_common_int.h | 94 ++--- av1/common/av1_inv_txfm1d.h | 4 +- av1/common/av1_inv_txfm2d.c | 6 +- av1/common/av1_txfm.h | 28 +- av1/common/blockd.h | 126 +++---- av1/common/cdef.c | 6 +- av1/common/cdef.h | 4 +- av1/common/cdef_block.c | 2 +- av1/common/cdef_block.h | 2 +- av1/common/cdef_block_simd.h | 10 +- av1/common/cfl.c | 18 +- av1/common/cfl.h | 10 +- av1/common/common.h | 2 +- av1/common/convolve.c | 6 +- av1/common/convolve.h | 6 +- av1/common/entropy.h | 6 +- av1/common/entropymode.h | 2 +- av1/common/entropymv.h | 4 +- av1/common/filter.h | 22 +- av1/common/idct.h | 2 +- av1/common/mv.h | 20 +- av1/common/mvref_common.c | 2 +- av1/common/mvref_common.h | 30 +- av1/common/obmc.h | 4 +- av1/common/ppc/cfl_ppc.c | 2 +- av1/common/pred_common.h | 56 +-- av1/common/quant_common.c | 2 +- av1/common/quant_common.h | 2 +- av1/common/reconinter.c | 6 +- av1/common/reconinter.h | 28 +- av1/common/reconintra.c | 2 +- av1/common/reconintra.h | 18 +- av1/common/resize.h | 4 +- av1/common/restoration.h | 4 +- av1/common/scale.h | 12 +- av1/common/scan.h | 4 +- av1/common/seg_common.h | 6 +- av1/common/thread_common.c | 16 +- av1/common/txb_common.h | 22 +- av1/common/x86/av1_inv_txfm_avx2.c | 74 ++-- av1/common/x86/av1_inv_txfm_avx2.h | 6 +- av1/common/x86/av1_inv_txfm_ssse3.c | 72 ++-- av1/common/x86/av1_inv_txfm_ssse3.h | 8 +- av1/common/x86/av1_txfm_sse2.h | 46 +-- av1/common/x86/av1_txfm_sse4.h | 6 +- av1/common/x86/cdef_block_avx2.c | 8 +- av1/common/x86/cfl_avx2.c | 12 +- av1/common/x86/cfl_sse2.c | 4 +- av1/common/x86/cfl_ssse3.c | 26 +- av1/common/x86/convolve_sse2.c | 10 +- av1/common/x86/filterintra_sse4.c | 6 +- av1/common/x86/highbd_inv_txfm_avx2.c | 40 +-- av1/common/x86/highbd_inv_txfm_sse4.c | 36 +- av1/common/x86/highbd_txfm_utility_sse4.h | 10 +- av1/common/x86/highbd_warp_plane_sse4.c | 18 +- av1/common/x86/jnt_convolve_avx2.c | 4 +- av1/common/x86/reconinter_avx2.c | 10 +- av1/common/x86/reconinter_sse4.c | 2 +- av1/common/x86/resize_avx2.c | 4 +- av1/common/x86/resize_sse2.c | 4 +- av1/common/x86/resize_ssse3.c | 14 +- av1/common/x86/selfguided_avx2.c | 8 +- av1/common/x86/selfguided_sse4.c | 8 +- av1/common/x86/warp_plane_avx2.c | 44 +-- av1/common/x86/warp_plane_sse4.c | 38 +-- av1/decoder/decodeframe.c | 26 +- av1/decoder/decodemv.c | 14 +- av1/decoder/decoder.h | 4 +- av1/decoder/decodetxb.c | 8 +- av1/decoder/grain_synthesis.c | 2 +- av1/encoder/aq_cyclicrefresh.h | 4 +- av1/encoder/arm/av1_highbd_quantize_neon.c | 6 +- av1/encoder/arm/av1_temporal_denoiser_neon.c | 6 +- av1/encoder/arm/cnn_neon.c | 8 +- av1/encoder/arm/encodetxb_neon.c | 32 +- av1/encoder/arm/highbd_pickrst_neon.c | 22 +- av1/encoder/arm/highbd_pickrst_sve.c | 12 +- av1/encoder/arm/highbd_temporal_filter_neon.c | 2 +- av1/encoder/arm/pickrst_neon.c | 22 +- av1/encoder/arm/pickrst_neon.h | 12 +- av1/encoder/arm/pickrst_sve.c | 12 +- av1/encoder/arm/pickrst_sve.h | 12 +- av1/encoder/arm/quantize_neon.c | 8 +- av1/encoder/arm/rdopt_neon.c | 
2 +- av1/encoder/arm/temporal_filter_neon.c | 4 +- .../arm/temporal_filter_neon_dotprod.c | 2 +- av1/encoder/av1_fwd_txfm2d.c | 6 +- av1/encoder/av1_noise_estimate.c | 2 +- av1/encoder/av1_quantize.c | 4 +- av1/encoder/av1_temporal_denoiser.h | 2 +- av1/encoder/bitstream.c | 16 +- av1/encoder/block.h | 12 +- av1/encoder/cnn.c | 10 +- av1/encoder/compound_type.c | 34 +- av1/encoder/cost.h | 2 +- av1/encoder/encode_strategy.c | 2 +- av1/encoder/encodeframe.c | 6 +- av1/encoder/encodeframe_utils.c | 2 +- av1/encoder/encodeframe_utils.h | 2 +- av1/encoder/encodemb.h | 4 +- av1/encoder/encodemv.h | 10 +- av1/encoder/encoder.c | 12 +- av1/encoder/encoder.h | 108 +++--- av1/encoder/encodetxb.c | 2 +- av1/encoder/encodetxb.h | 2 +- av1/encoder/firstpass.c | 2 +- av1/encoder/firstpass.h | 4 +- av1/encoder/global_motion.c | 4 +- av1/encoder/hybrid_fwd_txfm.c | 4 +- av1/encoder/interp_search.c | 22 +- av1/encoder/intra_mode_search.c | 2 +- av1/encoder/intra_mode_search_utils.h | 2 +- av1/encoder/level.h | 2 +- av1/encoder/mcomp.c | 58 ++-- av1/encoder/mcomp.h | 22 +- av1/encoder/motion_search_facade.c | 2 +- av1/encoder/nonrd_opt.c | 4 +- av1/encoder/nonrd_opt.h | 16 +- av1/encoder/nonrd_pickmode.c | 18 +- av1/encoder/optical_flow.c | 4 +- av1/encoder/palette.h | 4 +- av1/encoder/partition_search.c | 14 +- av1/encoder/partition_strategy.c | 8 +- av1/encoder/partition_strategy.h | 6 +- av1/encoder/pass2_strategy.c | 8 +- av1/encoder/pickcdef.c | 10 +- av1/encoder/pickcdef.h | 4 +- av1/encoder/pickrst.c | 8 +- av1/encoder/pickrst.h | 4 +- av1/encoder/random.h | 10 +- av1/encoder/ratectrl.c | 2 +- av1/encoder/rd.c | 8 +- av1/encoder/rd.h | 26 +- av1/encoder/rdopt.c | 46 +-- av1/encoder/rdopt.h | 16 +- av1/encoder/rdopt_utils.h | 24 +- av1/encoder/reconinter_enc.c | 2 +- av1/encoder/saliency_map.c | 12 +- av1/encoder/sparse_linear_solver.c | 2 +- av1/encoder/temporal_filter.c | 6 +- av1/encoder/temporal_filter.h | 4 +- av1/encoder/tokenize.h | 4 +- av1/encoder/tpl_model.c | 2 +- av1/encoder/tpl_model.h | 16 +- av1/encoder/tx_search.c | 34 +- av1/encoder/txb_rdopt.c | 4 +- av1/encoder/txb_rdopt_utils.h | 18 +- av1/encoder/x86/av1_fwd_txfm2d_avx2.c | 48 +-- av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 10 +- av1/encoder/x86/av1_fwd_txfm_avx2.h | 8 +- av1/encoder/x86/av1_fwd_txfm_sse2.h | 14 +- av1/encoder/x86/av1_highbd_quantize_avx2.c | 8 +- av1/encoder/x86/av1_highbd_quantize_sse4.c | 8 +- av1/encoder/x86/av1_quantize_avx2.c | 18 +- av1/encoder/x86/av1_quantize_sse2.c | 10 +- av1/encoder/x86/av1_temporal_denoiser_sse2.c | 6 +- av1/encoder/x86/av1_txfm1d_sse4.h | 4 +- av1/encoder/x86/cnn_avx2.c | 14 +- av1/encoder/x86/encodetxb_sse2.c | 26 +- av1/encoder/x86/error_intrin_avx2.c | 8 +- av1/encoder/x86/highbd_fwd_txfm_avx2.c | 38 +-- av1/encoder/x86/highbd_fwd_txfm_sse4.c | 34 +- av1/encoder/x86/ml_avx2.c | 8 +- av1/encoder/x86/pickrst_avx2.c | 32 +- av1/encoder/x86/pickrst_sse4.c | 22 +- av1/encoder/x86/rdopt_avx2.c | 2 +- av1/encoder/x86/rdopt_sse4.c | 2 +- av1/encoder/x86/reconinter_enc_sse2.c | 2 +- av1/encoder/x86/reconinter_enc_ssse3.c | 2 +- av1/encoder/x86/wedge_utils_sse2.c | 2 +- build/cmake/aom_config_defaults.cmake | 2 - .../cmake/generate_aom_config_templates.cmake | 3 +- test/av1_inv_txfm1d_test.cc | 2 +- test/av1_txfm_test.h | 2 +- test/tile_config_test.cc | 2 +- test/util.h | 2 +- third_party/SVT-AV1/EbMemory_AVX2.h | 22 +- third_party/SVT-AV1/EbMemory_SSE4_1.h | 6 +- third_party/SVT-AV1/convolve_avx2.h | 320 +++++++++--------- third_party/SVT-AV1/synonyms.h | 2 +- 
tools/auto_refactor/av1_preprocess.py | 2 - 372 files changed, 3277 insertions(+), 3283 deletions(-) diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c index 039a012ee3..0aab80c9de 100644 --- a/aom/src/aom_image.c +++ b/aom/src/aom_image.c @@ -19,7 +19,7 @@ #include "aom/internal/aom_image_internal.h" #include "aom_mem/aom_mem.h" -static INLINE unsigned int align_image_dimension(unsigned int d, +static inline unsigned int align_image_dimension(unsigned int d, unsigned int subsampling, unsigned int size_align) { unsigned int align; diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c index 2c8a632f74..e139ba1fb8 100644 --- a/aom_dsp/aom_convolve.c +++ b/aom_dsp/aom_convolve.c @@ -20,13 +20,13 @@ #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" -static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { +static inline int horz_scalar_product(const uint8_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; return sum; } -static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride, +static inline int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; @@ -163,7 +163,7 @@ void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE int highbd_vert_scalar_product(const uint16_t *a, +static inline int highbd_vert_scalar_product(const uint16_t *a, ptrdiff_t a_stride, const int16_t *b) { int sum = 0; @@ -171,7 +171,7 @@ static INLINE int highbd_vert_scalar_product(const uint16_t *a, return sum; } -static INLINE int highbd_horz_scalar_product(const uint16_t *a, +static inline int highbd_horz_scalar_product(const uint16_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h index 7667704df3..c279ad1fc0 100644 --- a/aom_dsp/aom_dsp_common.h +++ b/aom_dsp/aom_dsp_common.h @@ -61,23 +61,23 @@ typedef uint8_t qm_val_t; typedef int64_t tran_high_t; typedef int32_t tran_low_t; -static INLINE uint8_t clip_pixel(int val) { +static inline uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } -static INLINE int clamp(int value, int low, int high) { +static inline int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); } -static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) { +static inline int64_t clamp64(int64_t value, int64_t low, int64_t high) { return value < low ? low : (value > high ? high : value); } -static INLINE double fclamp(double value, double low, double high) { +static inline double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -static INLINE uint16_t clip_pixel_highbd(int val, int bd) { +static inline uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: default: return (uint16_t)clamp(val, 0, 255); @@ -90,12 +90,12 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { // or max(0, value) and might be faster in some cases. 
// Care should be taken since the behavior of right shifting signed type // negative value is undefined by C standards and implementation defined, -static INLINE unsigned int negative_to_zero(int value) { +static inline unsigned int negative_to_zero(int value) { return value & ~(value >> (sizeof(value) * 8 - 1)); } // Returns the saturating cast of a double value to int. -static INLINE int saturate_cast_double_to_int(double d) { +static inline int saturate_cast_double_to_int(double d) { if (d > INT_MAX) return INT_MAX; return (int)d; } diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index d2f13ff13e..ae4af9e96b 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c @@ -26,7 +26,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, +static inline void convolve8_horiz_8tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, @@ -195,7 +195,7 @@ static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, } } -static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src, +static inline void convolve8_horiz_4tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, @@ -292,7 +292,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -static INLINE void convolve8_vert_8tap_neon(const uint8_t *src, +static inline void convolve8_vert_8tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, diff --git a/aom_dsp/arm/aom_convolve8_neon.h b/aom_dsp/arm/aom_convolve8_neon.h index d1384a76ef..5ebffeb781 100644 --- a/aom_dsp/arm/aom_convolve8_neon.h +++ b/aom_dsp/arm/aom_convolve8_neon.h @@ -18,7 +18,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "config/aom_config.h" -static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, @@ -38,7 +38,7 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, return sum; } -static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -59,7 +59,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src, +static inline void convolve8_horiz_2tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, @@ -146,7 +146,7 @@ static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src, } } -static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter) { int16x8_t sum = vmulq_lane_s16(s0, filter, 0); @@ -158,7 +158,7 @@ static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void convolve8_vert_4tap_neon(const uint8_t *src, +static inline void 
convolve8_vert_4tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, @@ -244,7 +244,7 @@ static INLINE void convolve8_vert_4tap_neon(const uint8_t *src, } } -static INLINE void convolve8_vert_2tap_neon(const uint8_t *src, +static inline void convolve8_vert_2tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c index 04b3832b63..7fc9cb1857 100644 --- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -44,7 +44,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, +static inline int16x4_t convolve8_4_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. @@ -66,7 +66,7 @@ static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, +static inline uint8x8_t convolve8_8_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. @@ -95,7 +95,7 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE void convolve8_horiz_8tap_neon_dotprod( +static inline void convolve8_horiz_8tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); @@ -149,7 +149,7 @@ static INLINE void convolve8_horiz_8tap_neon_dotprod( } } -static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, +static inline int16x4_t convolve4_4_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. @@ -169,7 +169,7 @@ static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, return vmovn_s32(sum); } -static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, +static inline uint8x8_t convolve4_8_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. 
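// Illustrative sketch, not from the library: the "transform sample range"
// comments above refer to the standard dotprod trick. SDOT multiplies signed
// 8-bit lanes, so each unsigned sample is biased by -128 and the bias is
// folded back into the accumulator as a constant. The helper name below is
// hypothetical; for AV1 interpolation filters the taps sum to
// 1 << FILTER_BITS (128), so filter_sum is a compile-time constant in the
// kernels above.
static inline int dot_via_signed_sketch(const uint8_t *src, const int8_t *f,
                                        int taps, int filter_sum) {
  int sum = 128 * filter_sum;  // Compensates the -128 bias applied below.
  for (int k = 0; k < taps; ++k) {
    sum += (src[k] - 128) * f[k];  // Biased samples lie in [-128, 127].
  }
  return sum;  // Equal to the plain unsigned dot product of src and f.
}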
@@ -196,7 +196,7 @@ static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void convolve8_horiz_4tap_neon_dotprod( +static inline void convolve8_horiz_4tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) { const int16x4_t x_filter = vld1_s16(filter_x + 2); @@ -284,7 +284,7 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } } -static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, +static inline void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b) { // Transpose 8-bit elements and concatenate result rows as follows: // a0: 00, 01, 02, 03, XX, XX, XX, XX @@ -308,7 +308,7 @@ static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, *b = vreinterpretq_s8_s16(a0123); } -static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, +static inline void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b0, int8x16_t *b1) { // Transpose 8-bit elements and concatenate result rows as follows: @@ -335,7 +335,7 @@ static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, *b1 = vreinterpretq_s8_s16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, +static inline int16x4_t convolve8_4_v(const int8x16_t samples_lo, const int8x16_t samples_hi, const int8x8_t filters) { // The sample range transform and permutation are performed by the caller. @@ -349,7 +349,7 @@ static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, +static inline uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, const int8x16_t samples0_hi, const int8x16_t samples1_lo, const int8x16_t samples1_hi, @@ -370,7 +370,7 @@ static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE void convolve8_vert_8tap_neon_dotprod( +static inline void convolve8_vert_8tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c index 4c8a6cdeee..5b9b88e757 100644 --- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -40,7 +40,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, +static inline int16x4_t convolve8_4_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Permute samples ready for dot product. @@ -57,7 +57,7 @@ static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, +static inline uint8x8_t convolve8_8_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl) { // Permute samples ready for dot product. 
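// Editorial note, not from the library sources: unlike the dotprod kernels,
// these i8mm kernels need no sample range transform. The USDOT instruction
// multiplies unsigned 8-bit samples by signed 8-bit filter taps directly,
// so the -128 bias and its correction constant disappear.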
@@ -82,7 +82,7 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE void convolve8_horiz_8tap_neon_i8mm( +static inline void convolve8_horiz_8tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); @@ -136,7 +136,7 @@ static INLINE void convolve8_horiz_8tap_neon_i8mm( } } -static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, +static inline int16x4_t convolve4_4_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16_t permute_tbl) { // Permute samples ready for dot product. @@ -150,7 +150,7 @@ static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, return vmovn_s32(sum); } -static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, +static inline uint8x8_t convolve4_8_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Permute samples ready for dot product. @@ -172,7 +172,7 @@ static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void convolve8_horiz_4tap_neon_i8mm( +static inline void convolve8_horiz_4tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) { const int16x4_t x_filter = vld1_s16(filter_x + 2); @@ -258,7 +258,7 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } -static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, +static inline void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x16_t *b) { // Transpose 8-bit elements and concatenate result rows as follows: @@ -283,7 +283,7 @@ static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, *b = vreinterpretq_u8_u16(a0123); } -static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, +static inline void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x16_t *b0, uint8x16_t *b1) { // Transpose 8-bit elements and concatenate result rows as follows: @@ -310,7 +310,7 @@ static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, *b1 = vreinterpretq_u8_u16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, +static inline int16x4_t convolve8_4_v(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { // Sample permutation is performed by the caller. 
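// Editorial note, not from the library sources: the vertical kernels hoist
// the permutation out of the dot product. Rows are transposed once into
// column-ordered blocks (transpose_concat_4x4/8x4 above), and because seven
// of the eight input rows are shared between consecutive output rows,
// kDotProdMergeBlockTbl merges each new row into the existing blocks instead
// of re-transposing the whole window.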
@@ -321,7 +321,7 @@ static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, +static inline uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, const uint8x16_t samples0_hi, const uint8x16_t samples1_lo, const uint8x16_t samples1_hi, @@ -340,7 +340,7 @@ static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE void convolve8_vert_8tap_neon_i8mm( +static inline void convolve8_vert_8tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); diff --git a/aom_dsp/arm/aom_filter.h b/aom_dsp/arm/aom_filter.h index 2573dd803d..54ca6eda6c 100644 --- a/aom_dsp/arm/aom_filter.h +++ b/aom_dsp/arm/aom_filter.h @@ -17,7 +17,7 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE int get_filter_taps_convolve8(const int16_t *filter) { +static inline int get_filter_taps_convolve8(const int16_t *filter) { if (filter[0] | filter[7]) { return 8; } diff --git a/aom_dsp/arm/aom_neon_sve2_bridge.h b/aom_dsp/arm/aom_neon_sve2_bridge.h index 5631fcfd11..458abbf262 100644 --- a/aom_dsp/arm/aom_neon_sve2_bridge.h +++ b/aom_dsp/arm/aom_neon_sve2_bridge.h @@ -26,7 +26,7 @@ // remainder of the vector is unused - this approach is still beneficial when // compared to a Neon-only solution. -static INLINE int16x8_t aom_tbl2_s16(int16x8_t s0, int16x8_t s1, +static inline int16x8_t aom_tbl2_s16(int16x8_t s0, int16x8_t s1, uint16x8_t tbl) { svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0), svset_neonq_s16(svundef_s16(), s1)); diff --git a/aom_dsp/arm/aom_neon_sve_bridge.h b/aom_dsp/arm/aom_neon_sve_bridge.h index 57650acd51..38dba10149 100644 --- a/aom_dsp/arm/aom_neon_sve_bridge.h +++ b/aom_dsp/arm/aom_neon_sve_bridge.h @@ -26,14 +26,14 @@ // remainder of the vector is unused - this approach is still beneficial when // compared to a Neon-only solution. 
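// Illustrative sketch, not from the library: a typical use of the bridge
// helpers defined just below. aom_udotq_u16() issues the SVE UDOT
// instruction on Neon-sized vectors, so a uint16 dot product accumulates
// directly into 64-bit lanes while loads and reductions stay in plain Neon.
// The helper name is hypothetical and n is assumed to be a multiple of 8.
static inline uint64_t sum_of_squares_u16_sketch(const uint16_t *src, int n) {
  uint64x2_t acc = vdupq_n_u64(0);
  for (int i = 0; i < n; i += 8) {
    const uint16x8_t s = vld1q_u16(src + i);
    acc = aom_udotq_u16(acc, s, s);  // acc += s[j] * s[j], widened to u64.
  }
  return vaddvq_u64(acc);  // Reduce the two 64-bit accumulator lanes.
}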
-static INLINE uint64x2_t aom_udotq_u16(uint64x2_t acc, uint16x8_t x, +static inline uint64x2_t aom_udotq_u16(uint64x2_t acc, uint16x8_t x, uint16x8_t y) { return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc), svset_neonq_u16(svundef_u16(), x), svset_neonq_u16(svundef_u16(), y))); } -static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { +static inline int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), svset_neonq_s16(svundef_s16(), x), svset_neonq_s16(svundef_s16(), y))); @@ -44,12 +44,12 @@ static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { svset_neonq_s16(svundef_s16(), s0), \ svset_neonq_s16(svundef_s16(), f), lane)) -static INLINE uint16x8_t aom_tbl_u16(uint16x8_t s, uint16x8_t tbl) { +static inline uint16x8_t aom_tbl_u16(uint16x8_t s, uint16x8_t tbl) { return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), s), svset_neonq_u16(svundef_u16(), tbl))); } -static INLINE int16x8_t aom_tbl_s16(int16x8_t s, uint16x8_t tbl) { +static inline int16x8_t aom_tbl_s16(int16x8_t s, uint16x8_t tbl) { return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), s), svset_neonq_u16(svundef_u16(), tbl))); } diff --git a/aom_dsp/arm/aom_scaled_convolve8_neon.c b/aom_dsp/arm/aom_scaled_convolve8_neon.c index 3c11133b87..31eb376d4b 100644 --- a/aom_dsp/arm/aom_scaled_convolve8_neon.c +++ b/aom_dsp/arm/aom_scaled_convolve8_neon.c @@ -17,7 +17,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "config/aom_dsp_rtcd.h" -static INLINE void scaled_convolve_horiz_neon( +static inline void scaled_convolve_horiz_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const x_filter, const int x0_q4, const int x_step_q4, int w, int h) { @@ -146,7 +146,7 @@ static INLINE void scaled_convolve_horiz_neon( } while (h > 0); } -static INLINE void scaled_convolve_vert_neon( +static inline void scaled_convolve_vert_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filter, const int y0_q4, const int y_step_q4, int w, int h) { diff --git a/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c index dd3431d883..d59b9ca3bb 100644 --- a/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c @@ -17,7 +17,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "config/aom_dsp_rtcd.h" -static INLINE uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, +static inline uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, int8x8_t filter) { int8x16_t filter_x2 = vcombine_s8(filter, filter); @@ -41,7 +41,7 @@ static INLINE uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, +static inline uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { @@ -74,7 +74,7 @@ static INLINE uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void scaled_convolve_horiz_neon_dotprod( +static inline void scaled_convolve_horiz_neon_dotprod( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t 
dst_stride, const InterpKernel *const x_filter, const int x0_q4, const int x_step_q4, int w, int h) { @@ -161,7 +161,7 @@ static INLINE void scaled_convolve_horiz_neon_dotprod( } while (h > 0); } -static INLINE uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, +static inline uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { @@ -189,7 +189,7 @@ static INLINE uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, FILTER_BITS - 1); } -static INLINE uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, +static inline uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { @@ -239,7 +239,7 @@ static INLINE uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void scaled_convolve_vert_neon_dotprod( +static inline void scaled_convolve_vert_neon_dotprod( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filter, const int y0_q4, const int y_step_q4, int w, int h) { diff --git a/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c index 6bf4396b07..2168d38360 100644 --- a/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c @@ -17,7 +17,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "config/aom_dsp_rtcd.h" -static INLINE uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, +static inline uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, int8x8_t filter) { int8x16_t filter_x2 = vcombine_s8(filter, filter); @@ -34,7 +34,7 @@ static INLINE uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, +static inline uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { @@ -58,7 +58,7 @@ static INLINE uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void scaled_convolve_horiz_neon_i8mm( +static inline void scaled_convolve_horiz_neon_i8mm( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const x_filter, const int x0_q4, const int x_step_q4, int w, int h) { @@ -145,7 +145,7 @@ static INLINE void scaled_convolve_horiz_neon_i8mm( } while (h > 0); } -static INLINE uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, +static inline uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { @@ -167,7 +167,7 @@ static INLINE uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, FILTER_BITS - 1); } -static INLINE uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, +static inline uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { @@ -204,7 +204,7 @@ static INLINE uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void 
scaled_convolve_vert_neon_i8mm( +static inline void scaled_convolve_vert_neon_i8mm( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filter, const int y0_q4, const int y_step_q4, int w, int h) { diff --git a/aom_dsp/arm/blend_neon.h b/aom_dsp/arm/blend_neon.h index 285dad7e39..36fb02bb7a 100644 --- a/aom_dsp/arm/blend_neon.h +++ b/aom_dsp/arm/blend_neon.h @@ -16,7 +16,7 @@ #include "aom_dsp/blend.h" -static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a, +static inline uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a, uint8x16_t b) { const uint8x16_t m_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m); @@ -32,7 +32,7 @@ static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a, return vcombine_u8(blend_u8_lo, blend_u8_hi); } -static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a, +static inline uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a, uint8x8_t b) { const uint8x8_t m_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m); @@ -44,7 +44,7 @@ static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a, +static inline uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b) { uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); @@ -63,7 +63,7 @@ static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a, return vcombine_u16(blend_u16_lo, blend_u16_hi); } -static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a, +static inline uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a, uint16x4_t b) { const uint16x4_t m_inv = vsub_u16(vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); @@ -75,19 +75,19 @@ static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a, } #endif // CONFIG_AV1_HIGHBITDEPTH -static INLINE uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) { +static inline uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) { return vrhadd_u8(a, b); } -static INLINE uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) { +static inline uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) { return vrhaddq_u8(a, b); } -static INLINE uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) { +static inline uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) { return vrshr_n_u8(vpadd_u8(a, b), 1); } -static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) { +static inline uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) { #if AOM_ARCH_AARCH64 return vrshrq_n_u8(vpaddq_u8(a, b), 1); #else @@ -97,14 +97,14 @@ static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) { #endif // AOM_ARCH_AARCH64 } -static INLINE uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b, +static inline uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) { uint8x8_t a_c = vpadd_u8(a, c); uint8x8_t b_d = vpadd_u8(b, d); return vrshr_n_u8(vqadd_u8(a_c, b_d), 2); } -static INLINE uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b, +static inline uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d) { #if AOM_ARCH_AARCH64 diff --git a/aom_dsp/arm/blk_sse_sum_neon.c b/aom_dsp/arm/blk_sse_sum_neon.c index 3275406a83..140d5e7708 100644 --- a/aom_dsp/arm/blk_sse_sum_neon.c +++ b/aom_dsp/arm/blk_sse_sum_neon.c @@ -18,7 +18,7
@@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" -static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride, +static inline void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { int i = bh; @@ -41,7 +41,7 @@ static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride, *x2_sum = horizontal_long_add_s32x4(sse); } -static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride, +static inline void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { int i = bh; @@ -67,7 +67,7 @@ static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride, *x2_sum = horizontal_long_add_s32x4(sse); } -static INLINE void get_blk_sse_sum_large_neon(const int16_t *data, int stride, +static inline void get_blk_sse_sum_large_neon(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum) { int32x4_t sum = vdupq_n_s32(0); diff --git a/aom_dsp/arm/blk_sse_sum_sve.c b/aom_dsp/arm/blk_sse_sum_sve.c index 399b4415e4..8d0878d3d5 100644 --- a/aom_dsp/arm/blk_sse_sum_sve.c +++ b/aom_dsp/arm/blk_sse_sum_sve.c @@ -18,7 +18,7 @@ #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" -static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride, +static inline void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { int32x4_t sum = vdupq_n_s32(0); @@ -39,7 +39,7 @@ static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride, *x2_sum = vaddvq_s64(sse); } -static INLINE void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride, +static inline void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; @@ -63,7 +63,7 @@ static INLINE void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride, *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1])); } -static INLINE void get_blk_sse_sum_large_sve(const int16_t *data, int stride, +static inline void get_blk_sse_sum_large_sve(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum) { int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; diff --git a/aom_dsp/arm/dist_wtd_avg_neon.h b/aom_dsp/arm/dist_wtd_avg_neon.h index 28fe81ce71..1ce6e5e639 100644 --- a/aom_dsp/arm/dist_wtd_avg_neon.h +++ b/aom_dsp/arm/dist_wtd_avg_neon.h @@ -17,7 +17,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "av1/common/enums.h" -static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b, +static inline uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b, uint8x8_t wta, uint8x8_t wtb) { uint16x8_t wtd_sum = vmull_u8(a, wta); @@ -26,7 +26,7 @@ static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b, return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS); } -static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b, +static inline uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b, uint16x4_t wta, uint16x4_t wtb) { uint32x4_t wtd_sum = vmull_u16(a, wta); @@ -35,7 +35,7 @@ static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b, return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS); } -static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b, +static inline uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b, uint8x16_t wta, uint8x16_t wtb) { uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta)); uint16x8_t wtd_sum_hi = 
vmull_u8(vget_high_u8(a), vget_high_u8(wta)); @@ -49,7 +49,7 @@ static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b, return vcombine_u8(wtd_avg_lo, wtd_avg_hi); } -static INLINE uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b, +static inline uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b, uint16x8_t wta, uint16x8_t wtb) { uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta)); uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta)); diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c index ef2cf4fdc7..4b100d23e8 100644 --- a/aom_dsp/arm/hadamard_neon.c +++ b/aom_dsp/arm/hadamard_neon.c @@ -16,7 +16,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" -static INLINE void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1, +static inline void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1, int16x4_t *a2, int16x4_t *a3) { const int16x4_t b0 = vhadd_s16(*a0, *a1); const int16x4_t b1 = vhsub_s16(*a0, *a1); diff --git a/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/aom_dsp/arm/highbd_blend_a64_mask_neon.c index 656cbe4ccf..2ba42489e3 100644 --- a/aom_dsp/arm/highbd_blend_a64_mask_neon.c +++ b/aom_dsp/arm/highbd_blend_a64_mask_neon.c @@ -20,7 +20,7 @@ #include "aom_dsp/blend.h" #define HBD_BLEND_A64_D16_MASK(bd, round0_bits) \ - static INLINE uint16x8_t alpha_##bd##_blend_a64_d16_u16x8( \ + static inline uint16x8_t alpha_##bd##_blend_a64_d16_u16x8( \ uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) { \ const uint16x8_t m_inv = \ vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); \ @@ -50,7 +50,7 @@ return blend_u16; \ } \ \ - static INLINE void highbd_##bd##_blend_a64_d16_mask_neon( \ + static inline void highbd_##bd##_blend_a64_d16_mask_neon( \ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, \ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, \ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, \ diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c index d1413b6402..d1c133c1e4 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.c +++ b/aom_dsp/arm/highbd_convolve8_neon.c @@ -25,11 +25,11 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -static INLINE uint16x4_t -highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, - const int16x8_t filter, const uint16x4_t max) { +static inline uint16x4_t highbd_convolve8_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, + const uint16x4_t max) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); @@ -47,11 +47,11 @@ highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, return vmin_u16(res, max); } -static INLINE uint16x8_t -highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - const int16x8_t filter, const uint16x8_t max) { +static inline uint16x8_t highbd_convolve8_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, + 
const uint16x8_t max) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); diff --git a/aom_dsp/arm/highbd_convolve8_neon.h b/aom_dsp/arm/highbd_convolve8_neon.h index 9c18f135f8..4267cf0de9 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.h +++ b/aom_dsp/arm/highbd_convolve8_neon.h @@ -17,7 +17,7 @@ #include "config/aom_config.h" #include "aom_dsp/arm/mem_neon.h" -static INLINE void highbd_convolve8_horiz_2tap_neon( +static inline void highbd_convolve8_horiz_2tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { // Bilinear filter values are all positive and multiples of 8. Divide by 8 to @@ -96,7 +96,7 @@ static INLINE void highbd_convolve8_horiz_2tap_neon( } } -static INLINE uint16x4_t highbd_convolve4_4( +static inline uint16x4_t highbd_convolve4_4( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t filter, const uint16x4_t max) { int32x4_t sum = vmull_lane_s16(s0, filter, 0); @@ -109,7 +109,7 @@ static INLINE uint16x4_t highbd_convolve4_4( return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve4_8( +static inline uint16x8_t highbd_convolve4_8( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, const uint16x8_t max) { int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter, 0); @@ -128,7 +128,7 @@ static INLINE uint16x8_t highbd_convolve4_8( return vminq_u16(res, max); } -static INLINE void highbd_convolve8_vert_4tap_neon( +static inline void highbd_convolve8_vert_4tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) { assert(w >= 4 && h >= 4); @@ -200,7 +200,7 @@ static INLINE void highbd_convolve8_vert_4tap_neon( } } -static INLINE void highbd_convolve8_vert_2tap_neon( +static inline void highbd_convolve8_vert_2tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { // Bilinear filter values are all positive and multiples of 8. 
Divide by 8 to diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c index a7d5ad8b73..b5db14be3b 100644 --- a/aom_dsp/arm/highbd_convolve8_sve.c +++ b/aom_dsp/arm/highbd_convolve8_sve.c @@ -21,7 +21,7 @@ #include "aom_dsp/arm/highbd_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" -static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter, +static inline uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter, uint16x4_t max) { int64x2_t sum[4]; @@ -39,7 +39,7 @@ static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter, return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter, +static inline uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter, uint16x8_t max) { int64x2_t sum[8]; @@ -65,7 +65,7 @@ static INLINE uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter, return vminq_u16(res, max); } -static INLINE void highbd_convolve8_horiz_8tap_sve( +static inline void highbd_convolve8_horiz_8tap_sve( const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, int bd) { @@ -140,7 +140,7 @@ DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { }; // clang-format on -static INLINE uint16x4_t highbd_convolve4_4_h(int16x8_t s, int16x8_t filter, +static inline uint16x4_t highbd_convolve4_4_h(int16x8_t s, int16x8_t filter, uint16x8x2_t permute_tbl, uint16x4_t max) { int16x8_t permuted_samples0 = aom_tbl_s16(s, permute_tbl.val[0]); @@ -157,7 +157,7 @@ static INLINE uint16x4_t highbd_convolve4_4_h(int16x8_t s, int16x8_t filter, return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve4_8_h(int16x8_t s[4], int16x8_t filter, +static inline uint16x8_t highbd_convolve4_8_h(int16x8_t s[4], int16x8_t filter, uint16x8_t idx, uint16x8_t max) { int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); @@ -175,7 +175,7 @@ static INLINE uint16x8_t highbd_convolve4_8_h(int16x8_t s[4], int16x8_t filter, return vminq_u16(res, max); } -static INLINE void highbd_convolve8_horiz_4tap_sve( +static inline void highbd_convolve8_horiz_4tap_sve( const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, int bd) { @@ -276,7 +276,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { 6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29 }; -static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, +static inline void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, int16x4_t s2, int16x4_t s3, int16x8_t res[2]) { // Transpose 16-bit elements and concatenate result rows as follows: @@ -302,7 +302,7 @@ static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, res[1] = vreinterpretq_s16_s32(s0123.val[1]); } -static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, +static inline void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, int16x8_t s2, int16x8_t s3, int16x8_t res[4]) { // Transpose 16-bit elements and concatenate result rows as follows: @@ -330,7 +330,7 @@ static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); } -static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], +static inline void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], uint8x16_t tbl, int16x8_t res[4]) { int8x16x2_t 
samples0 = { vreinterpretq_s8_s16(t0[0]), vreinterpretq_s8_s16(t1[0]) }; @@ -347,7 +347,7 @@ static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples3, tbl)); } -static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], +static inline void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], uint8x16_t tbl, int16x8_t res[2]) { int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]), vreinterpretq_s8_s16(t1[0]) }; @@ -358,7 +358,7 @@ static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl)); } -static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2], +static inline uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, uint16x4_t max) { @@ -377,7 +377,7 @@ static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2], return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4], +static inline uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, uint16x8_t max) { @@ -404,7 +404,7 @@ static INLINE uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4], return vminq_u16(res, max); } -static INLINE void highbd_convolve8_vert_8tap_sve( +static inline void highbd_convolve8_vert_8tap_sve( const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height, int bd) { diff --git a/aom_dsp/arm/highbd_hadamard_neon.c b/aom_dsp/arm/highbd_hadamard_neon.c index 9c576f2ded..273378f286 100644 --- a/aom_dsp/arm/highbd_hadamard_neon.c +++ b/aom_dsp/arm/highbd_hadamard_neon.c @@ -18,7 +18,7 @@ #include "aom_dsp/arm/sum_neon.h" #include "aom_ports/mem.h" -static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1, +static inline void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, @@ -51,7 +51,7 @@ static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1, *a5 = vsubq_s16(c3, c7); } -static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1, +static inline void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1, int16x4_t a2, int16x4_t a3, int16x4_t a4, int16x4_t a5, int16x4_t a6, int16x4_t a7, diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c index 71d133e814..5b8234c481 100644 --- a/aom_dsp/arm/highbd_intrapred_neon.c +++ b/aom_dsp/arm/highbd_intrapred_neon.c @@ -24,21 +24,21 @@ // ----------------------------------------------------------------------------- // DC -static INLINE void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h, +static inline void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x4_t dc) { for (int i = 0; i < h; ++i) { vst1_u16(dst + i * stride, dc); } } -static INLINE void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h, +static inline void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x8_t dc) { for (int i = 0; i < h; ++i) { vst1q_u16(dst + i * stride, dc); } } -static INLINE void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h, +static inline void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x8_t dc) { for (int i = 0; i < h; ++i) { vst1q_u16(dst + i * stride, dc); @@ -46,7 +46,7 @@ static INLINE void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h, 
} } -static INLINE void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h, +static inline void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x8_t dc) { for (int i = 0; i < h; ++i) { vst1q_u16(dst + i * stride, dc); @@ -56,7 +56,7 @@ static INLINE void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h, } } -static INLINE void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h, +static inline void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x8_t dc) { for (int i = 0; i < h; ++i) { vst1q_u16(dst + i * stride, dc); @@ -70,7 +70,7 @@ static INLINE void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h, } } -static INLINE uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) { +static inline uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) { // Need to assume input is up to 16 bits wide from dc 64x64 partial sum, so // promote first. const uint32x4_t b = vpaddlq_u16(a); @@ -84,7 +84,7 @@ static INLINE uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) { #endif } -static INLINE uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) { +static inline uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) { // Nothing to do since sum is already one vector, but saves needing to // special case w=4 or h=4 cases. The combine will be zero cost for a sane // compiler since vld1 already sets the top half of a vector to zero as part @@ -92,19 +92,19 @@ static INLINE uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) { return vcombine_u16(vld1_u16(left), vdup_n_u16(0)); } -static INLINE uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) { +static inline uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) { // Nothing to do since sum is already one vector, but saves needing to // special case w=8 or h=8 cases. 
return vld1q_u16(left); } -static INLINE uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) { +static inline uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) { const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits const uint16x8_t a1 = vld1q_u16(left + 8); return vaddq_u16(a0, a1); // up to 13 bits } -static INLINE uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) { +static inline uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) { const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits const uint16x8_t a1 = vld1q_u16(left + 8); const uint16x8_t a2 = vld1q_u16(left + 16); @@ -114,7 +114,7 @@ static INLINE uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) { return vaddq_u16(b0, b1); // up to 14 bits } -static INLINE uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) { +static inline uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) { const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits const uint16x8_t a1 = vld1q_u16(left + 8); const uint16x8_t a2 = vld1q_u16(left + 16); @@ -169,7 +169,7 @@ HIGHBD_DC_PREDICTOR(64, 64, 7) #undef HIGHBD_DC_PREDICTOR -static INLINE int divide_using_multiply_shift(int num, int shift1, +static inline int divide_using_multiply_shift(int num, int shift1, int multiplier, int shift2) { const int interm = num >> shift1; return interm * multiplier >> shift2; @@ -179,7 +179,7 @@ static INLINE int divide_using_multiply_shift(int num, int shift1, #define HIGHBD_DC_MULTIPLIER_1X4 0x6667 #define HIGHBD_DC_SHIFT2 17 -static INLINE int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1, +static inline int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1, uint32_t multiplier) { return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2); @@ -258,27 +258,27 @@ HIGHBD_DC_PREDICTOR_128(64, 64, q) // ----------------------------------------------------------------------------- // DC_LEFT -static INLINE uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) { +static inline uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) { const uint16x4_t a = vld1_u16(left); // up to 12 bits const uint16x4_t b = vpadd_u16(a, a); // up to 13 bits return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0)); } -static INLINE uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) { +static inline uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) { return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left)); } -static INLINE uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) { +static inline uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) { return horizontal_add_and_broadcast_long_u16x8( highbd_dc_load_partial_sum_16(left)); } -static INLINE uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) { +static inline uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) { return horizontal_add_and_broadcast_long_u16x8( highbd_dc_load_partial_sum_32(left)); } -static INLINE uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) { +static inline uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) { return horizontal_add_and_broadcast_long_u16x8( highbd_dc_load_partial_sum_64(left)); } @@ -364,7 +364,7 @@ DC_PREDICTOR_TOP(64, 64, 6, q) vertical##W##xh_neon(dst, stride, above, H); \ } -static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) { +static inline uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) { uint16x8x2_t x; // Clang/gcc uses ldp here. 
x.val[0] = vld1q_u16(ptr); @@ -372,12 +372,12 @@ static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) { return x; } -static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) { +static inline void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) { vst1q_u16(ptr, x.val[0]); vst1q_u16(ptr + 8, x.val[1]); } -static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride, +static inline void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x4_t row = vld1_u16(above); int y = height; @@ -389,7 +389,7 @@ static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride, } while (y != 0); } -static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride, +static inline void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x8_t row = vld1q_u16(above); int y = height; @@ -401,7 +401,7 @@ static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride, } while (y != 0); } -static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride, +static inline void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x8x2_t row = load_uint16x8x2(above); int y = height; @@ -413,7 +413,7 @@ static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride, } while (y != 0); } -static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) { +static inline uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) { uint16x8x4_t x; // Clang/gcc uses ldp here. x.val[0] = vld1q_u16(ptr); @@ -423,14 +423,14 @@ static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) { return x; } -static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) { +static inline void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) { vst1q_u16(ptr, x.val[0]); vst1q_u16(ptr + 8, x.val[1]); vst1q_u16(ptr + 16, x.val[2]); vst1q_u16(ptr + 24, x.val[3]); } -static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride, +static inline void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x8x4_t row = load_uint16x8x4(above); int y = height; @@ -442,7 +442,7 @@ static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride, } while (y != 0); } -static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride, +static inline void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { uint16_t *dst32 = dst + 32; const uint16x8x4_t row = load_uint16x8x4(above); @@ -486,7 +486,7 @@ HIGHBD_V_NXM(64, 64) // ----------------------------------------------------------------------------- // H_PRED -static INLINE void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0)); vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1)); @@ -494,7 +494,7 @@ static INLINE void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride, vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3)); } -static INLINE void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0)); vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1)); @@ -502,12 +502,12 @@ static INLINE void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride, vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3)); } -static INLINE void 
highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) { +static inline void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) { vst1q_u16(dst + 0, left); vst1q_u16(dst + 8, left); } -static INLINE void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0)); highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1)); @@ -515,14 +515,14 @@ static INLINE void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride, highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3)); } -static INLINE void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) { +static inline void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) { vst1q_u16(dst + 0, left); vst1q_u16(dst + 8, left); vst1q_u16(dst + 16, left); vst1q_u16(dst + 24, left); } -static INLINE void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0)); highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1)); @@ -530,7 +530,7 @@ static INLINE void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride, highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3)); } -static INLINE void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) { +static inline void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) { vst1q_u16(dst + 0, left); vst1q_u16(dst + 8, left); vst1q_u16(dst + 16, left); @@ -541,7 +541,7 @@ static INLINE void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) { vst1q_u16(dst + 56, left); } -static INLINE void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0)); highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1)); @@ -650,7 +650,7 @@ HIGHBD_H_WXH_LARGE(64, 64) // ----------------------------------------------------------------------------- // PAETH -static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride, +static inline void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, int width, int height) { @@ -715,7 +715,7 @@ HIGHBD_PAETH_NXM(8, 16) HIGHBD_PAETH_NXM(8, 32) // Select the closest values and collect them. 
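For reference, the selection that select_paeth vectorizes below is the standard Paeth rule: predict from whichever neighbor is closest to base = left + top - top_left. A scalar sketch (illustrative only, not code from this patch):

    #include <stdlib.h>
    // Standard Paeth selection in scalar form.
    static int paeth_select(int left, int top, int top_left) {
      const int base = left + top - top_left;
      const int d_left = abs(base - left);        // == abs(top - top_left)
      const int d_top = abs(base - top);          // == abs(left - top_left)
      const int d_top_left = abs(base - top_left);
      if (d_left <= d_top && d_left <= d_top_left) return left;
      return (d_top <= d_top_left) ? top : top_left;
    }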
-static INLINE uint16x8_t select_paeth(const uint16x8_t top, +static inline uint16x8_t select_paeth(const uint16x8_t top, const uint16x8_t left, const uint16x8_t top_left, const uint16x8_t left_le_top, @@ -752,7 +752,7 @@ static INLINE uint16x8_t select_paeth(const uint16x8_t top, #define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8)) -static INLINE void highbd_paeth16_plus_x_h_neon( +static inline void highbd_paeth16_plus_x_h_neon( uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, int width, int height) { const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); @@ -815,11 +815,11 @@ HIGHBD_PAETH_NXM_WIDE(64, 64) // SMOOTH // 256 - v = vneg_s8(v) -static INLINE uint16x4_t negate_s8(const uint16x4_t v) { +static inline uint16x4_t negate_s8(const uint16x4_t v) { return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v))); } -static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { @@ -851,7 +851,7 @@ static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride, } // Common code between 8xH and [16|32|64]xH. -static INLINE void highbd_calculate_pred8( +static inline void highbd_calculate_pred8( uint16_t *dst, const uint32x4_t weighted_corners_low, const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals, const uint16x4x2_t weights_x, const uint16_t left_y, @@ -1136,7 +1136,7 @@ HIGHBD_SMOOTH_V_NXM_WIDE(64, 64) #undef HIGHBD_SMOOTH_V_NXM_WIDE -static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { @@ -1154,7 +1154,7 @@ static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { @@ -1307,7 +1307,7 @@ static const uint8_t kLoadMaxShuffles[] = { }; // clang-format on -static INLINE uint16x8_t zn_load_masked_neon(const uint16_t *ptr, +static inline uint16x8_t zn_load_masked_neon(const uint16_t *ptr, int shuffle_idx) { uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]); uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr)); @@ -2497,7 +2497,7 @@ void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw, vrshrn_n_u32(val_hi, (shift))); \ } while (0) -static INLINE uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs, +static inline uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs, int max_ofs) { uint16x8_t r0; uint16x8_t r1; diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c index 9f38bccee8..b2fcc51251 100644 --- a/aom_dsp/arm/highbd_loopfilter_neon.c +++ b/aom_dsp/arm/highbd_loopfilter_neon.c @@ -16,12 +16,12 @@ #include "aom/aom_integer.h" #include "aom_dsp/arm/transpose_neon.h" -static INLINE int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low, +static inline int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low, const int16x4_t high) { return vmin_s16(vmax_s16(val, low), high); } -static INLINE uint16x8_t convert_to_unsigned_pixel_u16(int16x8_t val, +static inline uint16x8_t 
convert_to_unsigned_pixel_u16(int16x8_t val, int bitdepth) { const int16x8_t low = vdupq_n_s16(0); const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1); @@ -30,14 +30,14 @@ static INLINE uint16x8_t convert_to_unsigned_pixel_u16(int16x8_t val, } // (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) -static INLINE uint16x4_t hev(const uint16x8_t abd_p0p1_q0q1, +static inline uint16x4_t hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) { const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh)); return vorr_u16(vget_low_u16(a), vget_high_u16(a)); } // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh -static INLINE uint16x4_t outer_threshold(const uint16x4_t p1, +static inline uint16x4_t outer_threshold(const uint16x4_t p1, const uint16x4_t p0, const uint16x4_t q0, const uint16x4_t q1, @@ -52,7 +52,7 @@ static INLINE uint16x4_t outer_threshold(const uint16x4_t p1, // abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && // outer_threshold() -static INLINE uint16x4_t needs_filter4(const uint16x8_t abd_p0p1_q0q1, +static inline uint16x4_t needs_filter4(const uint16x8_t abd_p0p1_q0q1, const uint16_t inner_thresh, const uint16x4_t outer_mask) { const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh)); @@ -63,7 +63,7 @@ static INLINE uint16x4_t needs_filter4(const uint16x8_t abd_p0p1_q0q1, // abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && // abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh && // outer_threshold() -static INLINE uint16x4_t needs_filter6(const uint16x8_t abd_p0p1_q0q1, +static inline uint16x4_t needs_filter6(const uint16x8_t abd_p0p1_q0q1, const uint16x8_t abd_p1p2_q1q2, const uint16_t inner_thresh, const uint16x4_t outer_mask) { @@ -77,7 +77,7 @@ static INLINE uint16x4_t needs_filter6(const uint16x8_t abd_p0p1_q0q1, // abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && // abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh // outer_threshold() -static INLINE uint16x4_t needs_filter8(const uint16x8_t abd_p0p1_q0q1, +static inline uint16x4_t needs_filter8(const uint16x8_t abd_p0p1_q0q1, const uint16x8_t abd_p1p2_q1q2, const uint16x8_t abd_p2p3_q2q3, const uint16_t inner_thresh, @@ -92,7 +92,7 @@ static INLINE uint16x4_t needs_filter8(const uint16x8_t abd_p0p1_q0q1, // ----------------------------------------------------------------------------- // filterN_masks functions. -static INLINE void filter4_masks(const uint16x8_t p0q0, const uint16x8_t p1q1, +static inline void filter4_masks(const uint16x8_t p0q0, const uint16x8_t p1q1, const uint16_t hev_thresh, const uint16x4_t outer_mask, const uint16_t inner_thresh, @@ -112,7 +112,7 @@ static INLINE void filter4_masks(const uint16x8_t p0q0, const uint16x8_t p1q1, // abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && // abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh // |flat_thresh| == 4 for 10 bit decode. 
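Since flat_thresh is computed as 1 << (bitdepth - 8) just below, the threshold works out to 1 for 8-bit, 4 for 10-bit (the value noted above), and 16 for 12-bit decode.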
-static INLINE uint16x4_t is_flat3(const uint16x8_t abd_p0p1_q0q1, +static inline uint16x4_t is_flat3(const uint16x8_t abd_p0p1_q0q1, const uint16x8_t abd_p0p2_q0q2, const int bitdepth) { const int flat_thresh = 1 << (bitdepth - 8); @@ -121,7 +121,7 @@ static INLINE uint16x4_t is_flat3(const uint16x8_t abd_p0p1_q0q1, return vand_u16(vget_low_u16(b), vget_high_u16(b)); } -static INLINE void filter6_masks( +static inline void filter6_masks( const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, const uint16_t hev_thresh, const uint16x4_t outer_mask, const uint16_t inner_thresh, const int bitdepth, @@ -139,7 +139,7 @@ static INLINE void filter6_masks( // abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh && // abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh // |flat_thresh| == 4 for 10 bit decode. -static INLINE uint16x4_t is_flat4(const uint16x8_t abd_pnp0_qnq0, +static inline uint16x4_t is_flat4(const uint16x8_t abd_pnp0_qnq0, const uint16x8_t abd_pn1p0_qn1q0, const uint16x8_t abd_pn2p0_qn2q0, const int bitdepth) { @@ -150,7 +150,7 @@ static INLINE uint16x4_t is_flat4(const uint16x8_t abd_pnp0_qnq0, return vand_u16(vget_low_u16(c), vget_high_u16(c)); } -static INLINE void filter8_masks( +static inline void filter8_masks( const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, const uint16_t hev_thresh, const uint16x4_t outer_mask, const uint16_t inner_thresh, @@ -175,7 +175,7 @@ static INLINE void filter8_masks( // filterN functions. // Calculate filter4() or filter2() based on |hev_mask|. -static INLINE void filter4(const uint16x8_t p0q0, const uint16x8_t p0q1, +static inline void filter4(const uint16x8_t p0q0, const uint16x8_t p0q1, const uint16x8_t p1q1, const uint16x4_t hev_mask, int bitdepth, uint16x8_t *const p1q1_result, uint16x8_t *const p0q0_result) { @@ -361,7 +361,7 @@ void aom_highbd_lpf_vertical_4_dual_neon( bd); } -static INLINE void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, +static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, uint16x8_t *const p1q1_output, uint16x8_t *const p0q0_output) { // Sum p1 and q1 output from opposite directions. 
@@ -592,7 +592,7 @@ void aom_highbd_lpf_vertical_6_dual_neon( bd); } -static INLINE void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, +static inline void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, uint16x8_t *const p2q2_output, uint16x8_t *const p1q1_output, @@ -742,7 +742,7 @@ void aom_highbd_lpf_horizontal_8_dual_neon( aom_highbd_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); } -static INLINE uint16x8_t reverse_low_half(const uint16x8_t a) { +static inline uint16x8_t reverse_low_half(const uint16x8_t a) { return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a)); } @@ -846,7 +846,7 @@ void aom_highbd_lpf_vertical_8_dual_neon( bd); } -static INLINE void filter14( +static inline void filter14( const uint16x8_t p6q6, const uint16x8_t p5q5, const uint16x8_t p4q4, const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, uint16x8_t *const p5q5_output, @@ -1080,7 +1080,7 @@ void aom_highbd_lpf_horizontal_14_dual_neon( aom_highbd_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); } -static INLINE uint16x8x2_t permute_acdb64(const uint16x8_t ab, +static inline uint16x8x2_t permute_acdb64(const uint16x8_t ab, const uint16x8_t cd) { uint16x8x2_t acdb; #if AOM_ARCH_AARCH64 diff --git a/aom_dsp/arm/highbd_masked_sad_neon.c b/aom_dsp/arm/highbd_masked_sad_neon.c index 89dda4c100..f8ade5f2f8 100644 --- a/aom_dsp/arm/highbd_masked_sad_neon.c +++ b/aom_dsp/arm/highbd_masked_sad_neon.c @@ -20,7 +20,7 @@ #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/blend.h" -static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, +static inline uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, const uint16_t *src, const uint16_t *a, const uint16_t *b, @@ -35,7 +35,7 @@ static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, return vaddq_u16(sad, vabdq_u16(blend_u16, s0)); } -static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad, +static inline uint16x8_t masked_sad_16x1_neon(uint16x8_t sad, const uint16_t *src, const uint16_t *a, const uint16_t *b, @@ -44,7 +44,7 @@ static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad, return masked_sad_8x1_neon(sad, &src[8], &a[8], &b[8], &m[8]); } -static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad, +static inline uint16x8_t masked_sad_32x1_neon(uint16x8_t sad, const uint16_t *src, const uint16_t *a, const uint16_t *b, @@ -53,7 +53,7 @@ static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad, return masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]); } -static INLINE unsigned int masked_sad_128xh_large_neon( +static inline unsigned int masked_sad_128xh_large_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { @@ -92,7 +92,7 @@ static INLINE unsigned int masked_sad_128xh_large_neon( return horizontal_add_u32x4(sad_u32[0]); } -static INLINE unsigned int masked_sad_64xh_large_neon( +static inline unsigned int masked_sad_64xh_large_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { @@ -121,7 +121,7 @@ static INLINE unsigned int masked_sad_64xh_large_neon( return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1])); } -static INLINE unsigned int masked_sad_32xh_large_neon( +static inline unsigned int masked_sad_32xh_large_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int 
a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { @@ -148,7 +148,7 @@ static INLINE unsigned int masked_sad_32xh_large_neon( return horizontal_add_u32x4(sad_u32); } -static INLINE unsigned int masked_sad_16xh_large_neon( +static inline unsigned int masked_sad_16xh_large_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { @@ -177,7 +177,7 @@ static INLINE unsigned int masked_sad_16xh_large_neon( } #if !CONFIG_REALTIME_ONLY -static INLINE unsigned int masked_sad_8xh_large_neon( +static inline unsigned int masked_sad_8xh_large_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { @@ -206,7 +206,7 @@ static INLINE unsigned int masked_sad_8xh_large_neon( } #endif // !CONFIG_REALTIME_ONLY -static INLINE unsigned int masked_sad_16xh_small_neon( +static inline unsigned int masked_sad_16xh_small_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { @@ -231,7 +231,7 @@ static INLINE unsigned int masked_sad_16xh_small_neon( return horizontal_add_u16x8(sad); } -static INLINE unsigned int masked_sad_8xh_small_neon( +static inline unsigned int masked_sad_8xh_small_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { @@ -256,7 +256,7 @@ static INLINE unsigned int masked_sad_8xh_small_neon( return horizontal_add_u16x8(sad); } -static INLINE unsigned int masked_sad_4xh_small_neon( +static inline unsigned int masked_sad_4xh_small_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { diff --git a/aom_dsp/arm/highbd_obmc_sad_neon.c b/aom_dsp/arm/highbd_obmc_sad_neon.c index 2adf1dedca..03c074f505 100644 --- a/aom_dsp/arm/highbd_obmc_sad_neon.c +++ b/aom_dsp/arm/highbd_obmc_sad_neon.c @@ -18,7 +18,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" -static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref, +static inline void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref, const int32_t *mask, const int32_t *wsrc, uint32x4_t *sum) { @@ -42,7 +42,7 @@ static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref, *sum = vrsraq_n_u32(*sum, abs_hi, 12); } -static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref, +static inline unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, @@ -64,7 +64,7 @@ static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref, return horizontal_add_u32x4(sum); } -static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref, +static inline unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, @@ -85,7 +85,7 @@ static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref, return horizontal_add_u32x4(sum); } -static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref, +static inline unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, @@ -113,7 +113,7 @@ static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref, return horizontal_add_u32x4(vaddq_u32(sum[0], 
sum[1])); } -static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref, +static inline unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, @@ -121,7 +121,7 @@ static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref, return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h); } -static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref, +static inline unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, @@ -152,7 +152,7 @@ static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref, return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2])); } -static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref, +static inline unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, @@ -160,7 +160,7 @@ static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref, return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h); } -static INLINE unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref, +static inline unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, diff --git a/aom_dsp/arm/highbd_obmc_variance_neon.c b/aom_dsp/arm/highbd_obmc_variance_neon.c index 9088cf5a68..b26052ec56 100644 --- a/aom_dsp/arm/highbd_obmc_variance_neon.c +++ b/aom_dsp/arm/highbd_obmc_variance_neon.c @@ -18,7 +18,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" -static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre, +static inline void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre, const int32_t *wsrc, const int32_t *mask, uint32x4_t *sse, @@ -61,7 +61,7 @@ static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre, // 32-bit elements (4095*4095*256 = 4292870400) before we have to accumulate // into 64-bit elements. Therefore blocks of size 32x64, 64x32, 64x64, 64x128, // 128x64, 128x128 are processed in a different helper function. 
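The bound quoted in the comment above is easy to verify with a few lines of standalone C (a sanity-check sketch, not part of the patch):

    #include <assert.h>
    #include <stdint.h>
    int main(void) {
      const uint64_t max_sq_diff = 4095ULL * 4095ULL;  // 16769025, worst 12-bit case
      assert(max_sq_diff * 256 == 4292870400ULL);      // the product quoted above
      assert(max_sq_diff * 256 <= UINT32_MAX);         // still fits in a 32-bit lane
      assert(max_sq_diff * 257 > UINT32_MAX);          // one more term would overflow
      return 0;
    }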
-static INLINE void highbd_obmc_variance_xlarge_neon( +static inline void highbd_obmc_variance_xlarge_neon( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int h, int h_limit, uint64_t *sse, int64_t *sum) { @@ -108,28 +108,28 @@ static INLINE void highbd_obmc_variance_xlarge_neon( *sum = horizontal_long_add_s32x4(sum_s32); } -static INLINE void highbd_obmc_variance_xlarge_neon_128xh( +static inline void highbd_obmc_variance_xlarge_neon_128xh( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 128, h, 16, sse, sum); } -static INLINE void highbd_obmc_variance_xlarge_neon_64xh( +static inline void highbd_obmc_variance_xlarge_neon_64xh( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 64, h, 32, sse, sum); } -static INLINE void highbd_obmc_variance_xlarge_neon_32xh( +static inline void highbd_obmc_variance_xlarge_neon_32xh( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 32, h, 64, sse, sum); } -static INLINE void highbd_obmc_variance_large_neon( +static inline void highbd_obmc_variance_large_neon( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) { uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); @@ -158,14 +158,14 @@ static INLINE void highbd_obmc_variance_large_neon( *sum = horizontal_long_add_s32x4(sum_s32); } -static INLINE void highbd_obmc_variance_neon_128xh( +static inline void highbd_obmc_variance_neon_128xh( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse, sum); } -static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre, +static inline void highbd_obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, @@ -173,7 +173,7 @@ static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre, highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum); } -static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre, +static inline void highbd_obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, @@ -181,7 +181,7 @@ static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre, highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum); } -static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre, +static inline void highbd_obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, @@ -189,7 +189,7 @@ static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre, highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum); } -static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8, +static inline void highbd_obmc_variance_neon_8xh(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, @@ -212,7 +212,7 @@ static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8, *sum = horizontal_long_add_s32x4(sum_s32); } -static INLINE void 
highbd_obmc_variance_neon_4xh(const uint8_t *pre8, +static inline void highbd_obmc_variance_neon_4xh(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, @@ -237,19 +237,19 @@ static INLINE void highbd_obmc_variance_neon_4xh(const uint8_t *pre8, *sum = horizontal_long_add_s32x4(sum_s32); } -static INLINE void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64, +static inline void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64, int *sum, unsigned int *sse) { *sum = (int)sum64; *sse = (unsigned int)sse64; } -static INLINE void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64, +static inline void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64, int *sum, unsigned int *sse) { *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } -static INLINE void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64, +static inline void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64, int *sum, unsigned int *sse) { *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); diff --git a/aom_dsp/arm/highbd_quantize_neon.c b/aom_dsp/arm/highbd_quantize_neon.c index f4e8e6c524..7fe236a86e 100644 --- a/aom_dsp/arm/highbd_quantize_neon.c +++ b/aom_dsp/arm/highbd_quantize_neon.c @@ -18,7 +18,7 @@ #include "aom_dsp/quantize.h" -static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) { +static inline uint32_t sum_abs_coeff(const uint32x4_t a) { #if AOM_ARCH_AARCH64 return vaddvq_u32(a); #else @@ -28,11 +28,11 @@ static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) { #endif } -static INLINE uint16x4_t -quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, - int32x4_t v_dequant_s32, int32x4_t v_round_s32, int32x4_t v_zbin_s32, - int32x4_t v_quant_shift_s32, int log_scale) { +static inline uint16x4_t quantize_4( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, int32x4_t v_dequant_s32, + int32x4_t v_round_s32, int32x4_t v_zbin_s32, int32x4_t v_quant_shift_s32, + int log_scale) { const int32x4_t v_coeff = vld1q_s32(coeff_ptr); const int32x4_t v_coeff_sign = vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); @@ -72,7 +72,7 @@ quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, return vmovn_u32(nz_qcoeff_mask); } -static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, +static inline int16x8_t get_max_lane_eob(const int16_t *iscan, int16x8_t v_eobmax, uint16x8_t v_mask) { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); @@ -82,7 +82,7 @@ static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, } #if !CONFIG_REALTIME_ONLY -static INLINE void get_min_max_lane_eob(const int16_t *iscan, +static inline void get_min_max_lane_eob(const int16_t *iscan, int16x8_t *v_eobmin, int16x8_t *v_eobmax, uint16x8_t v_mask, intptr_t n_coeffs) { @@ -99,7 +99,7 @@ static INLINE void get_min_max_lane_eob(const int16_t *iscan, } #endif // !CONFIG_REALTIME_ONLY -static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +static inline uint16_t get_max_eob(int16x8_t v_eobmax) { #if AOM_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else @@ -118,7 +118,7 @@ static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { } #if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY -static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) { +static inline uint16_t get_min_eob(int16x8_t v_eobmin) { #if AOM_ARCH_AARCH64 return 
(uint16_t)vminvq_s16(v_eobmin); #else diff --git a/aom_dsp/arm/highbd_sad_neon.c b/aom_dsp/arm/highbd_sad_neon.c index 7de38e544f..87b5951be8 100644 --- a/aom_dsp/arm/highbd_sad_neon.c +++ b/aom_dsp/arm/highbd_sad_neon.c @@ -19,7 +19,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" -static INLINE uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr, +static inline uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -40,7 +40,7 @@ static INLINE uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr, return horizontal_add_u32x4(sum); } -static INLINE uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr, +static inline uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -62,7 +62,7 @@ static INLINE uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr, } #if !CONFIG_REALTIME_ONLY -static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr, +static inline uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -85,7 +85,7 @@ static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr, } #endif // !CONFIG_REALTIME_ONLY -static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr, +static inline uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -113,7 +113,7 @@ static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr, return horizontal_add_u32x4(sum[0]); } -static INLINE uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr, +static inline uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int w, int h) { @@ -160,7 +160,7 @@ static INLINE uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr, return horizontal_add_u32x4(sum[0]); } -static INLINE unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr, +static inline unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -168,7 +168,7 @@ static INLINE unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr, h); } -static INLINE unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr, +static inline unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -176,7 +176,7 @@ static INLINE unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr, h); } -static INLINE unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr, +static inline unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -286,7 +286,7 @@ HBD_SAD_SKIP_WXH_LARGE_NEON(32, 8) HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY -static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, +static inline uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, @@ -313,7 +313,7 @@ static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, return horizontal_add_u32x4(sum); } -static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, +static inline uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, @@ -341,7 +341,7 @@ 
static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, return horizontal_add_u32x4(sum); } -static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, +static inline uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, @@ -379,7 +379,7 @@ static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, return horizontal_add_u32x4(sum[0]); } -static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, +static inline uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int w, int h, @@ -440,14 +440,14 @@ static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, return horizontal_add_u32x4(sum[0]); } -static INLINE unsigned int highbd_sad128xh_avg_neon( +static inline unsigned int highbd_sad128xh_avg_neon( const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h, second_pred); } -static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, +static inline unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, @@ -456,7 +456,7 @@ static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, second_pred); } -static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, +static inline unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, diff --git a/aom_dsp/arm/highbd_sadxd_neon.c b/aom_dsp/arm/highbd_sadxd_neon.c index f4f2b77b49..1fd5681b59 100644 --- a/aom_dsp/arm/highbd_sadxd_neon.c +++ b/aom_dsp/arm/highbd_sadxd_neon.c @@ -19,7 +19,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" -static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr, +static inline void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -51,7 +51,7 @@ static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr, vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } -static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr, +static inline void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -84,14 +84,14 @@ static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr, vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); } -static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, +static inline void sad8_neon(uint16x8_t src, uint16x8_t ref, uint32x4_t *const sad_sum) { uint16x8_t abs_diff = vabdq_u16(src, ref); *sad_sum = vpadalq_u16(*sad_sum, abs_diff); } #if !CONFIG_REALTIME_ONLY -static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -119,7 +119,7 @@ static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr, } #endif // !CONFIG_REALTIME_ONLY -static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -160,7 +160,7 @@ static INLINE void 
highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr, vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } -static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -226,14 +226,14 @@ static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr, vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } -static INLINE void highbd_sad128xhx4d_large_neon( +static inline void highbd_sad128xhx4d_large_neon( const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 128, h); } -static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -242,7 +242,7 @@ static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr, h); } -static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -361,7 +361,7 @@ HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY -static INLINE void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr, +static inline void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -391,7 +391,7 @@ static INLINE void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr, res[2] = horizontal_add_u32x4(sum[2]); } -static INLINE void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr, +static inline void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -419,7 +419,7 @@ static INLINE void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr, } #if !CONFIG_REALTIME_ONLY -static INLINE void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -450,7 +450,7 @@ static INLINE void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr, } #endif // !CONFIG_REALTIME_ONLY -static INLINE void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -482,7 +482,7 @@ static INLINE void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr, res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2])); } -static INLINE void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -540,14 +540,14 @@ static INLINE void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr, res[2] = horizontal_add_u32x4(sum[2]); } -static INLINE void highbd_sad128xhx3d_large_neon( +static inline void highbd_sad128xhx3d_large_neon( const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 128, h); 
} -static INLINE void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -556,7 +556,7 @@ static INLINE void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr, h); } -static INLINE void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr, +static inline void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], diff --git a/aom_dsp/arm/highbd_sse_neon.c b/aom_dsp/arm/highbd_sse_neon.c index 3d9b07a9f5..48d4763a07 100644 --- a/aom_dsp/arm/highbd_sse_neon.c +++ b/aom_dsp/arm/highbd_sse_neon.c @@ -14,7 +14,7 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/sum_neon.h" -static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src, +static inline void highbd_sse_8x1_init_neon(const uint16_t *src, const uint16_t *ref, uint32x4_t *sse_acc0, uint32x4_t *sse_acc1) { @@ -29,7 +29,7 @@ static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src, *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi); } -static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, +static inline void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, uint32x4_t *sse_acc0, uint32x4_t *sse_acc1) { uint16x8_t s = vld1q_u16(src); @@ -43,7 +43,7 @@ static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); } -static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint32x4_t sse[16]; @@ -92,7 +92,7 @@ static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, return horizontal_long_add_u32x4_x16(sse); } -static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint32x4_t sse[8]; @@ -125,7 +125,7 @@ static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, return horizontal_long_add_u32x4_x8(sse); } -static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint32x4_t sse[8]; @@ -150,7 +150,7 @@ static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride, return horizontal_long_add_u32x4_x8(sse); } -static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint32x4_t sse[4]; @@ -171,7 +171,7 @@ static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, return horizontal_long_add_u32x4_x4(sse); } -static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint32x4_t sse[2]; @@ -190,7 +190,7 @@ static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, return horizontal_long_add_u32x4_x2(sse); } -static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, +static inline int64_t 
highbd_sse_4xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { // Peel the first loop iteration. @@ -217,7 +217,7 @@ static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, return horizontal_long_add_u32x4(sse); } -static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int width, int height) { // { 0, 1, 2, 3, 4, 5, 6, 7 } diff --git a/aom_dsp/arm/highbd_sse_sve.c b/aom_dsp/arm/highbd_sse_sve.c index c2ad589beb..f9a5e2b62d 100644 --- a/aom_dsp/arm/highbd_sse_sve.c +++ b/aom_dsp/arm/highbd_sse_sve.c @@ -15,7 +15,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "config/aom_dsp_rtcd.h" -static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, +static inline void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, uint64x2_t *sse) { uint16x8_t s = vld1q_u16(src); uint16x8_t r = vld1q_u16(ref); @@ -25,7 +25,7 @@ static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, *sse = aom_udotq_u16(*sse, abs_diff, abs_diff); } -static INLINE int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), @@ -59,7 +59,7 @@ static INLINE int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride, return vaddvq_u64(sse[0]); } -static INLINE int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), @@ -85,7 +85,7 @@ static INLINE int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride, return vaddvq_u64(sse[0]); } -static INLINE int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), @@ -107,7 +107,7 @@ static INLINE int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride, return vaddvq_u64(sse[0]); } -static INLINE int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; @@ -123,7 +123,7 @@ static INLINE int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride, return vaddvq_u64(vaddq_u64(sse[0], sse[1])); } -static INLINE int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; @@ -140,7 +140,7 @@ static INLINE int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride, return vaddvq_u64(vaddq_u64(sse[0], sse[1])); } -static INLINE int64_t highbd_sse_4xh_sve(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_4xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse = vdupq_n_u64(0); @@ -160,7 +160,7 @@ static INLINE int64_t highbd_sse_4xh_sve(const 
uint16_t *src, int src_stride, return vaddvq_u64(sse); } -static INLINE int64_t highbd_sse_wxh_sve(const uint16_t *src, int src_stride, +static inline int64_t highbd_sse_wxh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int width, int height) { svuint64_t sse = svdup_n_u64(0); diff --git a/aom_dsp/arm/highbd_variance_neon.c b/aom_dsp/arm/highbd_variance_neon.c index 984780d25c..efbf6dc626 100644 --- a/aom_dsp/arm/highbd_variance_neon.c +++ b/aom_dsp/arm/highbd_variance_neon.c @@ -21,7 +21,7 @@ #include "aom_dsp/variance.h" // Process a block of width 4 two rows at a time. -static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr, +static inline void highbd_variance_4xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int h, @@ -52,7 +52,7 @@ static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr, // For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all // block sizes can be processed in 32-bit elements (1023*1023*128*32 = // 4286582784 for a 128x128 block). -static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr, +static inline void highbd_variance_large_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, @@ -87,14 +87,14 @@ static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr, vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1]))); } -static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, +static inline void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum); } -static INLINE void highbd_variance_16xh_neon(const uint16_t *src, +static inline void highbd_variance_16xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, @@ -102,7 +102,7 @@ static INLINE void highbd_variance_16xh_neon(const uint16_t *src, highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum); } -static INLINE void highbd_variance_32xh_neon(const uint16_t *src, +static inline void highbd_variance_32xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, @@ -110,7 +110,7 @@ static INLINE void highbd_variance_32xh_neon(const uint16_t *src, highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); } -static INLINE void highbd_variance_64xh_neon(const uint16_t *src, +static inline void highbd_variance_64xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, @@ -118,7 +118,7 @@ static INLINE void highbd_variance_64xh_neon(const uint16_t *src, highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); } -static INLINE void highbd_variance_128xh_neon(const uint16_t *src, +static inline void highbd_variance_128xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, @@ -136,7 +136,7 @@ static INLINE void highbd_variance_128xh_neon(const uint16_t *src, // Process a block of any size where the width is divisible by 8, with // accumulation into 64-bit elements. 
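Pairing rows keeps the vector unit fully occupied: a width-4 block only half-fills a 128-bit register, so two rows are packed into one uint16x8_t per iteration. Roughly (a hypothetical helper for illustration, not from this patch):

    #include <arm_neon.h>
    // Two 4-element rows combined so all eight 16-bit lanes do useful work.
    static inline uint16x8_t load_4x2(const uint16_t *p, int stride) {
      return vcombine_u16(vld1_u16(p), vld1_u16(p + stride));
    }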
-static INLINE void highbd_variance_xlarge_neon( +static inline void highbd_variance_xlarge_neon( const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) { int32x4_t sum_s32 = vdupq_n_s32(0); @@ -181,21 +181,21 @@ static INLINE void highbd_variance_xlarge_neon( *sse = (uint64_t)horizontal_add_s64x2(sse_s64); } -static INLINE void highbd_variance_32xh_xlarge_neon( +static inline void highbd_variance_32xh_xlarge_neon( const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse, sum); } -static INLINE void highbd_variance_64xh_xlarge_neon( +static inline void highbd_variance_64xh_xlarge_neon( const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse, sum); } -static INLINE void highbd_variance_128xh_xlarge_neon( +static inline void highbd_variance_128xh_xlarge_neon( const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 128, h, 8, sse, @@ -380,7 +380,7 @@ HBD_VARIANCE_WXH_12_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY -static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, +static inline uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, @@ -449,7 +449,7 @@ HIGHBD_MSE_WXH_NEON(8, 8) #undef HIGHBD_MSE_WXH_NEON -static INLINE uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0, +static inline uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0, uint16x8_t s1, uint16x8_t d0, uint16x8_t d1) { uint16x8_t e0 = vabdq_u16(s0, d0); diff --git a/aom_dsp/arm/highbd_variance_neon_dotprod.c b/aom_dsp/arm/highbd_variance_neon_dotprod.c index 737f8e5b11..21a46aabb5 100644 --- a/aom_dsp/arm/highbd_variance_neon_dotprod.c +++ b/aom_dsp/arm/highbd_variance_neon_dotprod.c @@ -16,7 +16,7 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr, +static inline uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int h, @@ -45,7 +45,7 @@ static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr, return *sse; } -static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr, +static inline uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int h, diff --git a/aom_dsp/arm/highbd_variance_sve.c b/aom_dsp/arm/highbd_variance_sve.c index 2403832d28..d6833cf3bc 100644 --- a/aom_dsp/arm/highbd_variance_sve.c +++ b/aom_dsp/arm/highbd_variance_sve.c @@ -21,7 +21,7 @@ #include "aom_dsp/variance.h" // Process a block of width 4 two rows at a time. 
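The 10-bit bound quoted earlier in this file checks out the same way, which is why only 12-bit input needs this 64-bit path (a compile-time sketch, not part of the patch):

    #include <stdint.h>
    // Worst 10-bit case for a 128x128 block: 1023^2 per element, 128*32
    // elements per 32-bit lane, exactly the product quoted in the comment.
    _Static_assert(1023ULL * 1023 * 128 * 32 == 4286582784ULL, "quoted product");
    _Static_assert(1023ULL * 1023 * 128 * 32 <= UINT32_MAX, "fits in 32 bits");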
-static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+static inline void highbd_variance_4xh_sve(const uint16_t *src_ptr,
                                            int src_stride,
                                            const uint16_t *ref_ptr,
                                            int ref_stride, int h,
                                            uint64_t *sse,
@@ -47,7 +47,7 @@ static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
   *sse = vaddvq_s64(sse_s64);
 }
 
-static INLINE void variance_8x1_sve(const uint16_t *src, const uint16_t *ref,
+static inline void variance_8x1_sve(const uint16_t *src, const uint16_t *ref,
                                     int32x4_t *sum, int64x2_t *sse) {
   const uint16x8_t s = vld1q_u16(src);
   const uint16x8_t r = vld1q_u16(ref);
@@ -58,7 +58,7 @@ static INLINE void variance_8x1_sve(const uint16_t *src, const uint16_t *ref,
   *sse = aom_sdotq_s16(*sse, diff, diff);
 }
 
-static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+static inline void highbd_variance_8xh_sve(const uint16_t *src_ptr,
                                            int src_stride,
                                            const uint16_t *ref_ptr,
                                            int ref_stride, int h,
                                            uint64_t *sse,
@@ -77,7 +77,7 @@ static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
   *sse = vaddvq_s64(sse_s64);
 }
 
-static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+static inline void highbd_variance_16xh_sve(const uint16_t *src_ptr,
                                             int src_stride,
                                             const uint16_t *ref_ptr,
                                             int ref_stride, int h,
@@ -97,7 +97,7 @@ static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
   *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1]));
 }
 
-static INLINE void highbd_variance_large_sve(const uint16_t *src_ptr,
+static inline void highbd_variance_large_sve(const uint16_t *src_ptr,
                                              int src_stride,
                                              const uint16_t *ref_ptr,
                                              int ref_stride, int w, int h,
@@ -133,21 +133,21 @@ static INLINE void highbd_variance_large_sve(const uint16_t *src_ptr,
   *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[2]));
 }
 
-static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
+static inline void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
                                             const uint16_t *ref, int ref_stride,
                                             int h, uint64_t *sse, int64_t *sum) {
   highbd_variance_large_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum);
 }
 
-static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
+static inline void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
                                             const uint16_t *ref, int ref_stride,
                                             int h, uint64_t *sse, int64_t *sum) {
   highbd_variance_large_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum);
 }
 
-static INLINE void highbd_variance_128xh_sve(const uint16_t *src,
+static inline void highbd_variance_128xh_sve(const uint16_t *src,
                                              int src_stride,
                                              const uint16_t *ref,
                                              int ref_stride, int h,
@@ -319,7 +319,7 @@ HBD_VARIANCE_WXH_12_SVE(64, 16)
 #undef HBD_VARIANCE_WXH_10_SVE
 #undef HBD_VARIANCE_WXH_12_SVE
 
-static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
+static inline uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
                                           int src_stride,
                                           const uint16_t *ref_ptr,
                                           int ref_stride, int w, int h,
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 7fd82a1b82..a81fb1cee6 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -27,14 +27,14 @@
 //------------------------------------------------------------------------------
 // DC 4x4
 
-static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
+static inline uint16x8_t dc_load_sum_4(const uint8_t *in) {
   const uint8x8_t a = load_u8_4x1(in);
   const uint16x4_t p0 = vpaddl_u8(a);
   const uint16x4_t p1 = vpadd_u16(p0, p0);
   return vcombine_u16(p1, vdup_n_u16(0));
 }
 
-static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x8_t dc) {
   for (int i = 0; i < h; ++i) {
     store_u8_4x1(dst + i * stride, dc);
@@ -77,7 +77,7 @@ void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 8x8
 
-static INLINE uint16x8_t dc_load_sum_8(const uint8_t *in) {
+static inline uint16x8_t dc_load_sum_8(const uint8_t *in) {
   // This isn't used in the case where we want to load both above and left
   // vectors, since we want to avoid performing the reduction twice.
   const uint8x8_t a = vld1_u8(in);
@@ -87,7 +87,7 @@ static INLINE uint16x8_t dc_load_sum_8(const uint8_t *in) {
   return vcombine_u16(p2, vdup_n_u16(0));
 }
 
-static INLINE uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
+static inline uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
 #if AOM_ARCH_AARCH64
   // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
   // instruction, however the addv instruction is usually slightly more
@@ -104,7 +104,7 @@ static INLINE uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a)
 #endif
 }
 
-static INLINE void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x8_t dc) {
   for (int i = 0; i < h; ++i) {
     vst1_u8(dst + i * stride, dc);
@@ -148,7 +148,7 @@ void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 16x16
 
-static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
+static inline uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
   const uint8x16_t a = vld1q_u8(in);
   // delay the remainder of the reduction until
   // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
@@ -156,11 +156,11 @@ static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in)
   return vpaddlq_u8(a);
 }
 
-static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
+static inline uint16x8_t dc_load_sum_16(const uint8_t *in) {
   return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
 }
 
-static INLINE void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                  uint8x16_t dc) {
   for (int i = 0; i < h; ++i) {
     vst1q_u8(dst + i * stride, dc);
@@ -207,7 +207,7 @@ void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 32x32
 
-static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
+static inline uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
   const uint8x16_t a0 = vld1q_u8(in);
   const uint8x16_t a1 = vld1q_u8(in + 16);
   // delay the remainder of the reduction until
@@ -216,11 +216,11 @@ static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in)
   return vpadalq_u8(vpaddlq_u8(a0), a1);
 }
 
-static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
+static inline uint16x8_t dc_load_sum_32(const uint8_t *in) {
   return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
 }
 
-static INLINE void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                  uint8x16_t dc) {
   for (int i = 0; i < h; ++i) {
     vst1q_u8(dst + i * stride, dc);
@@ -268,7 +268,7 @@ void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 64x64
 
-static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
+static inline uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
   const uint8x16_t a0 = vld1q_u8(in);
   const uint8x16_t a1 = vld1q_u8(in + 16);
   const uint8x16_t a2 = vld1q_u8(in + 32);
@@ -281,11 +281,11 @@ static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in)
   return vaddq_u16(p01, p23);
 }
 
-static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
+static inline uint16x8_t dc_load_sum_64(const uint8_t *in) {
   return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
 }
 
-static INLINE void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                  uint8x16_t dc) {
   for (int i = 0; i < h; ++i) {
     vst1q_u8(dst + i * stride, dc);
@@ -340,13 +340,13 @@ void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
 
 #define DC_SHIFT2 16
 
-static INLINE int divide_using_multiply_shift(int num, int shift1,
+static inline int divide_using_multiply_shift(int num, int shift1,
                                               int multiplier, int shift2) {
   const int interm = num >> shift1;
   return interm * multiplier >> shift2;
 }
 
-static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
+static inline int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
                                         int shift1, int multiplier) {
   const int expected_dc = divide_using_multiply_shift(
       sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
@@ -579,28 +579,28 @@ DC_PREDICTOR_TOP(64, 32, 6, q)
 
 // -----------------------------------------------------------------------------
 
-static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x8_t d0) {
   for (int i = 0; i < h; ++i) {
     store_u8_4x1(dst + i * stride, d0);
   }
 }
 
-static INLINE void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x8_t d0) {
   for (int i = 0; i < h; ++i) {
     vst1_u8(dst + i * stride, d0);
   }
 }
 
-static INLINE void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t d0) {
   for (int i = 0; i < h; ++i) {
     vst1q_u8(dst + i * stride, d0);
   }
 }
 
-static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t d0, uint8x16_t d1) {
   for (int i = 0; i < h; ++i) {
     vst1q_u8(dst + 0, d0);
@@ -609,7 +609,7 @@ static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
   }
 }
 
-static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+static inline void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
                                 uint8x16_t d3) {
   for (int i = 0; i < h; ++i) {
@@ -757,7 +757,7 @@ void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
 
 // -----------------------------------------------------------------------------
 
-static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+static inline void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
   store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
   store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
   store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
@@ -768,7 +768,7 @@ static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0)
   store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7));
 }
 
-static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+static inline void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
   vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
   vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
   vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
@@ -779,7 +779,7 @@ static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0)
   vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
 }
 
-static INLINE void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+static inline void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
   vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
   vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
   vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
@@ -790,7 +790,7 @@ static INLINE void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0)
   vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
 }
 
-static INLINE void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+static inline void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
   dst += stride;
@@ -816,7 +816,7 @@ static INLINE void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0)
   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
 }
 
-static INLINE void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+static inline void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
   vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
@@ -1378,7 +1378,7 @@ static const uint8_t kLoadMaxShuffles[] = {
 };
 // clang-format on
 
-static INLINE uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
+static inline uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
                                              int shuffle_idx) {
   uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
   uint8x16_t src = vld1q_u8(ptr);
@@ -2385,7 +2385,7 @@ void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
 // SMOOTH_PRED
 
 // 256 - v = vneg_s8(v)
-static INLINE uint8x8_t negate_s8(const uint8x8_t v) {
+static inline uint8x8_t negate_s8(const uint8x8_t v) {
   return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
 }
 
@@ -2424,14 +2424,14 @@ static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
   } while (++y != height);
 }
 
-static INLINE uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
+static inline uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
                                        const uint16x8_t weighted_left_tr) {
   // Maximum value of each parameter: 0xFF00
   const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
   return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
 }
 
-static INLINE uint8x8_t calculate_weights_and_pred(
+static inline uint8x8_t calculate_weights_and_pred(
     const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
     const uint8x8_t bottom_left, const uint8x8_t weights_x,
     const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
@@ -2489,7 +2489,7 @@ SMOOTH_NXM(8, 32)
 
 #undef SMOOTH_NXM
 
-static INLINE uint8x16_t calculate_weights_and_predq(
+static inline uint8x16_t calculate_weights_and_predq(
    const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
    const uint8x8_t weights_y, const uint8x16_t weights_x,
    const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
@@ -2513,7 +2513,7 @@ static INLINE uint8x16_t calculate_weights_and_predq(
 }
 
 // 256 - v = vneg_s8(v)
-static INLINE uint8x16_t negate_s8q(const uint8x16_t v) {
+static inline uint8x16_t negate_s8q(const uint8x16_t v) {
   return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
 }
 
@@ -2682,7 +2682,7 @@ SMOOTH_V_NXM(8, 32)
 
 #undef SMOOTH_V_NXM
 
-static INLINE uint8x16_t calculate_vertical_weights_and_pred(
+static inline uint8x16_t calculate_vertical_weights_and_pred(
    const uint8x16_t top, const uint8x8_t weights_y,
    const uint16x8_t weighted_bl) {
   const uint16x8_t pred_low =
@@ -2832,7 +2832,7 @@ SMOOTH_H_NXM(8, 32)
 
 #undef SMOOTH_H_NXM
 
-static INLINE uint8x16_t calculate_horizontal_weights_and_pred(
+static inline uint8x16_t calculate_horizontal_weights_and_pred(
    const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
    const uint8x16_t scaled_weights_x) {
   const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
@@ -2938,7 +2938,7 @@ SMOOTH_H_NXM_WIDE(64, 64)
 // -----------------------------------------------------------------------------
 // PAETH
 
-static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+static inline void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
                                        const uint8_t *const top_row,
                                        const uint8_t *const left_column,
                                        int width, int height) {
@@ -3008,7 +3008,7 @@ PAETH_NXM(8, 32)
 
 // Calculate X distance <= TopLeft distance and pack the resulting mask into
 // uint8x8_t.
-static INLINE uint8x16_t x_le_top_left(const uint8x16_t x_dist,
+static inline uint8x16_t x_le_top_left(const uint8x16_t x_dist,
                                        const uint16x8_t top_left_dist_low,
                                        const uint16x8_t top_left_dist_high) {
   const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
@@ -3017,7 +3017,7 @@ static INLINE uint8x16_t x_le_top_left(const uint8x16_t x_dist,
 }
 
 // Select the closest values and collect them.
-static INLINE uint8x16_t select_paeth(const uint8x16_t top,
+static inline uint8x16_t select_paeth(const uint8x16_t top,
                                       const uint8x16_t left,
                                       const uint8x16_t top_left,
                                       const uint8x16_t left_le_top,
@@ -3056,7 +3056,7 @@ static INLINE uint8x16_t select_paeth(const uint8x16_t top,
   const uint8x16_t top_le_top_left_##num = x_le_top_left(                \
       top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
 
-static INLINE void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+static inline void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
                                          const uint8_t *const top_row,
                                          const uint8_t *const left_column,
                                          int width, int height) {
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index c54ae64a1e..6beb73ca0d 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -18,7 +18,7 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
 
-static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
+static inline uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
                                  uint8x8_t p0q0, const uint8_t blimit,
                                  const uint8_t limit) {
   // Calculate mask values for four samples
@@ -52,7 +52,7 @@ static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
   return mask_8x8;
 }
 
-static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
+static inline uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
                                   const uint8_t blimit, const uint8_t limit) {
   uint32x2x2_t p0q0_p1q1;
   uint16x8_t temp_16x8;
@@ -82,7 +82,7 @@ static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
   return mask_8x8;
 }
 
-static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
+static inline uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
                                        uint8x8_t p1q1, uint8x8_t p0q0) {
   const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
   uint8x8_t flat_8x8, temp_8x8;
@@ -98,7 +98,7 @@ static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
   return flat_8x8;
 }
 
-static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
+static inline uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
                                        uint8x8_t p0q0) {
   const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
   uint8x8_t flat_8x8, temp_8x8;
@@ -113,7 +113,7 @@ static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
   return flat_8x8;
 }
 
-static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
+static inline uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
                                          uint8x8_t p0q0, const uint8_t blimit,
                                          const uint8_t limit) {
   // Calculate mask3 values for four samples
diff --git a/aom_dsp/arm/masked_sad4d_neon.c b/aom_dsp/arm/masked_sad4d_neon.c
index c3afc55bfe..f680c6faa0 100644
--- a/aom_dsp/arm/masked_sad4d_neon.c
+++ b/aom_dsp/arm/masked_sad4d_neon.c
@@ -18,7 +18,7 @@
 #include "mem_neon.h"
 #include "sum_neon.h"
 
-static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+static inline uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
                                               const uint8x16_t s0,
                                               const uint8x16_t a0,
                                               const uint8x16_t b0,
@@ -35,7 +35,7 @@ static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
   return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
 }
 
-static INLINE void masked_inv_sadwxhx4d_large_neon(
+static inline void masked_inv_sadwxhx4d_large_neon(
    const uint8_t *src, int src_stride, const uint8_t *const ref[4],
    int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
    int mask_stride, uint32_t res[4], int width, int height, int h_overflow) {
@@ -102,7 +102,7 @@ static INLINE void masked_inv_sadwxhx4d_large_neon(
   vst1q_u32(res, horizontal_add_4d_u32x4(sum));
 }
 
-static INLINE void masked_inv_sad128xhx4d_neon(
+static inline void masked_inv_sad128xhx4d_neon(
    const uint8_t *src, int src_stride, const uint8_t *const ref[4],
    int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
    int mask_stride, uint32_t res[4], int h) {
@@ -110,7 +110,7 @@ static INLINE void masked_inv_sad128xhx4d_neon(
                                  mask, mask_stride, res, 128, h, 32);
 }
 
-static INLINE void masked_inv_sad64xhx4d_neon(
+static inline void masked_inv_sad64xhx4d_neon(
    const uint8_t *src, int src_stride, const uint8_t *const ref[4],
    int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
    int mask_stride, uint32_t res[4], int h) {
@@ -118,7 +118,7 @@ static INLINE void masked_inv_sad64xhx4d_neon(
                                  mask, mask_stride, res, 64, h, 64);
 }
 
-static INLINE void masked_sadwxhx4d_large_neon(
+static inline void masked_sadwxhx4d_large_neon(
    const uint8_t *src, int src_stride, const uint8_t *const ref[4],
    int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
    int mask_stride, uint32_t res[4], int width, int height, int h_overflow) {
@@ -185,7 +185,7 @@ static INLINE void masked_sadwxhx4d_large_neon(
   vst1q_u32(res, horizontal_add_4d_u32x4(sum));
 }
 
-static INLINE void masked_sad128xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void masked_sad128xhx4d_neon(const uint8_t *src, int src_stride,
                                            const uint8_t *const ref[4],
                                            int ref_stride,
                                            const uint8_t *second_pred,
@@ -195,7 +195,7 @@ static INLINE void masked_sad128xhx4d_neon(const uint8_t *src, int src_stride,
                              mask, mask_stride, res, 128, h, 32);
 }
 
-static INLINE void masked_sad64xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void masked_sad64xhx4d_neon(const uint8_t *src, int src_stride,
                                           const uint8_t *const ref[4],
                                           int ref_stride,
                                           const uint8_t *second_pred,
@@ -205,7 +205,7 @@ static INLINE void masked_sad64xhx4d_neon(const uint8_t *src, int src_stride,
                             mask, mask_stride, res, 64, h, 64);
 }
 
-static INLINE void masked_inv_sad32xhx4d_neon(
+static inline void masked_inv_sad32xhx4d_neon(
    const uint8_t *src, int src_stride, const uint8_t *const ref[4],
    int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
    int mask_stride, uint32_t res[4], int h) {
@@ -250,7 +250,7 @@ static INLINE void masked_inv_sad32xhx4d_neon(
   vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
 }
 
-static INLINE void masked_sad32xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void masked_sad32xhx4d_neon(const uint8_t *src, int src_stride,
                                           const uint8_t *const ref[4],
                                           int ref_stride,
                                           const uint8_t *second_pred,
@@ -297,7 +297,7 @@ static INLINE void masked_sad32xhx4d_neon(const uint8_t *src, int src_stride,
   vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
 }
 
-static INLINE void masked_inv_sad16xhx4d_neon(
+static inline void masked_inv_sad16xhx4d_neon(
    const uint8_t *src, int src_stride, const uint8_t *const ref[4],
    int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
    int mask_stride, uint32_t res[4], int h) {
@@ -334,7 +334,7 @@ static INLINE void masked_inv_sad16xhx4d_neon(
   vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
 }
 
-static INLINE void masked_sad16xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void masked_sad16xhx4d_neon(const uint8_t *src, int src_stride,
                                           const uint8_t *const ref[4],
                                           int ref_stride,
                                           const uint8_t *second_pred,
@@ -373,7 +373,7 @@ static INLINE void masked_sad16xhx4d_neon(const uint8_t *src, int src_stride,
   vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
 }
 
-static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, const uint8x8_t s0,
+static inline uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, const uint8x8_t s0,
                                              const uint8x8_t a0,
                                              const uint8x8_t b0,
                                              const uint8x8_t m0) {
@@ -385,7 +385,7 @@ static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, const uint8x8_t s0,
   return vabal_u8(sad, blend_u8, s0);
 }
 
-static INLINE void masked_inv_sad8xhx4d_neon(
+static inline void masked_inv_sad8xhx4d_neon(
    const uint8_t *src, int src_stride, const uint8_t *const ref[4],
    int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
    int mask_stride, uint32_t res[4], int h) {
@@ -416,7 +416,7 @@ static INLINE void masked_inv_sad8xhx4d_neon(
   vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
-static INLINE void masked_sad8xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void masked_sad8xhx4d_neon(const uint8_t *src, int src_stride,
                                          const uint8_t *const ref[4],
                                          int ref_stride,
                                          const uint8_t *second_pred,
@@ -450,7 +450,7 @@ static INLINE void masked_sad8xhx4d_neon(const uint8_t *src, int src_stride,
   vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
-static INLINE void masked_inv_sad4xhx4d_neon(
+static inline void masked_inv_sad4xhx4d_neon(
    const uint8_t *src, int src_stride, const uint8_t *const ref[4],
    int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
    int mask_stride, uint32_t res[4], int h) {
@@ -482,7 +482,7 @@ static INLINE void masked_inv_sad4xhx4d_neon(
   vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
-static INLINE void masked_sad4xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void masked_sad4xhx4d_neon(const uint8_t *src, int src_stride,
                                          const uint8_t *const ref[4],
                                          int ref_stride,
                                          const uint8_t *second_pred,
diff --git a/aom_dsp/arm/masked_sad_neon.c b/aom_dsp/arm/masked_sad_neon.c
index 7f84acf041..75b99601f8 100644
--- a/aom_dsp/arm/masked_sad_neon.c
+++ b/aom_dsp/arm/masked_sad_neon.c
@@ -20,7 +20,7 @@
 #include "aom_dsp/arm/sum_neon.h"
 #include "aom_dsp/blend.h"
 
-static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+static inline uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
                                               const uint8_t *src,
                                               const uint8_t *a,
                                               const uint8_t *b,
@@ -35,7 +35,7 @@ static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
   return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
 }
 
-static INLINE unsigned masked_sad_128xh_neon(const uint8_t *src, int src_stride,
+static inline unsigned masked_sad_128xh_neon(const uint8_t *src, int src_stride,
                                              const uint8_t *a, int a_stride,
                                              const uint8_t *b, int b_stride,
                                              const uint8_t *m, int m_stride,
@@ -70,7 +70,7 @@ static INLINE unsigned masked_sad_128xh_neon(const uint8_t *src, int src_stride,
          horizontal_long_add_u16x8(sad[6], sad[7]);
 }
 
-static INLINE unsigned masked_sad_64xh_neon(const uint8_t *src, int src_stride,
+static inline unsigned masked_sad_64xh_neon(const uint8_t *src, int src_stride,
                                             const uint8_t *a, int a_stride,
                                             const uint8_t *b, int b_stride,
                                             const uint8_t *m, int m_stride,
@@ -97,7 +97,7 @@ static INLINE unsigned masked_sad_64xh_neon(const uint8_t *src, int src_stride,
          horizontal_long_add_u16x8(sad[2], sad[3]);
 }
 
-static INLINE unsigned masked_sad_32xh_neon(const uint8_t *src, int src_stride,
+static inline unsigned masked_sad_32xh_neon(const uint8_t *src, int src_stride,
                                             const uint8_t *a, int a_stride,
                                             const uint8_t *b, int b_stride,
                                             const uint8_t *m, int m_stride,
@@ -120,7 +120,7 @@ static INLINE unsigned masked_sad_32xh_neon(const uint8_t *src, int src_stride,
   return horizontal_add_u16x8(sad);
 }
 
-static INLINE unsigned masked_sad_16xh_neon(const uint8_t *src, int src_stride,
+static inline unsigned masked_sad_16xh_neon(const uint8_t *src, int src_stride,
                                             const uint8_t *a, int a_stride,
                                             const uint8_t *b, int b_stride,
                                             const uint8_t *m, int m_stride,
@@ -142,7 +142,7 @@ static INLINE unsigned masked_sad_16xh_neon(const uint8_t *src, int src_stride,
   return horizontal_add_u16x8(sad);
 }
 
-static INLINE unsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride,
+static inline unsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride,
                                            const uint8_t *a, int a_stride,
                                            const uint8_t *b, int b_stride,
                                            const uint8_t *m, int m_stride,
@@ -171,7 +171,7 @@ static INLINE unsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride,
   return horizontal_add_u16x4(sad);
 }
 
-static INLINE unsigned masked_sad_4xh_neon(const uint8_t *src, int src_stride,
+static inline unsigned masked_sad_4xh_neon(const uint8_t *src, int src_stride,
                                            const uint8_t *a, int a_stride,
                                            const uint8_t *b, int b_stride,
                                            const uint8_t *m, int m_stride,
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 9734f8bd52..c4a1ed27d4 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -33,23 +33,23 @@
 
 #if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
 
-static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
   uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                          vld1q_u8(ptr + 2 * 16) } };
   return res;
 }
 
-static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
   uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
   return res;
 }
 
-static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
+static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
   uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
   return res;
 }
 
-static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                          vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
   return res;
@@ -57,14 +57,14 @@ static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
 #elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
 #if __GNUC__ < 8
 
-static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
   uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
   return res;
 }
 #endif  // __GNUC__ < 8
 
 #if __GNUC__ < 9
-static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
   uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                          vld1q_u8(ptr + 2 * 16) } };
   return res;
@@ -73,7 +73,7 @@ static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
 
 // vld1q_u16_x4 is defined from GCC 8.5.0 and onwards.
 #if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
-static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                          vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
   return res;
@@ -81,7 +81,7 @@ static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
 #endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
 #endif  // defined(__GNUC__) && !defined(__clang__)
 
-static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                 const uint8x8_t s1) {
   vst1_u8(s, s0);
   s += p;
@@ -89,19 +89,19 @@ static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
   s += p;
 }
 
-static INLINE uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
+static inline uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
   return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
 }
 
 // Load four bytes into the low half of a uint8x8_t, zero the upper half.
-static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
+static inline uint8x8_t load_u8_4x1(const uint8_t *p) {
   uint8x8_t ret = vdup_n_u8(0);
   ret = vreinterpret_u8_u32(
       vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
   return ret;
 }
 
-static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
+static inline uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
   uint8x8_t ret = vdup_n_u8(0);
   ret = vreinterpret_u8_u32(
       vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
@@ -111,7 +111,7 @@ static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
   return ret;
 }
 
-static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
+static inline uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
   uint16x4_t ret = vdup_n_u16(0);
   ret = vreinterpret_u16_u32(
       vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
@@ -121,7 +121,7 @@ static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
   return ret;
 }
 
-static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
+static inline void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
@@ -143,7 +143,7 @@ static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
   *s7 = vld1_u8(s);
 }
 
-static INLINE void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
+static inline void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
@@ -163,7 +163,7 @@ static INLINE void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
   *s6 = vld1_u8(s);
 }
 
-static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3) {
   *s0 = vld1_u8(s);
@@ -175,7 +175,7 @@ static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
   *s3 = vld1_u8(s);
 }
 
-static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
+static inline void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2) {
   *s0 = vld1_u8(s);
@@ -185,7 +185,7 @@ static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
   *s2 = vld1_u8(s);
 }
 
-static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
+static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3) {
   *s0 = vld1_u16(s);
@@ -198,7 +198,7 @@ static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
   s += p;
 }
 
-static INLINE void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
+static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
@@ -218,21 +218,21 @@ static INLINE void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
   *s6 = vld1_u16(s);
 }
 
-static INLINE void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
+static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1) {
   *s0 = vld1q_s16(s);
   s += p;
   *s1 = vld1q_s16(s);
 }
 
-static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
+static inline void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1) {
   *s0 = vld1q_u16(s);
   s += p;
   *s1 = vld1q_u16(s);
 }
 
-static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
+static inline void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2) {
   *s0 = vld1q_u16(s);
@@ -242,7 +242,7 @@ static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
   *s2 = vld1q_u16(s);
 }
 
-static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+static inline void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3) {
   *s0 = vld1q_u16(s);
@@ -255,7 +255,7 @@ static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
   s += p;
 }
 
-static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_4x12(const int16_t *s, ptrdiff_t p,
                                  int16x4_t *const s0, int16x4_t *const s1,
                                  int16x4_t *const s2, int16x4_t *const s3,
                                  int16x4_t *const s4, int16x4_t *const s5,
@@ -287,7 +287,7 @@ static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
   *s11 = vld1_s16(s);
 }
 
-static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                  int16x4_t *const s0, int16x4_t *const s1,
                                  int16x4_t *const s2, int16x4_t *const s3,
                                  int16x4_t *const s4, int16x4_t *const s5,
@@ -317,7 +317,7 @@ static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
   *s10 = vld1_s16(s);
 }
 
-static INLINE void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
+static inline void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
                                  uint16x4_t *const s0, uint16x4_t *const s1,
                                  uint16x4_t *const s2, uint16x4_t *const s3,
                                  uint16x4_t *const s4, uint16x4_t *const s5,
@@ -347,7 +347,7 @@ static INLINE void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
   *s10 = vld1_u16(s);
 }
 
-static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
@@ -369,7 +369,7 @@ static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
   *s7 = vld1_s16(s);
 }
 
-static INLINE void load_s16_4x7(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_4x7(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
@@ -389,7 +389,7 @@ static INLINE void load_s16_4x7(const int16_t *s, ptrdiff_t p,
   *s6 = vld1_s16(s);
 }
 
-static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_4x6(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5) {
@@ -406,7 +406,7 @@ static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
   *s5 = vld1_s16(s);
 }
 
-static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4) {
@@ -421,7 +421,7 @@ static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
   *s4 = vld1_s16(s);
 }
 
-static INLINE void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
+static inline void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4) {
@@ -437,7 +437,7 @@ static INLINE void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
   s += p;
 }
 
-static INLINE void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
+static inline void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4) {
@@ -452,7 +452,7 @@ static INLINE void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
   *s4 = vld1_u8(s);
 }
 
-static INLINE void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
+static inline void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4) {
@@ -468,7 +468,7 @@ static INLINE void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
   s += p;
 }
 
-static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3) {
   *s0 = vld1_s16(s);
@@ -480,7 +480,7 @@ static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
   *s3 = vld1_s16(s);
 }
 
-static INLINE void load_s16_4x3(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_4x3(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2) {
   *s0 = vld1_s16(s);
@@ -490,7 +490,7 @@ static INLINE void load_s16_4x3(const int16_t *s, ptrdiff_t p,
   *s2 = vld1_s16(s);
 }
 
-static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+static inline void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                 const uint8x8_t s1, const uint8x8_t s2,
                                 const uint8x8_t s3, const uint8x8_t s4,
                                 const uint8x8_t s5, const uint8x8_t s6,
@@ -512,7 +512,7 @@ static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
   vst1_u8(s, s7);
 }
 
-static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+static inline void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                 const uint8x8_t s1, const uint8x8_t s2,
                                 const uint8x8_t s3) {
   vst1_u8(s, s0);
@@ -524,7 +524,7 @@ static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
   vst1_u8(s, s3);
 }
 
-static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
+static inline void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                  const uint8x16_t s1, const uint8x16_t s2,
                                  const uint8x16_t s3) {
   vst1q_u8(s, s0);
@@ -536,7 +536,7 @@ static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
   vst1q_u8(s, s3);
 }
 
-static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
+static inline void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x8_t s0, const uint16x8_t s1,
                                  const uint16x8_t s2, const uint16x8_t s3,
                                  const uint16x8_t s4, const uint16x8_t s5,
@@ -558,7 +558,7 @@ static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
   vst1q_u16(s, s7);
 }
 
-static INLINE void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
+static inline void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x4_t s0, const uint16x4_t s1,
                                  const uint16x4_t s2) {
   vst1_u16(s, s0);
@@ -568,7 +568,7 @@ static INLINE void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
   vst1_u16(s, s2);
 }
 
-static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
+static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x4_t s0, const uint16x4_t s1,
                                  const uint16x4_t s2, const uint16x4_t s3) {
   vst1_u16(s, s0);
@@ -580,14 +580,14 @@ static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
   vst1_u16(s, s3);
 }
 
-static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
+static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x8_t s0, const uint16x8_t s1) {
   vst1q_u16(s, s0);
   s += dst_stride;
   vst1q_u16(s, s1);
 }
 
-static INLINE void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
+static inline void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x8_t s0, const uint16x8_t s1,
                                  const uint16x8_t s2) {
   vst1q_u16(s, s0);
@@ -597,7 +597,7 @@ static INLINE void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
   vst1q_u16(s, s2);
 }
 
-static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
+static inline void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x8_t s0, const uint16x8_t s1,
                                  const uint16x8_t s2, const uint16x8_t s3) {
   vst1q_u16(s, s0);
@@ -609,7 +609,7 @@ static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
   vst1q_u16(s, s3);
 }
 
-static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
+static inline void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                  const int16x8_t s0, const int16x8_t s1,
                                  const int16x8_t s2, const int16x8_t s3,
                                  const int16x8_t s4, const int16x8_t s5,
@@ -631,7 +631,7 @@ static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
   vst1q_s16(s, s7);
 }
 
-static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
+static inline void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                  const int16x4_t s0, const int16x4_t s1,
                                  const int16x4_t s2, const int16x4_t s3) {
   vst1_s16(s, s0);
@@ -643,7 +643,7 @@ static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
   vst1_s16(s, s3);
 }
 
-static INLINE void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
+static inline void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
                                  const int16x4_t s0, const int16x4_t s1,
                                  const int16x4_t s2, const int16x4_t s3,
                                  const int16x4_t s4, const int16x4_t s5,
@@ -665,7 +665,7 @@ static INLINE void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
   vst1_s16(s, s7);
 }
 
-static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
+static inline void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                  const int16x8_t s0, const int16x8_t s1,
                                  const int16x8_t s2, const int16x8_t s3) {
   vst1q_s16(s, s0);
@@ -677,14 +677,14 @@ static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
   vst1q_s16(s, s3);
 }
 
-static INLINE void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
+static inline void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
                                  const int16x8_t s0, const int16x8_t s1) {
   vst1q_s16(s, s0);
   s += dst_stride;
   vst1q_s16(s, s1);
 }
 
-static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
+static inline void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
                                 uint8x8_t *const s0, uint8x8_t *const s1,
                                 uint8x8_t *const s2, uint8x8_t *const s3,
                                 uint8x8_t *const s4, uint8x8_t *const s5,
@@ -714,7 +714,7 @@ static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
   *s10 = vld1_u8(s);
 }
 
-static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_8x10(const int16_t *s, ptrdiff_t p,
                                  int16x8_t *const s0, int16x8_t *const s1,
                                  int16x8_t *const s2, int16x8_t *const s3,
                                  int16x8_t *const s4, int16x8_t *const s5,
@@ -741,7 +741,7 @@ static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
   *s9 = vld1q_s16(s);
 }
 
-static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                  int16x8_t *const s0, int16x8_t *const s1,
                                  int16x8_t *const s2, int16x8_t *const s3,
                                  int16x8_t *const s4, int16x8_t *const s5,
@@ -771,7 +771,7 @@ static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
   *s10 = vld1q_s16(s);
 }
 
-static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_8x12(const int16_t *s, ptrdiff_t p,
                                  int16x8_t *const s0, int16x8_t *const s1,
                                  int16x8_t *const s2, int16x8_t *const s3,
                                  int16x8_t *const s4, int16x8_t *const s5,
@@ -803,7 +803,7 @@ static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
   *s11 = vld1q_s16(s);
 }
 
-static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
+static inline void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                  uint16x8_t *const s0, uint16x8_t *const s1,
                                  uint16x8_t *const s2, uint16x8_t *const s3,
                                  uint16x8_t *const s4, uint16x8_t *const s5,
@@ -833,7 +833,7 @@ static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
   *s10 = vld1q_u16(s);
 }
 
-static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
@@ -855,7 +855,7 @@ static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
   *s7 = vld1q_s16(s);
 }
 
-static INLINE void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
+static inline void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
@@ -875,7 +875,7 @@ static INLINE void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
   *s6 = vld1q_u16(s);
 }
 
-static INLINE void load_s16_8x7(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_8x7(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
@@ -895,7 +895,7 @@ static INLINE void load_s16_8x7(const int16_t *s, ptrdiff_t p,
   *s6 = vld1q_s16(s);
 }
 
-static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_8x6(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5) {
@@ -912,7 +912,7 @@ static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
   *s5 = vld1q_s16(s);
 }
 
-static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4) {
@@ -927,7 +927,7 @@ static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
   *s4 = vld1q_s16(s);
 }
 
-static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3) {
   *s0 = vld1q_s16(s);
@@ -939,7 +939,7 @@ static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
   *s3 = vld1q_s16(s);
 }
 
-static INLINE void load_s16_8x3(const int16_t *s, ptrdiff_t p,
+static inline void load_s16_8x3(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2) {
   *s0 = vld1q_s16(s);
@@ -976,7 +976,7 @@ static INLINE void load_s16_8x3(const int16_t *s, ptrdiff_t p,
 #endif
 
 // Load 2 sets of 4 bytes when alignment is not guaranteed.
-static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { +static inline uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { uint32_t a; memcpy(&a, buf, 4); buf += stride; @@ -987,7 +987,7 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { } // Load 4 sets of 4 bytes when alignment is not guaranteed. -static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { +static inline uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { uint32_t a; uint32x4_t a_u32; if (stride == 4) return vld1q_u8(buf); @@ -1005,7 +1005,7 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { return vreinterpretq_u8_u32(a_u32); } -static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) { +static inline uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) { uint16_t a; uint16x4_t a_u16; @@ -1017,7 +1017,7 @@ static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) { return vreinterpret_u8_u16(a_u16); } -static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { +static inline uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { uint32_t a; uint32x2_t a_u32; @@ -1027,7 +1027,7 @@ static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { return vreinterpret_u8_u32(a_u32); } -static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) { +static inline uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) { uint32_t a; uint32x2_t a_u32; @@ -1036,7 +1036,7 @@ static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) { return vreinterpret_u8_u32(a_u32); } -static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) { +static inline uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) { uint16_t a; uint16x4_t a_u32; @@ -1045,7 +1045,7 @@ static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) { return vreinterpret_u8_u16(a_u32); } -static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) { +static inline uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) { uint32_t a; uint32x2_t a_u32; @@ -1057,14 +1057,14 @@ static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) { return vreinterpret_u8_u32(a_u32); } -static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride, +static inline void load_unaligned_u8_4x4(const uint8_t *buf, int stride, uint8x8_t *tu0, uint8x8_t *tu1) { *tu0 = load_unaligned_u8_4x2(buf, stride); buf += 2 * stride; *tu1 = load_unaligned_u8_4x2(buf, stride); } -static INLINE void load_unaligned_u8_3x8(const uint8_t *buf, int stride, +static inline void load_unaligned_u8_3x8(const uint8_t *buf, int stride, uint8x8_t *tu0, uint8x8_t *tu1, uint8x8_t *tu2) { load_unaligned_u8_4x4(buf, stride, tu0, tu1); @@ -1072,7 +1072,7 @@ static INLINE void load_unaligned_u8_3x8(const uint8_t *buf, int stride, *tu2 = load_unaligned_u8_4x2(buf, stride); } -static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride, +static inline void load_unaligned_u8_4x8(const uint8_t *buf, int stride, uint8x8_t *tu0, uint8x8_t *tu1, uint8x8_t *tu2, uint8x8_t *tu3) { load_unaligned_u8_4x4(buf, stride, tu0, tu1); @@ -1080,7 +1080,7 @@ static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride, load_unaligned_u8_4x4(buf, stride, tu2, tu3); } -static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p, +static inline void load_u8_16x8(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, 
uint8x16_t *const s2, uint8x16_t *const s3, uint8x16_t *const s4, uint8x16_t *const s5, @@ -1102,7 +1102,7 @@ static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p, *s7 = vld1q_u8(s); } -static INLINE void load_u8_16x5(const uint8_t *s, ptrdiff_t p, +static inline void load_u8_16x5(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3, uint8x16_t *const s4) { @@ -1117,7 +1117,7 @@ static INLINE void load_u8_16x5(const uint8_t *s, ptrdiff_t p, *s4 = vld1q_u8(s); } -static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, +static inline void load_u8_16x4(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3) { *s0 = vld1q_u8(s); @@ -1129,7 +1129,7 @@ static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, *s3 = vld1q_u8(s); } -static INLINE void load_u8_16x3(const uint8_t *s, ptrdiff_t p, +static inline void load_u8_16x3(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2) { *s0 = vld1q_u8(s); @@ -1139,7 +1139,7 @@ static INLINE void load_u8_16x3(const uint8_t *s, ptrdiff_t p, *s2 = vld1q_u8(s); } -static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, +static inline void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6, uint16x8_t *s7) { @@ -1160,7 +1160,7 @@ static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, *s7 = vld1q_u16(s); } -static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p, +static inline void load_u16_16x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1, uint16x8_t *const s2, uint16x8_t *const s3, uint16x8_t *const s4, uint16x8_t *const s5, @@ -1178,7 +1178,7 @@ static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p, *s7 = vld1q_u16(s + 8); } -static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf, +static inline uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf, int stride) { uint32_t a; uint32x2_t a_u32; @@ -1191,7 +1191,7 @@ static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf, return vreinterpret_u16_u32(a_u32); } -static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) { +static inline uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) { uint64_t a; uint64x1_t a_u64 = vdup_n_u64(0); memcpy(&a, buf, 8); @@ -1199,7 +1199,7 @@ static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) { return vreinterpret_u16_u64(a_u64); } -static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf, +static inline uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf, uint32_t stride) { uint64_t a; uint64x2_t a_u64; @@ -1214,14 +1214,14 @@ static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf, return vreinterpretq_u16_u64(a_u64); } -static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride, +static inline void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride, uint16x8_t *tu0, uint16x8_t *tu1) { *tu0 = load_unaligned_u16_4x2(buf, stride); buf += 2 * stride; *tu1 = load_unaligned_u16_4x2(buf, stride); } -static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1, +static inline void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1, int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) { *s1 = vld1q_s32(s); s += p; @@ -1232,7 +1232,7 @@ static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t 
*s1, *s4 = vld1q_s32(s); } -static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1, +static inline void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1, int32x4_t s2, int32x4_t s3, int32x4_t s4) { vst1q_s32(s, s1); s += p; @@ -1243,7 +1243,7 @@ static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1, vst1q_s32(s, s4); } -static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1, +static inline void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1, uint32x4_t *s2, uint32x4_t *s3, uint32x4_t *s4) { *s1 = vld1q_u32(s); @@ -1255,7 +1255,7 @@ static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1, *s4 = vld1q_u32(s); } -static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, +static inline void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) { vst1q_u32(s, s1); s += p; @@ -1266,7 +1266,7 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, vst1q_u32(s, s4); } -static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { +static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { const int32x4_t v0 = vld1q_s32(buf); const int32x4_t v1 = vld1q_s32(buf + 4); const int16x4_t s0 = vmovn_s32(v0); @@ -1274,19 +1274,19 @@ static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { return vcombine_s16(s0, s1); } -static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { +static inline void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); vst1q_s32(buf, v0); vst1q_s32(buf + 4, v1); } -static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) { +static inline void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) { const int32x4_t v0 = vmovl_s16(a); vst1q_s32(buf, v0); } -static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src, +static inline uint8x8_t load_u8_gather_s16_x8(const uint8_t *src, int16x8_t indices) { // Recent Clang and GCC versions correctly identify that this zero-broadcast // is redundant. Alternatively we could load and broadcast the zeroth element @@ -1336,17 +1336,17 @@ static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src, } while (0) // Store the low 16-bits from a single vector. -static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) { +static inline void store_u8_2x1(uint8_t *dst, const uint8x8_t src) { store_u8_2x1_lane(dst, src, 0); } // Store the low 32-bits from a single vector. -static INLINE void store_u8_4x1(uint8_t *dst, const uint8x8_t src) { +static inline void store_u8_4x1(uint8_t *dst, const uint8x8_t src) { store_u8_4x1_lane(dst, src, 0); } // Store two blocks of 16-bits from a single vector. -static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride, +static inline void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride, uint8x8_t src) { store_u8_2x1_lane(dst, src, 0); dst += dst_stride; @@ -1354,7 +1354,7 @@ static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride, } // Store two blocks of 32-bits from a single vector. 
-static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride, +static inline void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride, uint8x8_t src) { store_u8_4x1_lane(dst, src, 0); dst += stride; @@ -1362,7 +1362,7 @@ static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride, } // Store four blocks of 32-bits from a single vector. -static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride, +static inline void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride, uint8x16_t src) { store_u8_4x1_lane(dst, vget_low_u8(src), 0); dst += stride; @@ -1374,12 +1374,12 @@ static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride, } // Store the low 32-bits from a single vector. -static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) { +static inline void store_u16_2x1(uint16_t *dst, const uint16x4_t src) { store_u16_2x1_lane(dst, src, 0); } // Store two blocks of 32-bits from a single vector. -static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride, +static inline void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride, uint16x4_t src) { store_u16_2x1_lane(dst, src, 0); dst += dst_stride; @@ -1387,7 +1387,7 @@ static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride, } // Store two blocks of 64-bits from a single vector. -static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride, +static inline void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride, uint16x8_t src) { store_u16_4x1_lane(dst, src, 0); dst += dst_stride; @@ -1395,7 +1395,7 @@ static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride, } // Store two blocks of 64-bits from a single vector. -static INLINE void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride, +static inline void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride, int16x8_t src) { store_s16_4x1_lane(dst, src, 0); dst += dst_stride; diff --git a/aom_dsp/arm/obmc_sad_neon.c b/aom_dsp/arm/obmc_sad_neon.c index 61b01f2aaa..dd977e878e 100644 --- a/aom_dsp/arm/obmc_sad_neon.c +++ b/aom_dsp/arm/obmc_sad_neon.c @@ -15,7 +15,7 @@ #include "mem_neon.h" #include "sum_neon.h" -static INLINE void obmc_sad_8x1_s16_neon(int16x8_t ref_s16, const int32_t *mask, +static inline void obmc_sad_8x1_s16_neon(int16x8_t ref_s16, const int32_t *mask, const int32_t *wsrc, uint32x4_t *sum) { int32x4_t wsrc_lo = vld1q_s32(wsrc); int32x4_t wsrc_hi = vld1q_s32(wsrc + 4); @@ -49,7 +49,7 @@ DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = { 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255 }; -static INLINE void obmc_sad_8x1_s32_neon(uint32x4_t ref_u32_lo, +static inline void obmc_sad_8x1_s32_neon(uint32x4_t ref_u32_lo, uint32x4_t ref_u32_hi, const int32_t *mask, const int32_t *wsrc, @@ -69,7 +69,7 @@ static INLINE void obmc_sad_8x1_s32_neon(uint32x4_t ref_u32_lo, sum[1] = vrsraq_n_u32(sum[1], abs_hi, 12); } -static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref, +static inline unsigned int obmc_sad_large_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int width, @@ -112,7 +112,7 @@ static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref, #else // !AOM_ARCH_AARCH64 -static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref, +static inline unsigned int obmc_sad_large_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int width, @@ -146,35 +146,35 @@ static INLINE unsigned int 
obmc_sad_large_neon(const uint8_t *ref,
 #endif  // AOM_ARCH_AARCH64
 
-static INLINE unsigned int obmc_sad_128xh_neon(const uint8_t *ref,
+static inline unsigned int obmc_sad_128xh_neon(const uint8_t *ref,
                                                int ref_stride,
                                                const int32_t *wsrc,
                                                const int32_t *mask, int h) {
   return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
 }
 
-static INLINE unsigned int obmc_sad_64xh_neon(const uint8_t *ref,
+static inline unsigned int obmc_sad_64xh_neon(const uint8_t *ref,
                                               int ref_stride,
                                               const int32_t *wsrc,
                                               const int32_t *mask, int h) {
   return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
 }
 
-static INLINE unsigned int obmc_sad_32xh_neon(const uint8_t *ref,
+static inline unsigned int obmc_sad_32xh_neon(const uint8_t *ref,
                                               int ref_stride,
                                               const int32_t *wsrc,
                                               const int32_t *mask, int h) {
   return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 32, h);
 }
 
-static INLINE unsigned int obmc_sad_16xh_neon(const uint8_t *ref,
+static inline unsigned int obmc_sad_16xh_neon(const uint8_t *ref,
                                               int ref_stride,
                                               const int32_t *wsrc,
                                               const int32_t *mask, int h) {
   return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
 }
 
-static INLINE unsigned int obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride,
+static inline unsigned int obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride,
                                              const int32_t *wsrc,
                                              const int32_t *mask, int height) {
   uint32x4_t sum = vdupq_n_u32(0);
@@ -194,7 +194,7 @@ static INLINE unsigned int obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride,
   return horizontal_add_u32x4(sum);
 }
 
-static INLINE unsigned int obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride,
+static inline unsigned int obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride,
                                              const int32_t *wsrc,
                                              const int32_t *mask, int height) {
   uint32x4_t sum = vdupq_n_u32(0);
diff --git a/aom_dsp/arm/obmc_variance_neon.c b/aom_dsp/arm/obmc_variance_neon.c
index 95b364cfc3..23770fceb5 100644
--- a/aom_dsp/arm/obmc_variance_neon.c
+++ b/aom_dsp/arm/obmc_variance_neon.c
@@ -16,7 +16,7 @@
 #include "mem_neon.h"
 #include "sum_neon.h"
 
-static INLINE void obmc_variance_8x1_s16_neon(int16x8_t pre_s16,
+static inline void obmc_variance_8x1_s16_neon(int16x8_t pre_s16,
                                               const int32_t *wsrc,
                                               const int32_t *mask,
                                               int32x4_t *ssev,
@@ -67,7 +67,7 @@ DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = {
   12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
 };
 
-static INLINE void obmc_variance_8x1_s32_neon(
+static inline void obmc_variance_8x1_s32_neon(
     int32x4_t pre_lo, int32x4_t pre_hi, const int32_t *wsrc,
     const int32_t *mask, int32x4_t *ssev, int32x4_t *sumv) {
   int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]);
@@ -94,7 +94,7 @@ static INLINE void obmc_variance_8x1_s32_neon(
   *ssev = vmlaq_s32(*ssev, round_hi, round_hi);
 }
 
-static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
+static inline void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
                                             const int32_t *wsrc,
                                             const int32_t *mask, int width,
                                             int height, unsigned *sse,
@@ -142,7 +142,7 @@ static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
 
 #else  // !AOM_ARCH_AARCH64
 
-static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
+static inline void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
                                             const int32_t *wsrc,
                                             const int32_t *mask, int width,
                                             int height, unsigned *sse,
@@ -182,35 +182,35 @@ static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
 
 #endif  // AOM_ARCH_AARCH64
 
-static INLINE void obmc_variance_neon_128xh(const uint8_t *pre, int pre_stride,
+static inline void obmc_variance_neon_128xh(const uint8_t *pre, int pre_stride,
                                             const int32_t *wsrc,
                                             const int32_t *mask, int h,
                                             unsigned *sse, int *sum) {
   obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse, sum);
 }
 
-static INLINE void obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride,
+static inline void obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask, int h,
                                            unsigned *sse, int *sum) {
   obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);
 }
 
-static INLINE void obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride,
+static inline void obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask, int h,
                                            unsigned *sse, int *sum) {
   obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);
 }
 
-static INLINE void obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride,
+static inline void obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask, int h,
                                            unsigned *sse, int *sum) {
   obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);
 }
 
-static INLINE void obmc_variance_neon_8xh(const uint8_t *pre, int pre_stride,
+static inline void obmc_variance_neon_8xh(const uint8_t *pre, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int h,
                                           unsigned *sse, int *sum) {
@@ -232,7 +232,7 @@ static INLINE void obmc_variance_neon_8xh(const uint8_t *pre, int pre_stride,
   *sum = horizontal_add_s32x4(sumv);
 }
 
-static INLINE void obmc_variance_neon_4xh(const uint8_t *pre, int pre_stride,
+static inline void obmc_variance_neon_4xh(const uint8_t *pre, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int h,
                                           unsigned *sse, int *sum) {
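As a reading aid for the OBMC kernels above: they accumulate the sum and sum-of-squares of a rounded weighted residual. A minimal scalar sketch follows; the helper names and the 12-bit rounding shift are assumptions for illustration, not the aom C API.

#include <stdint.h>

/* Hedged scalar sketch of the per-pixel math the OBMC variance kernels
 * vectorize: residual = round((wsrc - mask * pre) >> 12), accumulated
 * into sum and sse. The 12-bit shift is an assumption here. */
static int round_shift_signed_12(int v) {
  return (v >= 0) ? ((v + (1 << 11)) >> 12) : -((-v + (1 << 11)) >> 12);
}

static void obmc_variance_ref(const uint8_t *pre, int pre_stride,
                              const int32_t *wsrc, const int32_t *mask,
                              int w, int h, unsigned *sse, int *sum) {
  *sse = 0;
  *sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int diff = round_shift_signed_12(
          wsrc[i * w + j] - mask[i * w + j] * pre[i * pre_stride + j]);
      *sum += diff;
      *sse += (unsigned)(diff * diff);
    }
  }
}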
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index ef19908518..639b9ba3b2 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -19,7 +19,7 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 
-static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
+static inline unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
                                          const uint8_t *ref_ptr, int ref_stride,
                                          int h) {
   // We use 8 accumulators to prevent overflow for large values of 'h', as well
@@ -91,7 +91,7 @@ static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
   return horizontal_add_u32x4(sum_u32);
 }
 
-static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+static inline unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         int h) {
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
@@ -134,7 +134,7 @@ static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
   return horizontal_add_u32x4(sum_u32);
 }
 
-static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+static inline unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         int h) {
   uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
@@ -158,7 +158,7 @@ static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
   return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
 }
 
-static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+static inline unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         int h) {
   uint16x8_t sum = vdupq_n_u16(0);
@@ -178,7 +178,7 @@ static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
   return horizontal_add_u16x8(sum);
 }
 
-static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+static inline unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
   uint16x8_t sum = vdupq_n_u16(0);
@@ -197,7 +197,7 @@ static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
   return horizontal_add_u16x8(sum);
 }
 
-static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+static inline unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
   uint16x8_t sum = vdupq_n_u16(0);
@@ -296,7 +296,7 @@ SAD_SKIP_WXH_NEON(64, 16)
 
 #undef SAD_SKIP_WXH_NEON
 
-static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
+static inline unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
                                              int ref_stride, int h,
@@ -389,7 +389,7 @@ static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
   return horizontal_add_u32x4(sum_u32);
 }
 
-static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+static inline unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
                                             int src_stride,
                                             const uint8_t *ref_ptr,
                                             int ref_stride, int h,
@@ -443,7 +443,7 @@ static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
   return horizontal_add_u32x4(sum_u32);
 }
 
-static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
                                             int src_stride,
                                             const uint8_t *ref_ptr,
                                             int ref_stride, int h,
@@ -474,7 +474,7 @@ static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
   return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
 }
 
-static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+static inline unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
                                             int src_stride,
                                             const uint8_t *ref_ptr,
                                             int ref_stride, int h,
@@ -499,7 +499,7 @@ static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
   return horizontal_add_u16x8(sum);
 }
 
-static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
+static inline unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
@@ -523,7 +523,7 @@ static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
   return horizontal_add_u16x8(sum);
 }
 
-static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
+static inline unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
@@ -588,7 +588,7 @@ SAD_WXH_AVG_NEON(64, 16)
 
 #undef SAD_WXH_AVG_NEON
 
-static INLINE unsigned int dist_wtd_sad128xh_avg_neon(
+static inline unsigned int dist_wtd_sad128xh_avg_neon(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
@@ -675,7 +675,7 @@ static INLINE unsigned int dist_wtd_sad128xh_avg_neon(
   return horizontal_add_u32x4(sum_u32);
 }
 
-static INLINE unsigned int dist_wtd_sad64xh_avg_neon(
+static inline unsigned int dist_wtd_sad64xh_avg_neon(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
@@ -726,7 +726,7 @@ static INLINE unsigned int dist_wtd_sad64xh_avg_neon(
   return horizontal_add_u32x4(sum_u32);
 }
 
-static INLINE unsigned int dist_wtd_sad32xh_avg_neon(
+static inline unsigned int dist_wtd_sad32xh_avg_neon(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
@@ -757,7 +757,7 @@ static INLINE unsigned int dist_wtd_sad32xh_avg_neon(
   return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
 }
 
-static INLINE unsigned int dist_wtd_sad16xh_avg_neon(
+static inline unsigned int dist_wtd_sad16xh_avg_neon(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
@@ -782,7 +782,7 @@ static INLINE unsigned int dist_wtd_sad16xh_avg_neon(
   return horizontal_add_u16x8(sum);
 }
 
-static INLINE unsigned int dist_wtd_sad8xh_avg_neon(
+static inline unsigned int dist_wtd_sad8xh_avg_neon(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
@@ -806,7 +806,7 @@ static INLINE unsigned int dist_wtd_sad8xh_avg_neon(
   return horizontal_add_u16x8(sum);
 }
 
-static INLINE unsigned int dist_wtd_sad4xh_avg_neon(
+static inline unsigned int dist_wtd_sad4xh_avg_neon(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
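A note on the accumulator layout used throughout sad_neon.c above: absolute byte differences are widened and pairwise-accumulated into u16 lanes, which bounds how many rows can be summed before overflow. A minimal self-contained sketch of the idiom (assuming an AArch64 target for the final vaddlvq_u16 reduction; not the file's exact kernels):

#include <arm_neon.h>
#include <stdint.h>

// Each u16 lane absorbs two bytes per row via vpadalq_u8, so with a
// maximum per-byte difference of 255 (2 * 255 = 510 per row) the lanes
// stay below 65535 for up to 128 rows before a widening to u32 is needed.
static inline uint32_t sad16_rows_sketch(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         int h /* assume h <= 128 */) {
  uint16x8_t sum = vdupq_n_u16(0);
  do {
    uint8x16_t s = vld1q_u8(src);
    uint8x16_t r = vld1q_u8(ref);
    sum = vpadalq_u8(sum, vabdq_u8(s, r));  // widen and accumulate
    src += src_stride;
    ref += ref_stride;
  } while (--h != 0);
  return vaddlvq_u16(sum);  // AArch64-only horizontal reduction
}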
diff --git a/aom_dsp/arm/sad_neon_dotprod.c b/aom_dsp/arm/sad_neon_dotprod.c
index d2bc0cc872..c24db420ae 100644
--- a/aom_dsp/arm/sad_neon_dotprod.c
+++ b/aom_dsp/arm/sad_neon_dotprod.c
@@ -19,7 +19,7 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 
-static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+static inline unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int w, int h) {
@@ -53,28 +53,28 @@ static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
   return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
 }
 
-static INLINE unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
+static inline unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
                                                  int src_stride,
                                                  const uint8_t *ref_ptr,
                                                  int ref_stride, int h) {
   return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
 }
 
-static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+static inline unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride, int h) {
   return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
 }
 
-static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+static inline unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride, int h) {
   return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
 }
 
-static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+static inline unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride, int h) {
@@ -167,7 +167,7 @@ SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
 
 #undef SAD_SKIP_WXH_NEON_DOTPROD
 
-static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+static inline unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
                                                    int src_stride,
                                                    const uint8_t *ref_ptr,
                                                    int ref_stride, int w, int h,
@@ -207,28 +207,28 @@ static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
   return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
 }
 
-static INLINE unsigned int sad128xh_avg_neon_dotprod(
+static inline unsigned int sad128xh_avg_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred) {
   return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
                                  h, second_pred);
 }
 
-static INLINE unsigned int sad64xh_avg_neon_dotprod(
+static inline unsigned int sad64xh_avg_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred) {
   return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
                                  h, second_pred);
 }
 
-static INLINE unsigned int sad32xh_avg_neon_dotprod(
+static inline unsigned int sad32xh_avg_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred) {
   return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
                                  h, second_pred);
 }
 
-static INLINE unsigned int sad16xh_avg_neon_dotprod(
+static inline unsigned int sad16xh_avg_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred) {
   uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
@@ -295,7 +295,7 @@ SAD_WXH_AVG_NEON_DOTPROD(64, 16)
 
 #undef SAD_WXH_AVG_NEON_DOTPROD
 
-static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
+static inline unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
@@ -379,7 +379,7 @@ static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
   return horizontal_add_u32x4(sum[0]);
 }
 
-static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
+static inline unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
@@ -428,7 +428,7 @@ static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
   return horizontal_add_u32x4(sum[0]);
 }
 
-static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
+static inline unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
@@ -460,7 +460,7 @@ static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
   return horizontal_add_u32x4(sum[0]);
 }
 
-static INLINE unsigned int dist_wtd_sad16xh_avg_neon_dotprod(
+static inline unsigned int dist_wtd_sad16xh_avg_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
     int ref_stride, int h, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
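The dotprod variants above sidestep the u16 overflow bookkeeping entirely. A minimal sketch of the trick (assuming a target with the Armv8.4 dot-product extension, e.g. compiled with +dotprod; illustrative, not the file's exact kernels): dotting the absolute differences against an all-ones vector makes UDOT sum groups of four bytes straight into u32 lanes.

#include <arm_neon.h>
#include <stdint.h>

static inline uint32_t sad16_rows_dotprod_sketch(const uint8_t *src,
                                                 int src_stride,
                                                 const uint8_t *ref,
                                                 int ref_stride, int h) {
  uint32x4_t sum = vdupq_n_u32(0);
  const uint8x16_t ones = vdupq_n_u8(1);
  do {
    uint8x16_t abs_diff = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
    sum = vdotq_u32(sum, abs_diff, ones);  // 4-byte groups into u32 lanes
    src += src_stride;
    ref += ref_stride;
  } while (--h != 0);
  return vaddvq_u32(sum);
}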
diff --git a/aom_dsp/arm/sadxd_neon.c b/aom_dsp/arm/sadxd_neon.c
index 69f408456c..b528281363 100644
--- a/aom_dsp/arm/sadxd_neon.c
+++ b/aom_dsp/arm/sadxd_neon.c
@@ -18,13 +18,13 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 
-static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+static inline void sad16_neon(uint8x16_t src, uint8x16_t ref,
                               uint16x8_t *const sad_sum) {
   uint8x16_t abs_diff = vabdq_u8(src, ref);
   *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
 }
 
-static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
+static inline void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
                                         const uint8_t *const ref[3],
                                         int ref_stride, uint32_t res[3], int w,
                                         int h, int h_overflow) {
@@ -72,19 +72,19 @@ static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
   res[2] = horizontal_add_u32x4(sum[2]);
 }
 
-static INLINE void sad128xhx3d_neon(const uint8_t *src, int src_stride,
+static inline void sad128xhx3d_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *const ref[3],
                                     int ref_stride, uint32_t res[3], int h) {
   sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
 }
 
-static INLINE void sad64xhx3d_neon(const uint8_t *src, int src_stride,
+static inline void sad64xhx3d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[3],
                                    int ref_stride, uint32_t res[3], int h) {
   sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
 }
 
-static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride,
+static inline void sad32xhx3d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[3],
                                    int ref_stride, uint32_t res[3], int h) {
   uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
@@ -112,7 +112,7 @@ static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride,
   res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]);
 }
 
-static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
+static inline void sad16xhx3d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[3],
                                    int ref_stride, uint32_t res[3], int h) {
   uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
@@ -134,7 +134,7 @@ static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
   res[2] = horizontal_add_u16x8(sum[2]);
 }
 
-static INLINE void sad8xhx3d_neon(const uint8_t *src, int src_stride,
+static inline void sad8xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3],
                                   int ref_stride, uint32_t res[3], int h) {
   uint16x8_t sum[3];
@@ -162,7 +162,7 @@ static INLINE void sad8xhx3d_neon(const uint8_t *src, int src_stride,
   res[2] = horizontal_add_u16x8(sum[2]);
 }
 
-static INLINE void sad4xhx3d_neon(const uint8_t *src, int src_stride,
+static inline void sad4xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3],
                                   int ref_stride, uint32_t res[3], int h) {
   assert(h % 2 == 0);
@@ -239,7 +239,7 @@ SAD_WXH_3D_NEON(64, 16)
 
 #undef SAD_WXH_3D_NEON
 
-static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
+static inline void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
                                         const uint8_t *const ref[4],
                                         int ref_stride, uint32_t res[4], int w,
                                         int h, int h_overflow) {
@@ -292,19 +292,19 @@ static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
   vst1q_u32(res, horizontal_add_4d_u32x4(sum));
 }
 
-static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void sad128xhx4d_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *const ref[4],
                                     int ref_stride, uint32_t res[4], int h) {
   sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
 }
 
-static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void sad64xhx4d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[4],
                                    int ref_stride, uint32_t res[4], int h) {
   sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
 }
 
-static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void sad32xhx4d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[4],
                                    int ref_stride, uint32_t res[4], int h) {
   uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
@@ -334,7 +334,7 @@ static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
   vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
 }
 
-static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void sad16xhx4d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[4],
                                    int ref_stride, uint32_t res[4], int h) {
   uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
@@ -362,7 +362,7 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
   vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
 }
 
-static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void sad8xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4],
                                   int ref_stride, uint32_t res[4], int h) {
   uint16x8_t sum[4];
@@ -390,7 +390,7 @@ static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
   vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
-static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
+static inline void sad4xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4],
                                   int ref_stride, uint32_t res[4], int h) {
   uint16x8_t sum[4];
diff --git a/aom_dsp/arm/sadxd_neon_dotprod.c b/aom_dsp/arm/sadxd_neon_dotprod.c
index 4f9d408847..317e7ce897 100644
--- a/aom_dsp/arm/sadxd_neon_dotprod.c
+++ b/aom_dsp/arm/sadxd_neon_dotprod.c
@@ -18,13 +18,13 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 
-static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+static inline void sad16_neon(uint8x16_t src, uint8x16_t ref,
                               uint32x4_t *const sad_sum) {
   uint8x16_t abs_diff = vabdq_u8(src, ref);
   *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
 }
 
-static INLINE void sadwxhx3d_large_neon_dotprod(const uint8_t *src,
+static inline void sadwxhx3d_large_neon_dotprod(const uint8_t *src,
                                                 int src_stride,
                                                 const uint8_t *const ref[4],
                                                 int ref_stride, uint32_t res[4],
@@ -59,28 +59,28 @@ static INLINE void sadwxhx3d_large_neon_dotprod(const uint8_t *src,
   res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
 }
 
-static INLINE void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+static inline void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
                                             const uint8_t *const ref[4],
                                             int ref_stride, uint32_t res[4],
                                             int h) {
   sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
 }
 
-static INLINE void sad64xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+static inline void sad64xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
                                            const uint8_t *const ref[4],
                                            int ref_stride, uint32_t res[4],
                                            int h) {
   sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
 }
 
-static INLINE void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+static inline void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
                                            const uint8_t *const ref[4],
                                            int ref_stride, uint32_t res[4],
                                            int h) {
   sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
 }
 
-static INLINE void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+static inline void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
                                            const uint8_t *const ref[4],
                                            int ref_stride, uint32_t res[4],
                                            int h) {
@@ -134,7 +134,7 @@ SAD_WXH_3D_NEON_DOTPROD(64, 16)
 
 #undef SAD_WXH_3D_NEON_DOTPROD
 
-static INLINE void sadwxhx4d_large_neon_dotprod(const uint8_t *src,
+static inline void sadwxhx4d_large_neon_dotprod(const uint8_t *src,
                                                 int src_stride,
                                                 const uint8_t *const ref[4],
                                                 int ref_stride, uint32_t res[4],
@@ -177,28 +177,28 @@ static INLINE void sadwxhx4d_large_neon_dotprod(const uint8_t *src,
   vst1q_u32(res, horizontal_add_4d_u32x4(sum));
 }
 
-static INLINE void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+static inline void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
                                             const uint8_t *const ref[4],
                                             int ref_stride, uint32_t res[4],
                                             int h) {
   sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
 }
 
-static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+static inline void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
                                            const uint8_t *const ref[4],
                                            int ref_stride, uint32_t res[4],
                                            int h) {
   sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
 }
 
-static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+static inline void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
                                            const uint8_t *const ref[4],
                                            int ref_stride, uint32_t res[4],
                                            int h) {
   sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
 }
 
-static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+static inline void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
                                            const uint8_t *const ref[4],
                                            int ref_stride, uint32_t res[4],
                                            int h) {
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index b0ed8330f7..54e976c2b1 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -15,7 +15,7 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 
-static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
+static inline void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
                                  uint32x4_t *sse) {
   uint8x16_t s = vld1q_u8(src);
   uint8x16_t r = vld1q_u8(ref);
@@ -28,7 +28,7 @@ static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
   *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi));
 }
 
-static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
+static inline void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
                                 uint32x4_t *sse) {
   uint8x8_t s = vld1_u8(src);
   uint8x8_t r = vld1_u8(ref);
@@ -38,7 +38,7 @@ static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
   *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
 }
 
-static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
+static inline void sse_4x2_neon(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 uint32x4_t *sse) {
   uint8x8_t s = load_unaligned_u8(src, src_stride);
@@ -49,7 +49,7 @@ static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
   *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
 }
 
-static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
+static inline uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int width, int height) {
   uint32x4_t sse = vdupq_n_u32(0);
@@ -85,7 +85,7 @@ static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(sse);
 }
 
-static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
+static inline uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       int height) {
   uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
@@ -108,7 +108,7 @@ static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
+static inline uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int height) {
   uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
@@ -127,7 +127,7 @@ static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
+static inline uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int height) {
   uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
@@ -144,7 +144,7 @@ static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
+static inline uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int height) {
   uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
@@ -163,7 +163,7 @@ static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+static inline uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int height) {
   uint32x4_t sse = vdupq_n_u32(0);
@@ -179,7 +179,7 @@ static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(sse);
 }
 
-static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+static inline uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int height) {
   uint32x4_t sse = vdupq_n_u32(0);
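The dotprod variant that follows replaces the vmull/vpadal squaring chain with a single instruction per 16 bytes. A minimal sketch of that trick (assuming dot-product support, +dotprod; illustrative, not the file's exact kernels): since |s - r|^2 = |s - r| * |s - r|, dotting the absolute-difference vector with itself accumulates squared errors four bytes at a time into u32 lanes.

#include <arm_neon.h>
#include <stdint.h>

static inline uint32_t sse16_row_sketch(const uint8_t *src,
                                        const uint8_t *ref) {
  uint8x16_t abs_diff = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
  // Self-dot: each u32 lane receives the sum of four squared differences.
  uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff);
  return vaddvq_u32(sse);
}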
diff --git a/aom_dsp/arm/sse_neon_dotprod.c b/aom_dsp/arm/sse_neon_dotprod.c
index f9562fc930..92f6921bdf 100644
--- a/aom_dsp/arm/sse_neon_dotprod.c
+++ b/aom_dsp/arm/sse_neon_dotprod.c
@@ -15,7 +15,7 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 
-static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+static inline void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
                                          uint32x4_t *sse) {
   uint8x16_t s = vld1q_u8(src);
   uint8x16_t r = vld1q_u8(ref);
@@ -25,7 +25,7 @@ static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
   *sse = vdotq_u32(*sse, abs_diff, abs_diff);
 }
 
-static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+static inline void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
                                         uint32x2_t *sse) {
   uint8x8_t s = vld1_u8(src);
   uint8x8_t r = vld1_u8(ref);
@@ -35,7 +35,7 @@ static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
   *sse = vdot_u32(*sse, abs_diff, abs_diff);
 }
 
-static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
+static inline void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         uint32x2_t *sse) {
   uint8x8_t s = load_unaligned_u8(src, src_stride);
@@ -46,7 +46,7 @@ static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
   *sse = vdot_u32(*sse, abs_diff, abs_diff);
 }
 
-static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
+static inline uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             int width, int height) {
   uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
@@ -86,7 +86,7 @@ static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
+static inline uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
                                               int src_stride,
                                               const uint8_t *ref,
                                               int ref_stride, int height) {
@@ -110,7 +110,7 @@ static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
   return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
+static inline uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
                                              const uint8_t *ref, int ref_stride,
                                              int height) {
   uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
@@ -129,7 +129,7 @@ static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
+static inline uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
                                              const uint8_t *ref, int ref_stride,
                                              int height) {
   uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
@@ -146,7 +146,7 @@ static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
+static inline uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
                                              const uint8_t *ref, int ref_stride,
                                              int height) {
   uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
@@ -165,7 +165,7 @@ static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+static inline uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             int height) {
   uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
@@ -184,7 +184,7 @@ static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
   return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
 }
 
-static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+static inline uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             int height) {
   uint32x2_t sse = vdup_n_u32(0);
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index a497979574..7ae126e0b2 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -18,7 +18,7 @@
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 
-static INLINE int horizontal_add_u8x8(const uint8x8_t a) {
+static inline int horizontal_add_u8x8(const uint8x8_t a) {
 #if AOM_ARCH_AARCH64
   return vaddlv_u8(a);
 #else
@@ -28,7 +28,7 @@ static INLINE int horizontal_add_u8x8(const uint8x8_t a) {
 #endif
 }
 
-static INLINE int horizontal_add_s16x8(const int16x8_t a) {
+static inline int horizontal_add_s16x8(const int16x8_t a) {
 #if AOM_ARCH_AARCH64
   return vaddlvq_s16(a);
 #else
@@ -40,7 +40,7 @@ static INLINE int horizontal_add_s16x8(const int16x8_t a) {
 #endif
 }
 
-static INLINE int horizontal_add_s32x4(const int32x4_t a) {
+static inline int horizontal_add_s32x4(const int32x4_t a) {
 #if AOM_ARCH_AARCH64
   return vaddvq_s32(a);
 #else
@@ -51,7 +51,7 @@ static INLINE int horizontal_add_s32x4(const int32x4_t a) {
 #endif
 }
 
-static INLINE int64_t horizontal_add_s64x2(const int64x2_t a) {
+static inline int64_t horizontal_add_s64x2(const int64x2_t a) {
 #if AOM_ARCH_AARCH64
   return vaddvq_s64(a);
 #else
@@ -59,7 +59,7 @@ static INLINE int64_t horizontal_add_s64x2(const int64x2_t a) {
 #endif
 }
 
-static INLINE uint64_t horizontal_add_u64x2(const uint64x2_t a) {
+static inline uint64_t horizontal_add_u64x2(const uint64x2_t a) {
 #if AOM_ARCH_AARCH64
   return vaddvq_u64(a);
 #else
@@ -67,7 +67,7 @@ static INLINE uint64_t horizontal_add_u64x2(const uint64x2_t a) {
 #endif
 }
 
-static INLINE uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
+static inline uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
 #if AOM_ARCH_AARCH64
   return vaddlvq_u32(a);
 #else
@@ -76,7 +76,7 @@ static INLINE uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
 #endif
 }
 
-static INLINE int64_t horizontal_long_add_s32x4(const int32x4_t a) {
+static inline int64_t horizontal_long_add_s32x4(const int32x4_t a) {
 #if AOM_ARCH_AARCH64
   return vaddlvq_s32(a);
 #else
@@ -85,7 +85,7 @@ static INLINE int64_t horizontal_long_add_s32x4(const int32x4_t a) {
 #endif
 }
 
-static INLINE uint32_t horizontal_add_u32x4(const uint32x4_t a) {
+static inline uint32_t horizontal_add_u32x4(const uint32x4_t a) {
 #if AOM_ARCH_AARCH64
   return vaddvq_u32(a);
 #else
@@ -96,7 +96,7 @@ static INLINE uint32_t horizontal_add_u32x4(const uint32x4_t a) {
 #endif
 }
 
-static INLINE uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
+static inline uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
 #if AOM_ARCH_AARCH64
   uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
   uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
@@ -111,7 +111,7 @@ static INLINE uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
 #endif
 }
 
-static INLINE int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
+static inline int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
 #if AOM_ARCH_AARCH64
   int32x4_t res01 = vpaddq_s32(sum[0], sum[1]);
   int32x4_t res23 = vpaddq_s32(sum[2], sum[3]);
@@ -126,7 +126,7 @@ static INLINE int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
 #endif
 }
 
-static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
+static inline uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
                                                  const uint16x8_t vec_hi) {
 #if AOM_ARCH_AARCH64
   return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
@@ -143,7 +143,7 @@ static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
 #endif
 }
 
-static INLINE uint32x4_t horizontal_long_add_4d_u16x8(
+static inline uint32x4_t horizontal_long_add_4d_u16x8(
     const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
   const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
@@ -168,7 +168,7 @@ static INLINE uint32x4_t horizontal_long_add_4d_u16x8(
 #endif
 }
 
-static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) {
+static inline uint32_t horizontal_add_u16x8(const uint16x8_t a) {
 #if AOM_ARCH_AARCH64
   return vaddlvq_u16(a);
 #else
@@ -180,7 +180,7 @@ static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) {
 #endif
 }
 
-static INLINE uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
+static inline uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
 #if AOM_ARCH_AARCH64
   const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
   const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
@@ -197,7 +197,7 @@ static INLINE uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
 #endif
 }
 
-static INLINE int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) {
+static inline int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) {
 #if AOM_ARCH_AARCH64
   const int16x8_t a0 = vpaddq_s16(sum[0], sum[1]);
   const int16x8_t a1 = vpaddq_s16(sum[2], sum[3]);
@@ -214,7 +214,7 @@ static INLINE int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) {
 #endif
 }
 
-static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) {
+static inline uint32_t horizontal_add_u32x2(const uint32x2_t a) {
 #if AOM_ARCH_AARCH64
   return vaddv_u32(a);
 #else
@@ -223,7 +223,7 @@ static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) {
 #endif
 }
 
-static INLINE uint64_t horizontal_long_add_u32x2(const uint32x2_t a) {
+static inline uint64_t horizontal_long_add_u32x2(const uint32x2_t a) {
 #if AOM_ARCH_AARCH64
   return vaddlv_u32(a);
 #else
@@ -232,7 +232,7 @@ static INLINE uint64_t horizontal_long_add_u32x2(const uint32x2_t a) {
 #endif
 }
 
-static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
+static inline uint32_t horizontal_add_u16x4(const uint16x4_t a) {
 #if AOM_ARCH_AARCH64
   return vaddlv_u16(a);
 #else
@@ -242,7 +242,7 @@ static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
 #endif
 }
 
-static INLINE int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
+static inline int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
 #if AOM_ARCH_AARCH64
   return vpaddq_s32(a, b);
 #else
@@ -252,7 +252,7 @@ static INLINE int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
 #endif
 }
 
-static INLINE int32x2_t add_pairwise_s32x4(int32x4_t a) {
+static inline int32x2_t add_pairwise_s32x4(int32x4_t a) {
 #if AOM_ARCH_AARCH64
   return vget_low_s32(vpaddq_s32(a, a));
 #else
@@ -260,11 +260,11 @@ static INLINE int32x2_t add_pairwise_s32x4(int32x4_t a) {
 #endif
 }
 
-static INLINE uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) {
+static inline uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) {
   return horizontal_long_add_u32x4(a[0]) + horizontal_long_add_u32x4(a[1]);
 }
 
-static INLINE uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
+static inline uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
   uint64x2_t sum = vpaddlq_u32(a[0]);
   sum = vpadalq_u32(sum, a[1]);
   sum = vpadalq_u32(sum, a[2]);
@@ -273,7 +273,7 @@ static INLINE uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
   return horizontal_add_u64x2(sum);
 }
 
-static INLINE uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
+static inline uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
   uint64x2_t sum[2];
   sum[0] = vpaddlq_u32(a[0]);
   sum[1] = vpaddlq_u32(a[1]);
@@ -287,7 +287,7 @@ static INLINE uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
   return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
 }
 
-static INLINE uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) {
+static inline uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) {
   uint64x2_t sum[2];
   sum[0] = vpaddlq_u32(a[0]);
   sum[1] = vpaddlq_u32(a[1]);
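All of the helpers in sum_neon.h above share one shape: a single cross-lane reduction on AArch64, and a pairwise-add ladder as the Armv7 fallback. A standalone sketch of the u32x4 case (illustrative; the header's own fallbacks may fold differently):

#include <arm_neon.h>
#include <stdint.h>

static inline uint32_t horizontal_add_u32x4_sketch(uint32x4_t a) {
#if defined(__aarch64__)
  return vaddvq_u32(a);  // one ADDV instruction does the whole reduction
#else
  // Armv7: fold the two halves, then add the remaining pair of lanes.
  uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
  sum = vpadd_u32(sum, sum);
  return vget_lane_u32(sum, 0);
#endif
}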
diff --git a/aom_dsp/arm/sum_squares_neon.c b/aom_dsp/arm/sum_squares_neon.c
index 2f5e4b9cad..a02faf8f7a 100644
--- a/aom_dsp/arm/sum_squares_neon.c
+++ b/aom_dsp/arm/sum_squares_neon.c
@@ -16,7 +16,7 @@
 #include "aom_dsp/arm/sum_neon.h"
 #include "config/aom_dsp_rtcd.h"
 
-static INLINE uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src,
+static inline uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src,
                                                        int stride) {
   int16x4_t s0 = vld1_s16(src + 0 * stride);
   int16x4_t s1 = vld1_s16(src + 1 * stride);
@@ -31,7 +31,7 @@ static INLINE uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src,
   return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sum_squares));
 }
 
-static INLINE uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src,
+static inline uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src,
                                                        int stride, int height) {
   int32x4_t sum_squares[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
@@ -55,7 +55,7 @@ static INLINE uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src,
       vreinterpretq_u32_s32(vaddq_s32(sum_squares[0], sum_squares[1])));
 }
 
-static INLINE uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src,
+static inline uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src,
                                                        int stride, int width,
                                                        int height) {
   uint64x2_t sum_squares = vdupq_n_u64(0);
@@ -109,7 +109,7 @@ uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
   }
 }
 
-static INLINE uint64_t aom_sum_sse_2d_i16_4x4_neon(const int16_t *src,
+static inline uint64_t aom_sum_sse_2d_i16_4x4_neon(const int16_t *src,
                                                    int stride, int *sum) {
   int16x4_t s0 = vld1_s16(src + 0 * stride);
   int16x4_t s1 = vld1_s16(src + 1 * stride);
@@ -128,7 +128,7 @@ static INLINE uint64_t aom_sum_sse_2d_i16_4x4_neon(const int16_t *src,
   return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sse));
 }
 
-static INLINE uint64_t aom_sum_sse_2d_i16_4xn_neon(const int16_t *src,
+static inline uint64_t aom_sum_sse_2d_i16_4xn_neon(const int16_t *src,
                                                    int stride, int height,
                                                    int *sum) {
   int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
@@ -160,7 +160,7 @@ static INLINE uint64_t aom_sum_sse_2d_i16_4xn_neon(const int16_t *src,
       vreinterpretq_u32_s32(vaddq_s32(sse[0], sse[1])));
 }
 
-static INLINE uint64_t aom_sum_sse_2d_i16_nxn_neon(const int16_t *src,
+static inline uint64_t aom_sum_sse_2d_i16_nxn_neon(const int16_t *src,
                                                    int stride, int width,
                                                    int height, int *sum) {
   uint64x2_t sse = vdupq_n_u64(0);
@@ -224,7 +224,7 @@ uint64_t aom_sum_sse_2d_i16_neon(const int16_t *src, int stride, int width,
   return sse;
 }
 
-static INLINE uint64_t aom_sum_squares_i16_4xn_neon(const int16_t *src,
+static inline uint64_t aom_sum_squares_i16_4xn_neon(const int16_t *src,
                                                     uint32_t n) {
   uint64x2_t sum_u64 = vdupq_n_u64(0);
@@ -247,7 +247,7 @@ static INLINE uint64_t aom_sum_squares_i16_4xn_neon(const int16_t *src,
   return horizontal_add_u64x2(sum_u64);
 }
 
-static INLINE uint64_t aom_sum_squares_i16_8xn_neon(const int16_t *src,
+static inline uint64_t aom_sum_squares_i16_8xn_neon(const int16_t *src,
                                                     uint32_t n) {
   uint64x2_t sum_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
@@ -287,7 +287,7 @@ uint64_t aom_sum_squares_i16_neon(const int16_t *src, uint32_t n) {
   return aom_sum_squares_i16_c(src, n);
 }
 
-static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
+static inline uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
                                               int width, int height) {
   uint64_t sum = 0;
   uint64_t sse = 0;
@@ -344,7 +344,7 @@ static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
   return sse - sum * sum / (width * height);
 }
 
-static INLINE uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride,
+static inline uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride,
                                               int width, int height) {
   uint64_t sum = 0;
   uint64_t sse = 0;
@@ -401,7 +401,7 @@ static INLINE uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride,
   return sse - sum * sum / (width * height);
 }
 
-static INLINE uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride,
+static inline uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride,
                                                int width, int height) {
   uint64_t sum = 0;
   uint64_t sse = 0;
@@ -474,7 +474,7 @@ uint64_t aom_var_2d_u8_neon(uint8_t *src, int src_stride, int width,
   return aom_var_2d_u8_c(src, src_stride, width, height);
 }
 
-static INLINE uint64_t aom_var_2d_u16_4xh_neon(uint8_t *src, int src_stride,
+static inline uint64_t aom_var_2d_u16_4xh_neon(uint8_t *src, int src_stride,
                                                int width, int height) {
   uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
   uint64_t sum = 0;
@@ -517,7 +517,7 @@ static INLINE uint64_t aom_var_2d_u16_4xh_neon(uint8_t *src, int src_stride,
   return sse - sum * sum / (width * height);
 }
 
-static INLINE uint64_t aom_var_2d_u16_8xh_neon(uint8_t *src, int src_stride,
+static inline uint64_t aom_var_2d_u16_8xh_neon(uint8_t *src, int src_stride,
                                                int width, int height) {
   uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
   uint64_t sum = 0;
diff --git a/aom_dsp/arm/sum_squares_neon_dotprod.c b/aom_dsp/arm/sum_squares_neon_dotprod.c
index 2f6d1a566a..24f329afd4 100644
--- a/aom_dsp/arm/sum_squares_neon_dotprod.c
+++ b/aom_dsp/arm/sum_squares_neon_dotprod.c
@@ -16,7 +16,7 @@
 #include "aom_dsp/arm/sum_neon.h"
 #include "config/aom_dsp_rtcd.h"
 
-static INLINE uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src,
+static inline uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src,
                                                       int src_stride, int width,
                                                       int height) {
   uint64_t sum = 0;
@@ -57,7 +57,7 @@ static INLINE uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src,
   return sse - sum * sum / (width * height);
 }
 
-static INLINE uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src,
+static inline uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src,
                                                       int src_stride, int width,
                                                       int height) {
   uint64_t sum = 0;
@@ -98,7 +98,7 @@ static INLINE uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src,
   return sse - sum * sum / (width * height);
 }
 
-static INLINE uint64_t aom_var_2d_u8_16xh_neon_dotprod(uint8_t *src,
+static inline uint64_t aom_var_2d_u8_16xh_neon_dotprod(uint8_t *src,
                                                        int src_stride, int width,
                                                        int height) {
   uint64_t sum = 0;
diff --git a/aom_dsp/arm/sum_squares_sve.c b/aom_dsp/arm/sum_squares_sve.c
index 0d132dbd0d..a9734f6331 100644
--- a/aom_dsp/arm/sum_squares_sve.c
+++ b/aom_dsp/arm/sum_squares_sve.c
@@ -15,7 +15,7 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "config/aom_dsp_rtcd.h"
 
-static INLINE uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src,
+static inline uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src,
                                                       int stride, int height) {
   int64x2_t sum_squares = vdupq_n_s64(0);
@@ -31,7 +31,7 @@ static INLINE uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src,
   return (uint64_t)vaddvq_s64(sum_squares);
 }
 
-static INLINE uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src,
+static inline uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src,
                                                       int stride, int height) {
   int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
@@ -50,7 +50,7 @@ static INLINE uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src,
   return (uint64_t)vaddvq_s64(sum_squares[0]);
 }
 
-static INLINE uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src,
+static inline uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src,
                                                         int stride, int width,
                                                         int height) {
   int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
@@ -76,7 +76,7 @@ static INLINE uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src,
   return (uint64_t)vaddvq_s64(sum_squares[0]);
 }
 
-static INLINE uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src,
+static inline uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src,
                                                       int stride, int width,
                                                       int height) {
   svint64_t sum_squares = svdup_n_s64(0);
@@ -147,7 +147,7 @@ uint64_t aom_sum_squares_i16_sve(const int16_t *src, uint32_t n) {
   return aom_sum_squares_i16_c(src, n);
 }
 
-static INLINE uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src,
+static inline uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src,
                                                   int stride, int height,
                                                   int *sum) {
   int64x2_t sse = vdupq_n_s64(0);
@@ -168,7 +168,7 @@ static INLINE uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src,
   return vaddvq_s64(sse);
 }
 
-static INLINE uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src,
+static inline uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src,
                                                   int stride, int height,
                                                   int *sum) {
   int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
@@ -192,7 +192,7 @@ static INLINE uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src,
   return vaddvq_s64(vaddq_s64(sse[0], sse[1]));
 }
 
-static INLINE uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src,
+static inline uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src,
                                                    int stride, int width,
                                                    int height, int *sum) {
   int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
@@ -237,7 +237,7 @@ uint64_t aom_sum_sse_2d_i16_sve(const int16_t *src, int stride, int width,
   return sse;
 }
 
-static INLINE uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride,
+static inline uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride,
                                               int width, int height) {
   uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
   uint64_t sum = 0;
@@ -264,7 +264,7 @@ static INLINE uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride,
   return sse - sum * sum / (width * height);
 }
 
-static INLINE uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride,
+static inline uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride,
                                               int width, int height) {
   uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
   uint64_t sum = 0;
@@ -296,7 +296,7 @@ static INLINE uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride,
   return sse - sum * sum / (width * height);
 }
 
-static INLINE uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride,
+static inline uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride,
                                                int width, int height) {
   uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
   uint64_t sum = 0;
@@ -334,7 +334,7 @@ static INLINE uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride,
   return sse - sum * sum / (width * height);
 }
 
-static INLINE uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride,
+static inline uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride,
                                                 int width, int height) {
   uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
   uint64_t sum = 0;
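The aom_var_2d_* helpers in the files above all finish with the same algebraic step: since sum((x - s/n)^2) = sum(x^2) - s^2/n with s = sum(x), the vector loops only need to produce the two raw moments. A trivial sketch of that final step (the integer division mirrors what the helpers return):

#include <stdint.h>

static inline uint64_t var_from_moments_sketch(uint64_t sse, uint64_t sum,
                                               uint64_t n) {
  // Sum of squared deviations from the mean, computed from raw moments.
  return sse - sum * sum / n;
}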
#include "config/aom_config.h" -static INLINE void transpose_elems_u8_8x8( +static inline void transpose_elems_u8_8x8( uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4, uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1, uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6, @@ -71,7 +71,7 @@ static INLINE void transpose_elems_u8_8x8( *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); } -static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, +static inline void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5, uint8x8_t *a6, @@ -80,7 +80,7 @@ static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, a4, a5, a6, a7); } -static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in, +static inline void transpose_arrays_u8_8x8(const uint8x8_t *in, uint8x8_t *out) { transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], &out[0], &out[1], &out[2], &out[3], &out[4], &out[5], @@ -189,7 +189,7 @@ static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x, d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1])); } -static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { +static inline uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; #if AOM_ARCH_AARCH64 b0.val[0] = vreinterpretq_u16_u64( @@ -205,7 +205,7 @@ static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { return b0; } -static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x, +static inline void transpose_arrays_u8_16x16(const uint8x16_t *x, uint8x16_t *d) { uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); @@ -292,7 +292,7 @@ static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x, transpose_arrays_u8_16x16(x2 + 16, d + 16); } -static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, +static inline void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3) { // Swap 8 bit elements. Goes from: @@ -326,7 +326,7 @@ static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, *a3 = vreinterpret_u8_u16(c1.val[1]); } -static INLINE void transpose_elems_inplace_u8_16x4(uint8x16_t *a0, +static inline void transpose_elems_inplace_u8_16x4(uint8x16_t *a0, uint8x16_t *a1, uint8x16_t *a2, uint8x16_t *a3) { @@ -361,7 +361,7 @@ static INLINE void transpose_elems_inplace_u8_16x4(uint8x16_t *a0, *a3 = vreinterpretq_u8_u16(c1.val[1]); } -static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0, +static inline void transpose_elems_inplace_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { // Swap 16 bit elements. 
Goes from: // a0: 00 01 02 03 10 11 12 13 @@ -391,7 +391,7 @@ static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0, *a1 = d0.val[1]; } -static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1, +static inline void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4, uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, @@ -449,7 +449,7 @@ static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1, *o3 = d1.val[1]; } -static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) { +static inline void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) { // Input: // 00 01 02 03 // 10 11 12 13 @@ -486,7 +486,7 @@ static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) { a[3] = vreinterpret_u16_u32(e.val[1]); } -static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) { +static inline void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) { // 4x8 Input: // a[0]: 00 01 02 03 04 05 06 07 // a[1]: 10 11 12 13 14 15 16 17 @@ -545,7 +545,7 @@ static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) { // order of the low halves also restored relative to the high halves. This is // preferable because it puts all values from the same source row back together, // but some post-processing is inevitable. -static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) { +static inline void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) { // b0.val[0]: 00 10 02 12 04 14 06 16 // b0.val[1]: 01 11 03 13 05 15 07 17 // b1.val[0]: 20 30 22 32 24 34 26 36 @@ -589,7 +589,7 @@ static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) { a[3] = d0.val[0]; // p3q3 } -static INLINE void transpose_elems_u16_4x8( +static inline void transpose_elems_u16_4x8( const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2, const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5, const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1, @@ -640,7 +640,7 @@ static INLINE void transpose_elems_u16_4x8( *o3 = vreinterpretq_u16_u32(d1.val[1]); } -static INLINE void transpose_elems_s16_4x8( +static inline void transpose_elems_s16_4x8( const int16x4_t a0, const int16x4_t a1, const int16x4_t a2, const int16x4_t a3, const int16x4_t a4, const int16x4_t a5, const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1, @@ -691,7 +691,7 @@ static INLINE void transpose_elems_s16_4x8( *o3 = vreinterpretq_s16_s32(d1.val[1]); } -static INLINE void transpose_elems_inplace_u16_8x8( +static inline void transpose_elems_inplace_u16_8x8( uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3, uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) { // Swap 16 bit elements. 
Goes from: @@ -762,7 +762,7 @@ static INLINE void transpose_elems_inplace_u16_8x8( *a7 = d3.val[1]; } -static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { +static inline int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; #if AOM_ARCH_AARCH64 b0.val[0] = vreinterpretq_s16_s64( @@ -778,7 +778,7 @@ static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { return b0; } -static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1, +static inline void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, @@ -851,7 +851,7 @@ static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1, *a7 = d3.val[1]; } -static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a, +static inline void transpose_arrays_s16_8x8(const int16x8_t *a, int16x8_t *out) { // Swap 16 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 @@ -921,7 +921,7 @@ static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a, out[7] = d3.val[1]; } -static INLINE void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1, +static inline void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3) { // Swap 16 bit elements. Goes from: @@ -955,7 +955,7 @@ static INLINE void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1, *a3 = vreinterpretq_s16_s32(c1.val[1]); } -static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0, +static inline void transpose_elems_inplace_u16_4x4(uint16x4_t *a0, uint16x4_t *a1, uint16x4_t *a2, uint16x4_t *a3) { @@ -990,7 +990,7 @@ static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0, *a3 = vreinterpret_u16_u32(c1.val[1]); } -static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1, +static inline void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1, int16x4_t *a2, int16x4_t *a3) { // Swap 16 bit elements. 
Goes from: @@ -1024,7 +1024,7 @@ static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1, *a3 = vreinterpret_s16_s32(c1.val[1]); } -static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { +static inline int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { int32x4x2_t b0; #if AOM_ARCH_AARCH64 b0.val[0] = vreinterpretq_s32_s64( @@ -1038,7 +1038,7 @@ static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { return b0; } -static INLINE void transpose_elems_s32_4x4(const int32x4_t a0, +static inline void transpose_elems_s32_4x4(const int32x4_t a0, const int32x4_t a1, const int32x4_t a2, const int32x4_t a3, int32x4_t *o0, @@ -1073,13 +1073,13 @@ static INLINE void transpose_elems_s32_4x4(const int32x4_t a0, *o3 = c1.val[1]; } -static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1, +static inline void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1, int32x4_t *a2, int32x4_t *a3) { transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3); } -static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in, +static inline void transpose_arrays_s32_4x4(const int32x4_t *in, int32x4_t *out) { transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2], &out[3]); @@ -1124,7 +1124,7 @@ TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32) #undef TRANSPOSE_ARRAYS_S32_WXH_NEON -static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) { +static inline int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) { #if AOM_ARCH_AARCH64 return vtrn1q_s64(a, b); #else @@ -1132,7 +1132,7 @@ static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) { #endif } -static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) { +static inline int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) { #if AOM_ARCH_AARCH64 return vtrn2q_s64(a, b); #else @@ -1140,7 +1140,7 @@ static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) { #endif } -static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1, +static inline void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1, int32x4_t a2, int32x4_t a3, int32x4_t a4, int32x4_t a5, int32x4_t a6, int32x4_t a7, @@ -1165,7 +1165,7 @@ static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1, o3->val[1] = a7; } -static INLINE void transpose_elems_inplace_s32_8x8( +static inline void transpose_elems_inplace_s32_8x8( int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3, int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) { // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4 @@ -1219,7 +1219,7 @@ static INLINE void transpose_elems_inplace_s32_8x8( a7->val[1] = q3_v4; } -static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in, +static inline void transpose_arrays_s16_4x4(const int16x4_t *const in, int16x4_t *const out) { int16x4_t a0 = in[0]; int16x4_t a1 = in[1]; @@ -1234,7 +1234,7 @@ static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in, out[3] = a3; } -static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in, +static inline void transpose_arrays_s16_4x8(const int16x4_t *const in, int16x8_t *const out) { #if AOM_ARCH_AARCH64 const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)), @@ -1283,7 +1283,7 @@ static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in, #endif } -static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in, +static inline void transpose_arrays_s16_8x4(const int16x8_t *const in, 
int16x4_t *const out) { // Swap 16 bit elements. Goes from: // in[0]: 00 01 02 03 04 05 06 07 diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c index b37e94d495..74524add01 100644 --- a/aom_dsp/arm/variance_neon.c +++ b/aom_dsp/arm/variance_neon.c @@ -18,7 +18,7 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride, +static inline void variance_4xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); @@ -48,7 +48,7 @@ static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride, *sse = (uint32_t)horizontal_add_s32x4(sse_s32); } -static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride, +static inline void variance_8xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); @@ -78,7 +78,7 @@ static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride, *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride, +static inline void variance_16xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; @@ -118,7 +118,7 @@ static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride, *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -static INLINE void variance_large_neon(const uint8_t *src, int src_stride, +static inline void variance_large_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, int h_limit, uint32_t *sse, int *sum) { @@ -174,19 +174,19 @@ static INLINE void variance_large_neon(const uint8_t *src, int src_stride, *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, +static inline void variance_32xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); } -static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, +static inline void variance_64xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); } -static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride, +static inline void variance_128xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { variance_large_neon(src, src_stride, ref, ref_stride, 128, h, 16, sse, sum); @@ -275,7 +275,7 @@ void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride, } } -static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride, +static inline unsigned int mse8xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse, int h) { uint8x8_t s[2], r[2]; @@ -316,7 +316,7 @@ static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride, return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0])); } -static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride, +static inline unsigned int mse16xh_neon(const 
uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse, int h) { uint8x16_t s[2], r[2]; @@ -387,7 +387,7 @@ MSE_WXH_NEON(16, 16) #undef MSE_WXH_NEON -static INLINE uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum, +static inline uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum, uint16x8_t s0, uint16x8_t s1, uint8x8_t d0, uint8x8_t d1) { int16x8_t e0 = vreinterpretq_s16_u16(vsubw_u8(s0, d0)); diff --git a/aom_dsp/arm/variance_neon_dotprod.c b/aom_dsp/arm/variance_neon_dotprod.c index 8addf0efe1..dd03443a7a 100644 --- a/aom_dsp/arm/variance_neon_dotprod.c +++ b/aom_dsp/arm/variance_neon_dotprod.c @@ -18,7 +18,7 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE void variance_4xh_neon_dotprod(const uint8_t *src, int src_stride, +static inline void variance_4xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { uint32x4_t src_sum = vdupq_n_u32(0); @@ -47,7 +47,7 @@ static INLINE void variance_4xh_neon_dotprod(const uint8_t *src, int src_stride, *sse = horizontal_add_u32x4(sse_u32); } -static INLINE void variance_8xh_neon_dotprod(const uint8_t *src, int src_stride, +static inline void variance_8xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { uint32x4_t src_sum = vdupq_n_u32(0); @@ -76,7 +76,7 @@ static INLINE void variance_8xh_neon_dotprod(const uint8_t *src, int src_stride, *sse = horizontal_add_u32x4(sse_u32); } -static INLINE void variance_16xh_neon_dotprod(const uint8_t *src, +static inline void variance_16xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, @@ -106,7 +106,7 @@ static INLINE void variance_16xh_neon_dotprod(const uint8_t *src, *sse = horizontal_add_u32x4(sse_u32); } -static INLINE void variance_large_neon_dotprod(const uint8_t *src, +static inline void variance_large_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, @@ -141,7 +141,7 @@ static INLINE void variance_large_neon_dotprod(const uint8_t *src, *sse = horizontal_add_u32x4(sse_u32); } -static INLINE void variance_32xh_neon_dotprod(const uint8_t *src, +static inline void variance_32xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, @@ -150,7 +150,7 @@ static INLINE void variance_32xh_neon_dotprod(const uint8_t *src, sum); } -static INLINE void variance_64xh_neon_dotprod(const uint8_t *src, +static inline void variance_64xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, @@ -159,7 +159,7 @@ static INLINE void variance_64xh_neon_dotprod(const uint8_t *src, sum); } -static INLINE void variance_128xh_neon_dotprod(const uint8_t *src, +static inline void variance_128xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, @@ -247,7 +247,7 @@ void aom_get_var_sse_sum_16x16_dual_neon_dotprod( } } -static INLINE unsigned int mse8xh_neon_dotprod(const uint8_t *src, +static inline unsigned int mse8xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, @@ -272,7 +272,7 @@ static INLINE unsigned int mse8xh_neon_dotprod(const uint8_t *src, return horizontal_add_u32x4(sse_u32); } -static INLINE unsigned int mse16xh_neon_dotprod(const uint8_t *src, +static inline unsigned int mse16xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, diff --git 
a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h index b27bc5913f..cd4716b04f 100644 --- a/aom_dsp/bitreader.h +++ b/aom_dsp/bitreader.h @@ -83,7 +83,7 @@ uint32_t aom_reader_tell(const aom_reader *r); uint32_t aom_reader_tell_frac(const aom_reader *r); #if CONFIG_ACCOUNTING -static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { +static inline void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { if (r->accounting != NULL) { uint32_t tell_frac; tell_frac = aom_reader_tell_frac(r); @@ -93,7 +93,7 @@ static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { } } -static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) { +static inline void aom_update_symb_counts(const aom_reader *r, int is_binary) { if (r->accounting != NULL) { r->accounting->syms.num_multi_syms += !is_binary; r->accounting->syms.num_binary_syms += !!is_binary; @@ -101,7 +101,7 @@ static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) { } #endif -static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { +static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { int p = (0x7FFFFF - (prob << 15) + prob) >> 8; int bit = od_ec_decode_bool_q15(&r->ec, p); @@ -146,7 +146,7 @@ static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { return bit; } -static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { +static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { int ret; ret = aom_read(r, 128, NULL); // aom_prob_half #if CONFIG_ACCOUNTING @@ -155,7 +155,7 @@ static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { return ret; } -static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { +static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { int literal = 0, bit; for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit; @@ -165,7 +165,7 @@ static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { return literal; } -static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, +static inline int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, int nsymbs ACCT_STR_PARAM) { int symb; assert(cdf != NULL); @@ -217,7 +217,7 @@ static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, return symb; } -static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, +static inline int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs ACCT_STR_PARAM) { int ret; ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h index 3c566889a4..5cb26169e9 100644 --- a/aom_dsp/bitwriter.h +++ b/aom_dsp/bitwriter.h @@ -48,7 +48,7 @@ typedef struct TOKEN_STATS { #endif } TOKEN_STATS; -static INLINE void init_token_stats(TOKEN_STATS *token_stats) { +static inline void init_token_stats(TOKEN_STATS *token_stats) { #if CONFIG_RD_DEBUG int r, c; for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { @@ -68,7 +68,7 @@ int aom_stop_encode(aom_writer *w); int aom_tell_size(aom_writer *w); -static INLINE void aom_write(aom_writer *w, int bit, int probability) { +static inline void aom_write(aom_writer *w, int bit, int probability) { int p = (0x7FFFFF - (probability << 15) + probability) >> 8; #if CONFIG_BITSTREAM_DEBUG aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 }; @@ -78,17 +78,17 @@ static INLINE void aom_write(aom_writer *w, int bit, int probability) { od_ec_encode_bool_q15(&w->ec, bit, p); } -static INLINE void aom_write_bit(aom_writer 
*w, int bit) { +static inline void aom_write_bit(aom_writer *w, int bit) { aom_write(w, bit, 128); // aom_prob_half } -static INLINE void aom_write_literal(aom_writer *w, int data, int bits) { +static inline void aom_write_literal(aom_writer *w, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit)); } -static INLINE void aom_write_cdf(aom_writer *w, int symb, +static inline void aom_write_cdf(aom_writer *w, int symb, const aom_cdf_prob *cdf, int nsymbs) { #if CONFIG_BITSTREAM_DEBUG bitstream_queue_push(symb, cdf, nsymbs); @@ -97,7 +97,7 @@ static INLINE void aom_write_cdf(aom_writer *w, int symb, od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs); } -static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, +static inline void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, int nsymbs) { aom_write_cdf(w, symb, cdf, nsymbs); if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs); diff --git a/aom_dsp/fft.c b/aom_dsp/fft.c index cd3d2f77eb..9787dc0d71 100644 --- a/aom_dsp/fft.c +++ b/aom_dsp/fft.c @@ -13,7 +13,7 @@ #include "aom_dsp/fft_common.h" #include "config/aom_dsp_rtcd.h" -static INLINE void simple_transpose(const float *A, float *B, int n) { +static inline void simple_transpose(const float *A, float *B, int n) { for (int y = 0; y < n; y++) { for (int x = 0; x < n; x++) { B[y * n + x] = A[x * n + y]; @@ -31,7 +31,7 @@ static INLINE void simple_transpose(const float *A, float *B, int n) { // imaginary part of transformed imaginary columns. This function assembles // the correct outputs while putting the real and imaginary components // next to each other. -static INLINE void unpack_2d_output(const float *col_fft, float *output, +static inline void unpack_2d_output(const float *col_fft, float *output, int n) { for (int y = 0; y <= n / 2; ++y) { const int y2 = y + n / 2; @@ -72,10 +72,10 @@ void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, unpack(temp, output, n); } -static INLINE void store_float(float *output, float input) { *output = input; } -static INLINE float add_float(float a, float b) { return a + b; } -static INLINE float sub_float(float a, float b) { return a - b; } -static INLINE float mul_float(float a, float b) { return a * b; } +static inline void store_float(float *output, float input) { *output = input; } +static inline float add_float(float a, float b) { return a + b; } +static inline float sub_float(float a, float b) { return a - b; } +static inline float mul_float(float a, float b) { return a * b; } GEN_FFT_2(void, float, float, float, *, store_float) GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.c b/aom_dsp/flow_estimation/arm/disflow_neon.c index e539e76322..d6c0e0b050 100644 --- a/aom_dsp/flow_estimation/arm/disflow_neon.c +++ b/aom_dsp/flow_estimation/arm/disflow_neon.c @@ -24,7 +24,7 @@ // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between // the two regions. 
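As background for the hunk below: the comment above describes scoring a patch at (x, y) in src against one displaced by (u, v) in ref, returning the sum of squared pixel differences. A minimal scalar sketch of that comparison, assuming integer displacements and ignoring the cubic interpolation the real routine applies for fractional (u, v); patch_ssd and its parameters are illustrative names, not the library's:

#include <stdint.h>

// Sum of squared differences between a size x size patch at (x, y) in src
// and one at (x + u, y + v) in ref, for integer (u, v). The real routine
// additionally handles fractional offsets via the cubic kernels set up above.
static int64_t patch_ssd(const uint8_t *src, const uint8_t *ref, int stride,
                         int x, int y, int u, int v, int size) {
  int64_t ssd = 0;
  for (int i = 0; i < size; i++) {
    for (int j = 0; j < size; j++) {
      const int d = src[(y + i) * stride + (x + j)] -
                    ref[(y + v + i) * stride + (x + u + j)];
      ssd += (int64_t)d * d;
    }
  }
  return ssd;
}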
-static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, +static inline void compute_flow_error(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, int16_t *dt) { // Split offset into integer and fractional parts, and compute cubic @@ -160,7 +160,7 @@ static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, // // b = |sum(dx * dt)| // |sum(dy * dt)| -static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, +static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M_inv) { int32x4_t sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), @@ -208,7 +208,7 @@ static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, M_inv[3] = M0 * det_inv; } -static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride, +static inline void compute_flow_vector(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, const int16_t *dt, int dt_stride, int *b) { diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.h b/aom_dsp/flow_estimation/arm/disflow_neon.h index 80827da66d..5b0fb8cbfe 100644 --- a/aom_dsp/flow_estimation/arm/disflow_neon.h +++ b/aom_dsp/flow_estimation/arm/disflow_neon.h @@ -21,7 +21,7 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { +static inline void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. @@ -38,7 +38,7 @@ static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { kernel[3] = -0.5 * x2 + 0.5 * x3; } -static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { +static inline void get_cubic_kernel_int(double x, int kernel[4]) { double kernel_dbl[4]; get_cubic_kernel_dbl(x, kernel_dbl); @@ -48,7 +48,7 @@ static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); } -static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, +static inline void sobel_filter_x(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride) { int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; @@ -87,7 +87,7 @@ static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, } } -static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, +static inline void sobel_filter_y(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride) { int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; diff --git a/aom_dsp/flow_estimation/arm/disflow_sve.c b/aom_dsp/flow_estimation/arm/disflow_sve.c index f399843b16..c2160a02a9 100644 --- a/aom_dsp/flow_estimation/arm/disflow_sve.c +++ b/aom_dsp/flow_estimation/arm/disflow_sve.c @@ -30,7 +30,7 @@ DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between // the two regions. 
-static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, +static inline void compute_flow_error(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, int16_t *dt) { // Split offset into integer and fractional parts, and compute cubic @@ -169,7 +169,7 @@ static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, // // b = |sum(dx * dt)| // |sum(dy * dt)| -static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, +static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M_inv) { int64x2_t sum[3] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; @@ -213,7 +213,7 @@ static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, M_inv[3] = M0 * det_inv; } -static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride, +static inline void compute_flow_vector(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, const int16_t *dt, int dt_stride, int *b) { diff --git a/aom_dsp/flow_estimation/disflow.c b/aom_dsp/flow_estimation/disflow.c index 1e5a675654..a1d74914ab 100644 --- a/aom_dsp/flow_estimation/disflow.c +++ b/aom_dsp/flow_estimation/disflow.c @@ -63,7 +63,7 @@ static double flow_upscale_filter[2][FLOW_UPSCALE_TAPS] = { { -9 / 128., 111 / 128., 29 / 128., -3 / 128. } }; -static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { +static inline void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. @@ -80,7 +80,7 @@ static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { kernel[3] = -0.5 * x2 + 0.5 * x3; } -static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { +static inline void get_cubic_kernel_int(double x, int kernel[4]) { double kernel_dbl[4]; get_cubic_kernel_dbl(x, kernel_dbl); @@ -90,18 +90,18 @@ static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); } -static INLINE double get_cubic_value_dbl(const double *p, +static inline double get_cubic_value_dbl(const double *p, const double kernel[4]) { return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + kernel[3] * p[3]; } -static INLINE int get_cubic_value_int(const int *p, const int kernel[4]) { +static inline int get_cubic_value_int(const int *p, const int kernel[4]) { return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + kernel[3] * p[3]; } -static INLINE double bicubic_interp_one(const double *arr, int stride, +static inline double bicubic_interp_one(const double *arr, int stride, const double h_kernel[4], const double v_kernel[4]) { double tmp[1 * 4]; @@ -191,7 +191,7 @@ static int determine_disflow_correspondence(const ImagePyramid *src_pyr, // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between // the two regions. 
-static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, +static inline void compute_flow_vector(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, const int16_t *dx, const int16_t *dy, @@ -278,7 +278,7 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, } } -static INLINE void sobel_filter(const uint8_t *src, int src_stride, +static inline void sobel_filter(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int dir) { int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; @@ -359,7 +359,7 @@ static INLINE void sobel_filter(const uint8_t *src, int src_stride, // // b = |sum(dx * dt)| // |sum(dy * dt)| -static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, +static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M) { int tmp[4] = { 0 }; @@ -399,7 +399,7 @@ static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, // The regularization term `+ k * I` further ensures that det M >= k^2. // As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. // So we don't have to worry about non-invertible matrices here. -static INLINE void invert_2x2(const double *M, double *M_inv) { +static inline void invert_2x2(const double *M, double *M_inv) { double det = (M[0] * M[3]) - (M[1] * M[2]); assert(det >= 1); const double det_inv = 1 / det; diff --git a/aom_dsp/flow_estimation/x86/disflow_avx2.c b/aom_dsp/flow_estimation/x86/disflow_avx2.c index 7806100ebd..b93acdd418 100644 --- a/aom_dsp/flow_estimation/x86/disflow_avx2.c +++ b/aom_dsp/flow_estimation/x86/disflow_avx2.c @@ -30,7 +30,7 @@ // This is chosen because it takes less work than fully separating the kernels, // but it is separated enough that we can pick out each coefficient pair in the // main compute_flow_at_point function -static INLINE __m128i compute_cubic_kernels(double u, double v) { +static inline __m128i compute_cubic_kernels(double u, double v) { const __m128d x = _mm_set_pd(v, u); const __m128d x2 = _mm_mul_pd(x, x); @@ -81,7 +81,7 @@ static INLINE __m128i compute_cubic_kernels(double u, double v) { // // TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation // instad of bicubic interpolation -static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, +static inline void compute_flow_vector(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, const int16_t *dx, const int16_t *dy, @@ -267,7 +267,7 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, // Compute the x and y gradients of the source patch in a single pass, // and store into dx and dy respectively. 
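The gradient computation described in the comment above is, in scalar form, a pair of 3x3 Sobel responses. A rough sketch for a single interior output position, assuming the conventional kernels; the library's sign, scaling, and border conventions may differ, and sobel_at is an illustrative name:

#include <stdint.h>

// Horizontal and vertical 3x3 Sobel taps around one pixel:
// dx kernel [-1 0 1; -2 0 2; -1 0 1], dy kernel [-1 -2 -1; 0 0 0; 1 2 1].
static void sobel_at(const uint8_t *src, int stride, int16_t *dx,
                     int16_t *dy) {
  const uint8_t *r0 = src - stride;  // row above
  const uint8_t *r1 = src;           // current row
  const uint8_t *r2 = src + stride;  // row below
  *dx = (int16_t)((r0[1] - r0[-1]) + 2 * (r1[1] - r1[-1]) + (r2[1] - r2[-1]));
  *dy = (int16_t)((r2[-1] - r0[-1]) + 2 * (r2[0] - r0[0]) + (r2[1] - r0[1]));
}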
-static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, +static inline void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, int16_t *dy) { const __m256i zero = _mm256_setzero_si256(); @@ -328,7 +328,7 @@ static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, } } -static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, +static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M) { __m256i acc[4] = { 0 }; @@ -371,7 +371,7 @@ static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, // The regularization term `+ k * I` further ensures that det M >= k^2. // As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. // So we don't have to worry about non-invertible matrices here. -static INLINE void invert_2x2(const double *M, double *M_inv) { +static inline void invert_2x2(const double *M, double *M_inv) { double det = (M[0] * M[3]) - (M[1] * M[2]); assert(det >= 1); const double det_inv = 1 / det; diff --git a/aom_dsp/flow_estimation/x86/disflow_sse4.c b/aom_dsp/flow_estimation/x86/disflow_sse4.c index 3743b88dfc..ffbe4858b5 100644 --- a/aom_dsp/flow_estimation/x86/disflow_sse4.c +++ b/aom_dsp/flow_estimation/x86/disflow_sse4.c @@ -29,7 +29,7 @@ // This is chosen because it takes less work than fully separating the kernels, // but it is separated enough that we can pick out each coefficient pair in the // main compute_flow_at_point function -static INLINE __m128i compute_cubic_kernels(double u, double v) { +static inline __m128i compute_cubic_kernels(double u, double v) { const __m128d x = _mm_set_pd(v, u); const __m128d x2 = _mm_mul_pd(x, x); @@ -80,7 +80,7 @@ static INLINE __m128i compute_cubic_kernels(double u, double v) { // // TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation // instad of bicubic interpolation -static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, +static inline void compute_flow_vector(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, const int16_t *dx, const int16_t *dy, @@ -245,7 +245,7 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, // Compute the x and y gradients of the source patch in a single pass, // and store into dx and dy respectively. -static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, +static inline void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, int16_t *dy) { // Loop setup: Load the first two rows (of 10 input rows) and apply // the horizontal parts of the two filters @@ -304,7 +304,7 @@ static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, } } -static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, +static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M) { __m128i acc[4] = { 0 }; @@ -346,7 +346,7 @@ static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, // The regularization term `+ k * I` further ensures that det M >= k^2. // As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. // So we don't have to worry about non-invertible matrices here. 
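For reference, the closed form of a 2x2 inverse, which the det/det_inv setup in invert_2x2 below is building toward, is the adjugate scaled by the reciprocal determinant. Writing M = [ m0 m1; m2 m3 ] and det = m0*m3 - m1*m2 (which the comment above bounds below by 1):

    M_inv = (1/det) * [  m3  -m1 ]
                      [ -m2   m0 ]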
-static INLINE void invert_2x2(const double *M, double *M_inv) { +static inline void invert_2x2(const double *M, double *M_inv) { double det = (M[0] * M[3]) - (M[1] * M[2]); assert(det >= 1); const double det_inv = 1 / det; diff --git a/aom_dsp/grain_params.h b/aom_dsp/grain_params.h index 2a043f7b84..6f0c5a27f1 100644 --- a/aom_dsp/grain_params.h +++ b/aom_dsp/grain_params.h @@ -98,7 +98,7 @@ typedef struct { * \param[in] pb The second set of parameters to compare * \return Returns 1 if the params are equivalent, 0 otherwise */ -static INLINE int aom_check_grain_params_equiv( +static inline int aom_check_grain_params_equiv( const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) { if (pa->apply_grain != pb->apply_grain) return 0; // Don't compare update_parameters diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c index a1c48ccb63..ad7a25cf03 100644 --- a/aom_dsp/intrapred.c +++ b/aom_dsp/intrapred.c @@ -20,7 +20,7 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" -static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, +static inline void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r; (void)left; @@ -31,7 +31,7 @@ static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } } -static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, +static inline void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r; (void)above; @@ -42,9 +42,9 @@ static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } } -static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; } +static inline int abs_diff(int a, int b) { return (a > b) ? 
a - b : b - a; } -static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top, +static inline uint16_t paeth_predictor_single(uint16_t left, uint16_t top, uint16_t top_left) { const int base = top + left - top_left; const int p_left = abs_diff(base, left); @@ -57,7 +57,7 @@ static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top, : top_left; } -static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, +static inline void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r, c; @@ -81,7 +81,7 @@ static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, #define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) -static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, +static inline void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel @@ -112,7 +112,7 @@ static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } } -static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, +static inline void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel @@ -141,7 +141,7 @@ static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } } -static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, +static inline void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel @@ -170,7 +170,7 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } } -static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, +static inline void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r; @@ -183,7 +183,7 @@ static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } } -static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw, +static inline void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; @@ -198,7 +198,7 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } } -static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw, +static inline void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; @@ -213,7 +213,7 @@ static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } } -static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, +static inline void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; const int count = bw + bh; @@ -233,7 +233,7 @@ static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } } -static INLINE int divide_using_multiply_shift(int num, int shift1, +static inline int divide_using_multiply_shift(int num, int shift1, int multiplier, int shift2) { const int interm = num >> shift1; return interm * multiplier >> shift2; @@ -262,7 +262,7 @@ static 
INLINE int divide_using_multiply_shift(int num, int shift1, #define DC_SHIFT2 16 -static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw, +static inline void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int shift1, int multiplier) { @@ -362,7 +362,7 @@ void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride, #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, +static inline void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r; @@ -374,7 +374,7 @@ static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, } } -static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, +static inline void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r; @@ -386,7 +386,7 @@ static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, } } -static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; @@ -400,7 +400,7 @@ static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -433,7 +433,7 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -464,7 +464,7 @@ static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -495,7 +495,7 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -509,7 +509,7 @@ static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -526,7 +526,7 @@ static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -543,7 +543,7 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, +static inline void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, 
int bh, const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; @@ -579,7 +579,7 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, #define HIGHBD_DC_SHIFT2 17 -static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride, +static inline void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd, diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c index 3b8352aff0..8481839136 100644 --- a/aom_dsp/loopfilter.c +++ b/aom_dsp/loopfilter.c @@ -17,12 +17,12 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" -static INLINE int8_t signed_char_clamp(int t) { +static inline int8_t signed_char_clamp(int t) { return (int8_t)clamp(t, -128, 127); } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE int16_t signed_char_clamp_high(int t, int bd) { +static inline int16_t signed_char_clamp_high(int t, int bd) { switch (bd) { case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); @@ -33,7 +33,7 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) { #endif // should we apply any filter at all: 11111111 yes, 00000000 no -static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, +static inline int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { int8_t mask = 0; mask |= (abs(p1 - p0) > limit) * -1; @@ -42,7 +42,7 @@ static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, return ~mask; } -static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, +static inline int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { int8_t mask = 0; @@ -56,7 +56,7 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, return ~mask; } -static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit, +static inline int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2) { int8_t mask = 0; @@ -68,7 +68,7 @@ static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit, return ~mask; } -static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, +static inline int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2) { int8_t mask = 0; @@ -79,7 +79,7 @@ static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, return ~mask; } -static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, +static inline int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { int8_t mask = 0; @@ -93,7 +93,7 @@ static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, } // is there high edge variance internal edge: 11111111 yes, 00000000 no -static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, +static inline int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { int8_t hev = 0; hev |= (abs(p1 - p0) > thresh) * -1; @@ -101,7 +101,7 @@ static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, return hev; } -static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, +static inline void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, uint8_t *op0, uint8_t 
*oq0, uint8_t *oq1) { int8_t filter1, filter2; @@ -199,7 +199,7 @@ void aom_lpf_vertical_4_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, aom_lpf_vertical_4_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0); } -static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, +static inline void filter6(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) { if (flat && mask) { @@ -216,7 +216,7 @@ static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, } } -static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, +static inline void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3) { @@ -375,7 +375,7 @@ void aom_lpf_vertical_8_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, aom_lpf_vertical_8_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0); } -static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, +static inline void filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, @@ -514,7 +514,7 @@ void aom_lpf_vertical_14_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, #if CONFIG_AV1_HIGHBITDEPTH // Should we apply any filter at all: 11111111 yes, 00000000 no ? -static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, +static inline int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, int bd) { int8_t mask = 0; @@ -527,7 +527,7 @@ static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, } // Should we apply any filter at all: 11111111 yes, 00000000 no ? -static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, +static inline int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, int bd) { @@ -544,7 +544,7 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, return ~mask; } -static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, +static inline int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, @@ -560,7 +560,7 @@ static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, return ~mask; } -static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, +static inline int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, int bd) { @@ -573,7 +573,7 @@ static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, return ~mask; } -static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, +static inline int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, int bd) { @@ -590,7 +590,7 @@ static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, // Is there high edge variance internal edge: // 11111111_11111111 yes, 00000000_00000000 no ? 
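A side note on the mask convention flagged in the comment above: these filters build their yes/no masks branchlessly, multiplying a 0-or-1 comparison result by -1 so that "yes" becomes an all-ones word, as in the `hev |= (abs(p1 - p0) > thresh) * -1;` line earlier in this file. A tiny self-contained sketch of the idiom (cond_mask16 is an illustrative name):

#include <assert.h>
#include <stdint.h>

// (cond) * -1 maps 1 -> -1 (all bits set) and 0 -> 0 (no bits set),
// yielding a branch-free select mask.
static int16_t cond_mask16(int cond) { return (int16_t)(cond * -1); }

int main(void) {
  assert(cond_mask16(1) == -1);  // 0xFFFF: apply the filter
  assert(cond_mask16(0) == 0);   // 0x0000: leave pixels untouched
  return 0;
}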
-static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, +static inline int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, int bd) { int16_t hev = 0; int16_t thresh16 = (uint16_t)thresh << (bd - 8); @@ -599,7 +599,7 @@ static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, return hev; } -static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, +static inline void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, int bd) { int16_t filter1, filter2; @@ -689,7 +689,7 @@ void aom_highbd_lpf_vertical_4_dual_c( bd); } -static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, +static inline void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, int bd) { @@ -707,7 +707,7 @@ static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, } } -static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, +static inline void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { @@ -841,7 +841,7 @@ void aom_highbd_lpf_vertical_8_dual_c( bd); } -static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, +static inline void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, diff --git a/aom_dsp/mathutils.h b/aom_dsp/mathutils.h index 45ea6d9b14..746585d6aa 100644 --- a/aom_dsp/mathutils.h +++ b/aom_dsp/mathutils.h @@ -21,7 +21,7 @@ static const double TINY_NEAR_ZERO = 1.0E-16; // Solves Ax = b, where x and b are column vectors of size nx1 and A is nxn -static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) { +static inline int linsolve(int n, double *A, int stride, double *b, double *x) { int i, j, k; double c; // Forward elimination @@ -81,7 +81,7 @@ static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) { // * a, b are the coefficients of each individual equation, // * x is the result vector // * and n is the problem size -static INLINE void least_squares_init(double *mat, double *y, int n) { +static inline void least_squares_init(double *mat, double *y, int n) { memset(mat, 0, n * n * sizeof(double)); memset(y, 0, n * sizeof(double)); } @@ -92,7 +92,7 @@ static AOM_FORCE_INLINE int iroundpf(float x) { return (int)(x + 0.5f); } -static INLINE void least_squares_accumulate(double *mat, double *y, +static inline void least_squares_accumulate(double *mat, double *y, const double *a, double b, int n) { for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { @@ -104,13 +104,13 @@ static INLINE void least_squares_accumulate(double *mat, double *y, } } -static INLINE int least_squares_solve(double *mat, double *y, double *x, +static inline int least_squares_solve(double *mat, double *y, double *x, int n) { return linsolve(n, mat, n, y, x); } // Matrix multiply -static INLINE void multiply_mat(const double *m1, const double *m2, double *res, +static inline void multiply_mat(const double *m1, const double *m2, double *res, const int m1_rows, const int inner_dim, const int m2_cols) { double sum; diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c index 
b01861d765..ac2cc22db2 100644 --- a/aom_dsp/noise_model.c +++ b/aom_dsp/noise_model.c @@ -46,7 +46,7 @@ static const int kMaxLag = 4; GET_BLOCK_MEAN(uint8_t, lowbd) GET_BLOCK_MEAN(uint16_t, highbd) -static INLINE double get_block_mean(const uint8_t *data, int w, int h, +static inline double get_block_mean(const uint8_t *data, int w, int h, int stride, int x_o, int y_o, int block_size, int use_highbd) { if (use_highbd) @@ -80,7 +80,7 @@ static INLINE double get_block_mean(const uint8_t *data, int w, int h, GET_NOISE_VAR(uint8_t, lowbd) GET_NOISE_VAR(uint16_t, highbd) -static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised, +static inline double get_noise_var(const uint8_t *data, const uint8_t *denoised, int w, int h, int stride, int x_o, int y_o, int block_size_x, int block_size_y, int use_highbd) { diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h index 37d8042b75..bb2c4a9757 100644 --- a/aom_dsp/prob.h +++ b/aom_dsp/prob.h @@ -97,7 +97,7 @@ typedef uint16_t aom_cdf_prob; AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \ AOM_ICDF(CDF_PROB_TOP), 0 -static INLINE uint8_t get_prob(unsigned int num, unsigned int den) { +static inline uint8_t get_prob(unsigned int num, unsigned int den) { assert(den != 0); { const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); @@ -107,7 +107,7 @@ static INLINE uint8_t get_prob(unsigned int num, unsigned int den) { } } -static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) { +static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) { assert(nsymbs < 17); const int count = cdf[nsymbs]; diff --git a/aom_dsp/pyramid.c b/aom_dsp/pyramid.c index e056166d87..96a20ba4ff 100644 --- a/aom_dsp/pyramid.c +++ b/aom_dsp/pyramid.c @@ -217,7 +217,7 @@ ImagePyramid *aom_alloc_pyramid(int width, int height, bool image_is_16bit) { // This must be called after the main image area is filled out. // `img_buf` should point to the first pixel in the image area, // ie. it should be pyr->level_buffer + pyr->level_loc[level]. -static INLINE void fill_border(uint8_t *img_buf, const int width, +static inline void fill_border(uint8_t *img_buf, const int width, const int height, const int stride) { // Fill left and right areas for (int row = 0; row < height; row++) { @@ -254,7 +254,7 @@ static INLINE void fill_border(uint8_t *img_buf, const int width, // or -1 on error. 
// // This must only be called while holding frame_pyr->mutex -static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, +static inline int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, int n_levels, ImagePyramid *frame_pyr) { int already_filled_levels = frame_pyr->filled_levels; diff --git a/aom_dsp/recenter.h b/aom_dsp/recenter.h index f36c9cba87..0703cb0dfe 100644 --- a/aom_dsp/recenter.h +++ b/aom_dsp/recenter.h @@ -17,7 +17,7 @@ #include "aom/aom_integer.h" // Inverse recenters a non-negative literal v around a reference r -static INLINE uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) { +static inline uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) { if (v > (r << 1)) return v; else if ((v & 1) == 0) @@ -28,7 +28,7 @@ static INLINE uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) { // Inverse recenters a non-negative literal v in [0, n-1] around a // reference r also in [0, n-1] -static INLINE uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, +static inline uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { if ((r << 1) <= n) { return inv_recenter_nonneg(r, v); @@ -38,7 +38,7 @@ static INLINE uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, } // Recenters a non-negative literal v around a reference r -static INLINE uint16_t recenter_nonneg(uint16_t r, uint16_t v) { +static inline uint16_t recenter_nonneg(uint16_t r, uint16_t v) { if (v > (r << 1)) return v; else if (v >= r) @@ -49,7 +49,7 @@ static INLINE uint16_t recenter_nonneg(uint16_t r, uint16_t v) { // Recenters a non-negative literal v in [0, n-1] around a // reference r also in [0, n-1] -static INLINE uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, +static inline uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { if ((r << 1) <= n) { return recenter_nonneg(r, v); diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c index 72ed758370..da8f077dcb 100644 --- a/aom_dsp/sad.c +++ b/aom_dsp/sad.c @@ -19,7 +19,7 @@ #include "aom_dsp/blend.h" /* Sum the difference between every corresponding element of the buffers. 
*/ -static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, +static inline unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int y, x; unsigned int sad = 0; @@ -191,7 +191,7 @@ SAD_MXNX3D(64, 16) #endif // !CONFIG_REALTIME_ONLY #if CONFIG_AV1_HIGHBITDEPTH -static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, +static inline unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int y, x; @@ -209,7 +209,7 @@ static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, return sad; } -static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, +static inline unsigned int highbd_sadb(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int y, x; diff --git a/aom_dsp/sad_av1.c b/aom_dsp/sad_av1.c index 80d7c3fee4..f016f56a9f 100644 --- a/aom_dsp/sad_av1.c +++ b/aom_dsp/sad_av1.c @@ -18,7 +18,7 @@ #include "aom_ports/mem.h" #include "aom_dsp/blend.h" -static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride, +static inline unsigned int masked_sad(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int width, @@ -94,12 +94,11 @@ MASKSADMxN(64, 16) /* clang-format on */ #if CONFIG_AV1_HIGHBITDEPTH - static INLINE - unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride, - const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, int width, - int height) { + static inline unsigned int highbd_masked_sad( + const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, const uint8_t *m, int m_stride, + int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); @@ -164,7 +163,7 @@ HIGHBD_MASKSADMXN(64, 16) // pre: predictor being evaluated // wsrc: target weighted prediction (has been *4096 to keep precision) // mask: 2d weights (scaled by 4096) -static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, +static inline unsigned int obmc_sad(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height) { int y, x; @@ -215,10 +214,10 @@ OBMCSADMxN(64, 16) /* clang-format on */ #if CONFIG_AV1_HIGHBITDEPTH - static INLINE - unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, const int32_t *mask, - int width, int height) { + static inline unsigned int highbd_obmc_sad( + const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); diff --git a/aom_dsp/txfm_common.h b/aom_dsp/txfm_common.h index 7152732bef..1c675d5aab 100644 --- a/aom_dsp/txfm_common.h +++ b/aom_dsp/txfm_common.h @@ -147,7 +147,7 @@ static const tran_high_t sinpi_4_9 = 15212; static const tran_high_t Sqrt2 = 23170; static const tran_high_t InvSqrt2 = 11585; -static INLINE tran_high_t fdct_round_shift(tran_high_t input) { +static inline tran_high_t fdct_round_shift(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); return rv; } diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c index 7d64b6300f..e1cc07af64 100644 --- a/aom_dsp/variance.c +++ b/aom_dsp/variance.c @@ -889,7 +889,7 @@ HIGHBD_MASK_SUBPIX_VAR(64, 16) #endif // CONFIG_AV1_HIGHBITDEPTH 
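A note on the OBMC helpers in the hunks around here: per the comments above, wsrc holds the target prediction scaled by 4096 and mask holds 2d weights scaled by 4096, so wsrc[x] - pre[x] * mask[x] is a residual in 1/4096ths of a pixel, and a rounding shift by 12 restores pixel units. A scalar sketch of that accumulation under those assumptions (obmc_sad_sketch is an illustrative name, not the library's loop; row-major storage of the weighted buffers is assumed):

#include <stdint.h>

// wsrc is the target * 4096 and mask holds weights * 4096, so the difference
// below is in 1/4096ths of a pixel; the rounding shift by 12 returns the
// accumulated SAD to pixel units.
static unsigned int obmc_sad_sketch(const uint8_t *pre, int pre_stride,
                                    const int32_t *wsrc, const int32_t *mask,
                                    int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      const int32_t diff = wsrc[x] - pre[x] * mask[x];
      const int32_t adiff = diff < 0 ? -diff : diff;
      sad += (unsigned int)((adiff + (1 << 11)) >> 12);
    }
    pre += pre_stride;
    wsrc += width;  // assumed row-major, width elements per row
    mask += width;
  }
  return sad;
}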
#if !CONFIG_REALTIME_ONLY -static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, +static inline void obmc_variance(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int i, j; @@ -996,7 +996,7 @@ OBMC_VAR(64, 16) OBMC_SUBPIX_VAR(64, 16) #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, +static inline void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, uint64_t *sse, int64_t *sum) { @@ -1019,7 +1019,7 @@ static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, } } -static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, +static inline void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { @@ -1030,7 +1030,7 @@ static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)sse64; } -static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, +static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { @@ -1041,7 +1041,7 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } -static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, +static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { diff --git a/aom_dsp/x86/adaptive_quantize_avx2.c b/aom_dsp/x86/adaptive_quantize_avx2.c index b93e12c184..a66aed3cdd 100644 --- a/aom_dsp/x86/adaptive_quantize_avx2.c +++ b/aom_dsp/x86/adaptive_quantize_avx2.c @@ -15,7 +15,7 @@ #include "aom_dsp/quantize.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, +static inline void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, __m256i *quant, const int16_t *dequant_ptr, @@ -36,13 +36,13 @@ static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, *shift = _mm256_permute4x64_epi64(*shift, 0x54); } -static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { +static inline __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr)); const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); return _mm256_packs_epi32(coeff1, coeff2); } -static INLINE void update_mask1_avx2(__m256i *cmp_mask, +static inline void update_mask1_avx2(__m256i *cmp_mask, const int16_t *iscan_ptr, int *is_found, __m256i *mask) { __m256i temp_mask = _mm256_setzero_si256(); @@ -54,7 +54,7 @@ static INLINE void update_mask1_avx2(__m256i *cmp_mask, *mask = _mm256_max_epi16(temp_mask, *mask); } -static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold, +static inline void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold, const int16_t *iscan_ptr, int *is_found, __m256i *mask) { __m256i zero = _mm256_setzero_si256(); @@ -70,7 +70,7 @@ static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold, update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); } -static INLINE void 
calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, +static inline void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, const __m256i *quant, const __m256i *shift) { __m256i tmp, qcoeff; @@ -80,11 +80,11 @@ static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, *coeff = _mm256_mulhi_epi16(qcoeff, *shift); } -static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) { +static inline __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) { return _mm256_mullo_epi16(qcoeff, dequant); } -static INLINE void store_coefficients_avx2(__m256i coeff_vals, +static inline void store_coefficients_avx2(__m256i coeff_vals, tran_low_t *coeff_ptr) { __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); diff --git a/aom_dsp/x86/aom_convolve_copy_avx2.c b/aom_dsp/x86/aom_convolve_copy_avx2.c index 5b90b104a8..efce154252 100644 --- a/aom_dsp/x86/aom_convolve_copy_avx2.c +++ b/aom_dsp/x86/aom_convolve_copy_avx2.c @@ -13,7 +13,7 @@ #include "config/aom_dsp_rtcd.h" -static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { +static inline void copy_128(const uint8_t *src, uint8_t *dst) { __m256i s[4]; s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); @@ -125,7 +125,7 @@ void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) { +static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) { __m256i s[4]; s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); @@ -137,7 +137,7 @@ static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) { _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); } -static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) { +static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) { __m256i s[8]; s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.c b/aom_dsp/x86/aom_convolve_copy_sse2.c index c4121705c2..223b404f62 100644 --- a/aom_dsp/x86/aom_convolve_copy_sse2.c +++ b/aom_dsp/x86/aom_convolve_copy_sse2.c @@ -13,7 +13,7 @@ #include "config/aom_dsp_rtcd.h" -static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { +static inline void copy_128(const uint8_t *src, uint8_t *dst) { __m128i s[8]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); @@ -144,7 +144,7 @@ void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) { +static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) { __m128i s[8]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); @@ -164,7 +164,7 @@ static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) { _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); } -static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) { +static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) { __m128i s[16]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); diff --git a/aom_dsp/x86/aom_quantize_avx.c b/aom_dsp/x86/aom_quantize_avx.c index 
1b6ea48c8f..e7f2344370 100644 --- a/aom_dsp/x86/aom_quantize_avx.c +++ b/aom_dsp/x86/aom_quantize_avx.c @@ -16,7 +16,7 @@ #include "aom_dsp/x86/bitdepth_conversion_sse2.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, +static inline void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, tran_low_t *dqcoeff) { const __m128i low = _mm_mullo_epi16(qcoeff, dequant); const __m128i high = _mm_mulhi_epi16(qcoeff, dequant); diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index 0c4c537a50..e8e94a42c9 100644 --- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -42,27 +42,27 @@ #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ -static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, +static inline void xx_storeu2_epi32(const uint8_t *output_ptr, const ptrdiff_t stride, const __m256i *a) { *((int *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); *((int *)(output_ptr + stride)) = _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); } -static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) { +static inline __m256i xx_loadu2_epi64(const void *hi, const void *lo) { __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); return a; } -static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, +static inline void xx_storeu2_epi64(const uint8_t *output_ptr, const ptrdiff_t stride, const __m256i *a) { _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); _mm_storel_epi64((__m128i *)(output_ptr + stride), _mm256_extractf128_si256(*a, 1)); } -static INLINE void xx_store2_mi128(const uint8_t *output_ptr, +static inline void xx_store2_mi128(const uint8_t *output_ptr, const ptrdiff_t stride, const __m256i *a) { _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); _mm_store_si128((__m128i *)(output_ptr + stride), diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index 7bc88ebf5f..9978d23258 100644 --- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -485,7 +485,7 @@ static void aom_filter_block1d16_v4_ssse3( } } -static INLINE __m128i shuffle_filter_convolve8_8_ssse3( +static inline __m128i shuffle_filter_convolve8_8_ssse3( const __m128i *const s, const int16_t *const filter) { __m128i f[4]; shuffle_filter_ssse3(filter, f); diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c index ba6de96d24..000aea945c 100644 --- a/aom_dsp/x86/avg_intrin_avx2.c +++ b/aom_dsp/x86/avg_intrin_avx2.c @@ -17,7 +17,7 @@ #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_ports/mem.h" -static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, +static inline void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, __m256i *out_lo, __m256i *out_hi) { const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); @@ -141,7 +141,7 @@ void aom_hadamard_lp_8x8_dual_avx2(const int16_t *src_diff, _mm256_permute2x128_si256(src[6], src[7], 0x31)); } -static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, +static inline void hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff, int is_final) { DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); @@ -620,7 +620,7 @@ void 
aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref, } } -static INLINE void load_from_src_buf(const uint8_t *ref1, __m256i *src, +static inline void load_from_src_buf(const uint8_t *ref1, __m256i *src, const int stride) { src[0] = _mm256_loadu_si256((const __m256i *)ref1); src[1] = _mm256_loadu_si256((const __m256i *)(ref1 + stride)); @@ -642,7 +642,7 @@ static INLINE void load_from_src_buf(const uint8_t *ref1, __m256i *src, _mm_add_epi16(results0, _mm_srli_si128(results0, 8)); \ _mm_storel_epi64((__m128i *)vbuf, _mm_srli_epi16(results1, norm_factor)); -static INLINE void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref, +static inline void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int height, int norm_factor) { diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c index f7b133c0c2..0c1c35e9db 100644 --- a/aom_dsp/x86/avg_intrin_sse2.c +++ b/aom_dsp/x86/avg_intrin_sse2.c @@ -18,7 +18,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" -static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, +static inline void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, __m128i *out_lo, __m128i *out_hi) { const __m128i sign_bits = _mm_cmplt_epi16(in, zero); @@ -26,7 +26,7 @@ static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, *out_hi = _mm_unpackhi_epi16(in, sign_bits); } -static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { +static inline __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); return _mm_sub_epi32(a, sign); } @@ -181,7 +181,7 @@ unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { return (avg + 8) >> 4; } -static INLINE void hadamard_col4_sse2(__m128i *in, int iter) { +static inline void hadamard_col4_sse2(__m128i *in, int iter) { const __m128i a0 = in[0]; const __m128i a1 = in[1]; const __m128i a2 = in[2]; @@ -223,7 +223,7 @@ void aom_hadamard_4x4_sse2(const int16_t *src_diff, ptrdiff_t src_stride, store_tran_low(_mm_unpacklo_epi64(src[2], src[3]), coeff); } -static INLINE void hadamard_col8_sse2(__m128i *in, int iter) { +static inline void hadamard_col8_sse2(__m128i *in, int iter) { __m128i a0 = in[0]; __m128i a1 = in[1]; __m128i a2 = in[2]; @@ -299,7 +299,7 @@ static INLINE void hadamard_col8_sse2(__m128i *in, int iter) { } } -static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, +static inline void hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff, int is_final) { __m128i src[8]; @@ -356,7 +356,7 @@ void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); } -static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff, +static inline void hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { __m128i src[8]; src[0] = _mm_load_si128((const __m128i *)src_diff); @@ -439,7 +439,7 @@ void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, } } -static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, +static inline void hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff, int is_final) { // For high bitdepths, it is unnecessary to store_tran_low diff --git a/aom_dsp/x86/bitdepth_conversion_avx2.h b/aom_dsp/x86/bitdepth_conversion_avx2.h index 9b2b2b01ee..851f0be175 100644 --- a/aom_dsp/x86/bitdepth_conversion_avx2.h +++ b/aom_dsp/x86/bitdepth_conversion_avx2.h @@ -15,13 +15,13 @@ #include 
"aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" -static INLINE __m256i load_tran_low(const tran_low_t *a) { +static inline __m256i load_tran_low(const tran_low_t *a) { const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); return _mm256_packs_epi32(a_low, a_high); } -static INLINE void store_tran_low(__m256i a, tran_low_t *b) { +static inline void store_tran_low(__m256i a, tran_low_t *b) { const __m256i one = _mm256_set1_epi16(1); const __m256i a_hi = _mm256_mulhi_epi16(a, one); const __m256i a_lo = _mm256_mullo_epi16(a, one); diff --git a/aom_dsp/x86/bitdepth_conversion_sse2.h b/aom_dsp/x86/bitdepth_conversion_sse2.h index 7b634b2839..b6989316d9 100644 --- a/aom_dsp/x86/bitdepth_conversion_sse2.h +++ b/aom_dsp/x86/bitdepth_conversion_sse2.h @@ -17,12 +17,12 @@ // Load 8 16 bit values. If the source is 32 bits then pack down with // saturation. -static INLINE __m128i load_tran_low(const tran_low_t *a) { +static inline __m128i load_tran_low(const tran_low_t *a) { const __m128i a_low = _mm_load_si128((const __m128i *)a); return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); } -static INLINE void unpack_trans(__m128i a, __m128i *a_1, __m128i *a_2) { +static inline void unpack_trans(__m128i a, __m128i *a_1, __m128i *a_2) { const __m128i one = _mm_set1_epi16(1); const __m128i a_hi = _mm_mulhi_epi16(a, one); const __m128i a_lo = _mm_mullo_epi16(a, one); @@ -32,7 +32,7 @@ static INLINE void unpack_trans(__m128i a, __m128i *a_1, __m128i *a_2) { // Store 8 16 bit values. If the destination is 32 bits then sign extend the // values by multiplying by 1. -static INLINE void store_tran_low(__m128i a, tran_low_t *b) { +static inline void store_tran_low(__m128i a, tran_low_t *b) { __m128i a_1, a_2; unpack_trans(a, &a_1, &a_2); _mm_store_si128((__m128i *)(b), a_1); @@ -41,7 +41,7 @@ static INLINE void store_tran_low(__m128i a, tran_low_t *b) { // Stores the second result at an offset of 8 (instead of 4) to match the output // with that of AVX2 implementation and the function is similar to // store_tran_low(). 
-static INLINE void store_tran_low_offset_4(__m128i a, tran_low_t *b) { +static inline void store_tran_low_offset_4(__m128i a, tran_low_t *b) { __m128i a_1, a_2; unpack_trans(a, &a_1, &a_2); _mm_store_si128((__m128i *)(b), a_1); diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c index 2b7fe838d6..9371237b5a 100644 --- a/aom_dsp/x86/blend_a64_mask_avx2.c +++ b/aom_dsp/x86/blend_a64_mask_avx2.c @@ -25,7 +25,7 @@ #include "config/aom_dsp_rtcd.h" -static INLINE void blend_a64_d16_mask_w16_avx2( +static inline void blend_a64_d16_mask_w16_avx2( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval, int shift) { @@ -46,7 +46,7 @@ static INLINE void blend_a64_d16_mask_w16_avx2( _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res)); } -static INLINE void blend_a64_d16_mask_w32_avx2( +static inline void blend_a64_d16_mask_w32_avx2( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset, const __m256i *v_maxval, int shift) { @@ -79,7 +79,7 @@ static INLINE void blend_a64_d16_mask_w32_avx2( _mm256_storeu_si256((__m256i *)(dst), res); } -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( +static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -98,7 +98,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( } } -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( +static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -120,7 +120,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( } } -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( +static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -145,7 +145,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( } } -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( +static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -177,7 +177,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( } } -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( +static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -201,7 +201,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( } } -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( +static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, 
uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -228,7 +228,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( } } -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( +static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -253,7 +253,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( } } -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( +static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -401,7 +401,7 @@ void aom_lowbd_blend_a64_d16_mask_avx2( } } -static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, +static inline __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, const __m256i *v_m0_b, const __m256i *v_m1_b, const int32_t bits) { @@ -420,7 +420,7 @@ static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, return v_res; } -static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, +static inline __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, const __m256i *v_m0_b, const __m256i *v_m1_b, const int32_t bits) { @@ -440,7 +440,7 @@ static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, return v_res; } -static INLINE void blend_a64_mask_sx_sy_w16_avx2( +static inline void blend_a64_mask_sx_sy_w16_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h) { @@ -470,7 +470,7 @@ static INLINE void blend_a64_mask_sx_sy_w16_avx2( } while (--h); } -static INLINE void blend_a64_mask_sx_sy_w32n_avx2( +static inline void blend_a64_mask_sx_sy_w32n_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { @@ -512,7 +512,7 @@ static INLINE void blend_a64_mask_sx_sy_w32n_avx2( } while (--h); } -static INLINE void blend_a64_mask_sx_sy_avx2( +static inline void blend_a64_mask_sx_sy_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { @@ -577,7 +577,7 @@ static INLINE void blend_a64_mask_sx_sy_avx2( } } -static INLINE void blend_a64_mask_sx_w16_avx2( +static inline void blend_a64_mask_sx_w16_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h) { @@ -603,7 +603,7 @@ static INLINE void blend_a64_mask_sx_w16_avx2( } while (--h); } -static INLINE void blend_a64_mask_sx_w32n_avx2( +static inline void blend_a64_mask_sx_w32n_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { @@ -637,7 +637,7 @@ static INLINE void blend_a64_mask_sx_w32n_avx2( } while (--h); } -static INLINE void blend_a64_mask_sx_avx2( +static inline void blend_a64_mask_sx_avx2( 
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { @@ -694,7 +694,7 @@ static INLINE void blend_a64_mask_sx_avx2( } } -static INLINE void blend_a64_mask_sy_w16_avx2( +static inline void blend_a64_mask_sy_w16_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h) { @@ -716,7 +716,7 @@ static INLINE void blend_a64_mask_sy_w16_avx2( } while (--h); } -static INLINE void blend_a64_mask_sy_w32n_avx2( +static inline void blend_a64_mask_sy_w32n_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { @@ -740,7 +740,7 @@ static INLINE void blend_a64_mask_sy_w32n_avx2( } while (--h); } -static INLINE void blend_a64_mask_sy_avx2( +static inline void blend_a64_mask_sy_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { @@ -789,7 +789,7 @@ static INLINE void blend_a64_mask_sy_avx2( } } -static INLINE void blend_a64_mask_w32n_avx2( +static inline void blend_a64_mask_w32n_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { @@ -812,7 +812,7 @@ static INLINE void blend_a64_mask_w32n_avx2( } while (--h); } -static INLINE void blend_a64_mask_avx2( +static inline void blend_a64_mask_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { @@ -904,7 +904,7 @@ void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, // aom_highbd_blend_a64_d16_mask_avx2() ////////////////////////////////////////////////////////////////////////////// -static INLINE void highbd_blend_a64_d16_mask_w4_avx2( +static inline void highbd_blend_a64_d16_mask_w4_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0, const __m256i *round_offset, int shift, const __m256i *clip_low, @@ -954,7 +954,7 @@ static INLINE void highbd_blend_a64_d16_mask_w4_avx2( xx_storel_64(dst + 0 * dst_stride, clipl); } -static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( +static inline void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -979,7 +979,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( } while (h -= 4); } -static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( +static inline void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -1017,7 +1017,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( } while (h -= 4); } -static INLINE void highbd_blend_a64_d16_mask_w8_avx2( +static inline void highbd_blend_a64_d16_mask_w8_avx2( uint16_t *dst, int dst_stride, const 
CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, const __m256i *mask0b, const __m256i *round_offset, int shift, @@ -1090,7 +1090,7 @@ static INLINE void highbd_blend_a64_d16_mask_w8_avx2( yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb); } -static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( +static inline void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, int mask_stride, int h, const __m256i *round_offset, int shift, @@ -1117,7 +1117,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( } while (h -= 4); } -static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( +static inline void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, int mask_stride, int h, const __m256i *round_offset, int shift, @@ -1157,7 +1157,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( } while (h -= 4); } -static INLINE void highbd_blend_a64_d16_mask_w16_avx2( +static inline void highbd_blend_a64_d16_mask_w16_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, const __m256i *mask0b, const __m256i *round_offset, int shift, @@ -1227,7 +1227,7 @@ static INLINE void highbd_blend_a64_d16_mask_w16_avx2( yy_storeu_256(dst + dst_stride, clipb); } -static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( +static inline void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, int mask_stride, int h, int w, const __m256i *round_offset, int shift, @@ -1252,7 +1252,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( } } -static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( +static inline void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, int mask_stride, int h, int w, const __m256i *round_offset, int shift, diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c index df0fada68b..165ca1dc21 100644 --- a/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/aom_dsp/x86/blend_a64_mask_sse4.c @@ -428,7 +428,7 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, // No sub-sampling ////////////////////////////////////////////////////////////////////////////// -static INLINE void blend_a64_mask_bn_w4_sse4_1( +static inline void blend_a64_mask_bn_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { @@ -468,7 +468,7 @@ static void blend_a64_mask_b12_w4_sse4_1( src1_stride, mask, mask_stride, h, blend_4_b12); } -static INLINE void blend_a64_mask_bn_w8n_sse4_1( +static inline void blend_a64_mask_bn_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, @@ -515,7 +515,7 @@ static void 
blend_a64_mask_b12_w8n_sse4_1( // Horizontal sub-sampling ////////////////////////////////////////////////////////////////////////////// -static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( +static inline void blend_a64_mask_bn_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { @@ -561,7 +561,7 @@ static void blend_a64_mask_b12_sx_w4_sse4_1( blend_4_b12); } -static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( +static inline void blend_a64_mask_bn_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, @@ -612,7 +612,7 @@ static void blend_a64_mask_b12_sx_w8n_sse4_1( // Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// -static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( +static inline void blend_a64_mask_bn_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { @@ -657,7 +657,7 @@ static void blend_a64_mask_b12_sy_w4_sse4_1( blend_4_b12); } -static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( +static inline void blend_a64_mask_bn_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, @@ -707,7 +707,7 @@ static void blend_a64_mask_b12_sy_w8n_sse4_1( // Horizontal and Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// -static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( +static inline void blend_a64_mask_bn_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { @@ -758,7 +758,7 @@ static void blend_a64_mask_b12_sx_sy_w4_sse4_1( blend_4_b12); } -static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( +static inline void blend_a64_mask_bn_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, @@ -872,7 +872,7 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, } #endif // CONFIG_AV1_HIGHBITDEPTH -static INLINE void blend_a64_d16_mask_w16_sse41( +static inline void blend_a64_d16_mask_w16_sse41( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, const __m128i *v_maxval, int shift) { @@ -901,7 +901,7 @@ static INLINE void blend_a64_d16_mask_w16_sse41( _mm_storeu_si128((__m128i *)(dst), res); } -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( +static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -923,7 +923,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( } } -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( +static inline void 
lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -955,7 +955,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( } } -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( +static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -982,7 +982,7 @@ static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( } } -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( +static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -1113,7 +1113,7 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1( // aom_highbd_blend_a64_d16_mask_sse4_1() ////////////////////////////////////////////////////////////////////////////// #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1( +static inline void highbd_blend_a64_d16_mask_w4_sse4_1( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, const __m128i *mask0b, const __m128i *round_offset, int shift, @@ -1178,7 +1178,7 @@ static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1( xx_storel_64(dst + 3 * dst_stride, clipb); } -static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( +static inline void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -1204,7 +1204,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( } while (h -= 4); } -static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( +static inline void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -1240,7 +1240,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( } while (h -= 4); } -static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1( +static inline void highbd_blend_a64_d16_mask_w8_sse4_1( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, const __m128i *mask0b, const __m128i *round_offset, int shift, @@ -1303,7 +1303,7 @@ static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1( xx_storeu_128(dst + dst_stride, clipb); } -static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( +static inline void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -1323,7 +1323,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( } while (h -= 2); } -static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( +static inline 
void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -1354,7 +1354,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( } while (h -= 2); } -static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1( +static inline void highbd_blend_a64_d16_mask_w16_sse4_1( uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m128i *round_offset, int shift, const __m128i *mask0l, const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high, @@ -1419,7 +1419,7 @@ static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1( xx_storeu_128(dst + 8, cliph); } -static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( +static inline void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, @@ -1443,7 +1443,7 @@ static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( } } -static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( +static inline void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, diff --git a/aom_dsp/x86/blend_a64_vmask_sse4.c b/aom_dsp/x86/blend_a64_vmask_sse4.c index 484d3d08e4..53a38bf774 100644 --- a/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -148,7 +148,7 @@ void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, // Implementation - No sub-sampling ////////////////////////////////////////////////////////////////////////////// -static INLINE void blend_a64_vmask_bn_w4_sse4_1( +static inline void blend_a64_vmask_bn_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, blend_unit_fn blend) { @@ -191,7 +191,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, src1_stride, mask, h, blend_4_b12); } -static INLINE void blend_a64_vmask_bn_w8n_sse4_1( +static inline void blend_a64_vmask_bn_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, blend_unit_fn blend) { diff --git a/aom_dsp/x86/blend_mask_sse4.h b/aom_dsp/x86/blend_mask_sse4.h index e7b160e41a..8eb3982b4a 100644 --- a/aom_dsp/x86/blend_mask_sse4.h +++ b/aom_dsp/x86/blend_mask_sse4.h @@ -24,7 +24,7 @@ #include "config/aom_dsp_rtcd.h" -static INLINE void blend_a64_d16_mask_w4_sse41( +static inline void blend_a64_d16_mask_w4_sse41( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, int shift) { @@ -42,7 +42,7 @@ static INLINE void blend_a64_d16_mask_w4_sse41( xx_storel_32(dst, res); } -static INLINE void blend_a64_d16_mask_w8_sse41( +static inline void blend_a64_d16_mask_w8_sse41( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, int shift) { @@ -61,7 +61,7 @@ static INLINE void blend_a64_d16_mask_w8_sse41( 
_mm_storel_epi64((__m128i *)(dst), res); } -static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( +static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -80,7 +80,7 @@ static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( } } -static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( +static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -98,7 +98,7 @@ static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( } } -static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( +static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -123,7 +123,7 @@ static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( } } -static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( +static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -148,7 +148,7 @@ static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( } } -static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( +static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -170,7 +170,7 @@ static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( } } -static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( +static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -191,7 +191,7 @@ static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( src1 += src1_stride; } } -static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( +static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, @@ -213,7 +213,7 @@ static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( } } -static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( +static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, diff --git a/aom_dsp/x86/blend_sse4.h b/aom_dsp/x86/blend_sse4.h index 28e531103e..ac1228b1e2 100644 --- a/aom_dsp/x86/blend_sse4.h +++ b/aom_dsp/x86/blend_sse4.h @@ -23,7 +23,7 @@ static const uint8_t 
g_blend_a64_mask_shuffle[32] = { // Common kernels ////////////////////////////////////////////////////////////////////////////// -static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, +static inline __m128i blend_4(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_32(src0); const __m128i v_s1_b = xx_loadl_32(src1); @@ -38,7 +38,7 @@ static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, return v_res_w; } -static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, +static inline __m128i blend_8(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_64(src0); const __m128i v_s1_b = xx_loadl_64(src1); @@ -55,7 +55,7 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, return v_res_w; } -static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, +static inline __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_b, const __m128i *v_m1_b, const __m128i *rounding) { const __m128i v_s0_b = xx_loadl_32(src0); @@ -69,7 +69,7 @@ static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, return v_res; } -static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, +static inline __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_b, const __m128i *v_m1_b, const __m128i *rounding) { const __m128i v_s0_b = xx_loadl_64(src0); @@ -83,7 +83,7 @@ static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, return v_res; } -static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, +static inline __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_b, const __m128i *v_m1_b, const __m128i *rounding) { const __m128i v_s0_b = xx_loadu_128(src0); @@ -103,7 +103,7 @@ static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w); -static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, +static inline __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w) { const __m128i v_s0_w = xx_loadl_64(src0); const __m128i v_s1_w = xx_loadl_64(src1); @@ -118,7 +118,7 @@ static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, return v_res_w; } -static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, +static inline __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w) { const __m128i v_s0_w = xx_loadu_128(src0); const __m128i v_s1_w = xx_loadu_128(src1); @@ -133,7 +133,7 @@ static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, return v_res_w; } -static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, +static inline __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w) { const __m128i v_s0_w = xx_loadl_64(src0); const __m128i v_s1_w = xx_loadl_64(src1); @@ -158,7 +158,7 @@ static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, return v_res_w; } -static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, +static inline __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i 
v_m1_w) { const __m128i v_s0_w = xx_loadu_128(src0); const __m128i v_s1_w = xx_loadu_128(src1); diff --git a/aom_dsp/x86/blk_sse_sum_avx2.c b/aom_dsp/x86/blk_sse_sum_avx2.c index 7169607c09..b790236883 100644 --- a/aom_dsp/x86/blk_sse_sum_avx2.c +++ b/aom_dsp/x86/blk_sse_sum_avx2.c @@ -13,7 +13,7 @@ #include "config/aom_dsp_rtcd.h" -static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum, +static inline void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum, int *x_sum, int64_t *x2_sum) { __m256i sum_buffer, sse_buffer; __m128i out_buffer; @@ -42,7 +42,7 @@ static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum, #endif } -static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh, +static inline void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { __m128i row1, row2, row3; __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, @@ -82,7 +82,7 @@ static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh, accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); } -static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh, +static inline void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { __m128i load_128bit, load_next_128bit; __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, @@ -117,7 +117,7 @@ static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh, accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); } -static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh, +static inline void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum, int loop_count) { __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, diff --git a/aom_dsp/x86/blk_sse_sum_sse2.c b/aom_dsp/x86/blk_sse_sum_sse2.c index 8b816d6818..cf58a4f5f3 100644 --- a/aom_dsp/x86/blk_sse_sum_sse2.c +++ b/aom_dsp/x86/blk_sse_sum_sse2.c @@ -13,7 +13,7 @@ #include "config/aom_dsp_rtcd.h" -static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh, +static inline void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { const int16_t *data_tmp = data; __m128i temp_buffer1, temp_buffer2; @@ -52,7 +52,7 @@ static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh, #endif } -static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh, +static inline void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum, int loop_cycles) { const int16_t *data_tmp; diff --git a/aom_dsp/x86/common_avx2.h b/aom_dsp/x86/common_avx2.h index 2f40dbbee9..ddfc878e4b 100644 --- a/aom_dsp/x86/common_avx2.h +++ b/aom_dsp/x86/common_avx2.h @@ -17,7 +17,7 @@ #include "config/aom_config.h" // Note: in and out could have the same value -static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { +static inline void mm256_transpose_16x16(const __m256i *in, __m256i *out) { __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h index 6658b2243d..7db8a2da83 100644 --- a/aom_dsp/x86/convolve_avx2.h +++ b/aom_dsp/x86/convolve_avx2.h @@ -607,7 +607,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { } \ } while (0) -static INLINE void prepare_coeffs_lowbd( +static 
inline void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -635,7 +635,7 @@ static INLINE void prepare_coeffs_lowbd( coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); } -static INLINE void prepare_coeffs_6t_lowbd( +static inline void prepare_coeffs_6t_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -661,7 +661,7 @@ static INLINE void prepare_coeffs_6t_lowbd( coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); } -static INLINE void prepare_coeffs_6t( +static inline void prepare_coeffs_6t( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -678,7 +678,7 @@ static INLINE void prepare_coeffs_6t( coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); } -static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, +static inline void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -697,7 +697,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); } -static INLINE void prepare_coeffs_12taps( +static inline void prepare_coeffs_12taps( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -721,7 +721,7 @@ static INLINE void prepare_coeffs_12taps( coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11.. 
10 11 } -static INLINE __m256i convolve_lowbd(const __m256i *const s, +static inline __m256i convolve_lowbd(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); @@ -735,7 +735,7 @@ static INLINE __m256i convolve_lowbd(const __m256i *const s, return res; } -static INLINE __m256i convolve_lowbd_6tap(const __m256i *const s, +static inline __m256i convolve_lowbd_6tap(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); @@ -748,7 +748,7 @@ static INLINE __m256i convolve_lowbd_6tap(const __m256i *const s, return res; } -static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s, +static inline __m256i convolve_lowbd_4tap(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); @@ -759,7 +759,7 @@ static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s, return res; } -static INLINE __m256i convolve_6tap(const __m256i *const s, +static inline __m256i convolve_6tap(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); @@ -770,7 +770,7 @@ static INLINE __m256i convolve_6tap(const __m256i *const s, return res; } -static INLINE __m256i convolve_12taps(const __m256i *const s, +static inline __m256i convolve_12taps(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); @@ -786,7 +786,7 @@ static INLINE __m256i convolve_12taps(const __m256i *const s, return res; } -static INLINE __m256i convolve(const __m256i *const s, +static inline __m256i convolve(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); @@ -799,7 +799,7 @@ static INLINE __m256i convolve(const __m256i *const s, return res; } -static INLINE __m256i convolve_4tap(const __m256i *const s, +static inline __m256i convolve_4tap(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); @@ -808,7 +808,7 @@ static INLINE __m256i convolve_4tap(const __m256i *const s, return res; } -static INLINE __m256i convolve_lowbd_x(const __m256i data, +static inline __m256i convolve_lowbd_x(const __m256i data, const __m256i *const coeffs, const __m256i *const filt) { __m256i s[4]; @@ -821,7 +821,7 @@ static INLINE __m256i convolve_lowbd_x(const __m256i data, return convolve_lowbd(s, coeffs); } -static INLINE __m256i convolve_lowbd_x_6tap(const __m256i data, +static inline __m256i convolve_lowbd_x_6tap(const __m256i data, const __m256i *const coeffs, const __m256i *const filt) { __m256i s[4]; @@ -833,7 +833,7 @@ static INLINE __m256i convolve_lowbd_x_6tap(const __m256i data, return convolve_lowbd_6tap(s, coeffs); } -static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data, +static inline __m256i convolve_lowbd_x_4tap(const __m256i data, const __m256i *const coeffs, const __m256i *const filt) { __m256i s[2]; @@ -844,7 +844,7 @@ static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data, return convolve_lowbd_4tap(s, 
coeffs); } -static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, +static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst, const __m256i *const res, const int do_average) { __m256i d; @@ -858,7 +858,7 @@ static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, _mm256_store_si256((__m256i *)dst, d); } -static INLINE __m256i comp_avg(const __m256i *const data_ref_0, +static inline __m256i comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt, const int use_dist_wtd_comp_avg) { @@ -881,7 +881,7 @@ static INLINE __m256i comp_avg(const __m256i *const data_ref_0, return res; } -static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned, +static inline __m256i convolve_rounding(const __m256i *const res_unsigned, const __m256i *const offset_const, const __m256i *const round_const, const int round_shift) { @@ -891,7 +891,7 @@ static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned, return res_round; } -static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, +static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt0, const __m256i *const wt1, @@ -909,7 +909,7 @@ static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, return res; } -static INLINE __m256i highbd_convolve_rounding( +static inline __m256i highbd_convolve_rounding( const __m256i *const res_unsigned, const __m256i *const offset_const, const __m256i *const round_const, const int round_shift) { const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); diff --git a/aom_dsp/x86/convolve_common_intrin.h b/aom_dsp/x86/convolve_common_intrin.h index 094229d484..a9188b3e64 100644 --- a/aom_dsp/x86/convolve_common_intrin.h +++ b/aom_dsp/x86/convolve_common_intrin.h @@ -15,7 +15,7 @@ // Note: // This header file should be put below any x86 intrinsics head file -static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, +static inline void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, const int do_average) { __m128i d; if (do_average) { @@ -28,7 +28,7 @@ static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, _mm_store_si128((__m128i *)dst, d); } -static INLINE void prepare_coeffs_12tap(const InterpFilterParams *filter_params, +static inline void prepare_coeffs_12tap(const InterpFilterParams *filter_params, int subpel_q4, __m128i *coeffs /* [6] */) { const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( @@ -48,7 +48,7 @@ static INLINE void prepare_coeffs_12tap(const InterpFilterParams *filter_params, _mm_shuffle_epi32(coeffs_y, 85); // coeffs 10 11 10 11 10 11 10 11 } -static INLINE __m128i convolve_12tap(const __m128i *s, const __m128i *coeffs) { +static inline __m128i convolve_12tap(const __m128i *s, const __m128i *coeffs) { const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); @@ -61,7 +61,7 @@ static INLINE __m128i convolve_12tap(const __m128i *s, const __m128i *coeffs) { return d; } -static INLINE __m128i convolve_lo_x_12tap(const __m128i *s, +static inline __m128i convolve_lo_x_12tap(const __m128i *s, const __m128i *coeffs, const __m128i zero) { __m128i ss[6]; @@ -74,7 +74,7 @@ static INLINE __m128i convolve_lo_x_12tap(const __m128i *s, return convolve_12tap(ss, coeffs); } -static INLINE __m128i convolve_lo_y_12tap(const __m128i *s, 
+static inline __m128i convolve_lo_y_12tap(const __m128i *s, const __m128i *coeffs) { __m128i ss[6]; const __m128i zero = _mm_setzero_si128(); @@ -87,7 +87,7 @@ static INLINE __m128i convolve_lo_y_12tap(const __m128i *s, return convolve_12tap(ss, coeffs); } -static INLINE __m128i convolve_hi_y_12tap(const __m128i *s, +static inline __m128i convolve_hi_y_12tap(const __m128i *s, const __m128i *coeffs) { __m128i ss[6]; const __m128i zero = _mm_setzero_si128(); diff --git a/aom_dsp/x86/convolve_sse2.h b/aom_dsp/x86/convolve_sse2.h index 7c25a00011..b8676f29be 100644 --- a/aom_dsp/x86/convolve_sse2.h +++ b/aom_dsp/x86/convolve_sse2.h @@ -16,7 +16,7 @@ // Note: // This header file should be put below any x86 intrinsics head file -static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, +static inline void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -33,7 +33,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); } -static INLINE __m128i convolve(const __m128i *const s, +static inline __m128i convolve(const __m128i *const s, const __m128i *const coeffs) { const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]); const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]); @@ -46,7 +46,7 @@ static INLINE __m128i convolve(const __m128i *const s, return res; } -static INLINE __m128i convolve_lo_x(const __m128i *const s, +static inline __m128i convolve_lo_x(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); @@ -56,7 +56,7 @@ static INLINE __m128i convolve_lo_x(const __m128i *const s, return convolve(ss, coeffs); } -static INLINE __m128i convolve_lo_y(const __m128i *const s, +static inline __m128i convolve_lo_y(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); @@ -66,7 +66,7 @@ static INLINE __m128i convolve_lo_y(const __m128i *const s, return convolve(ss, coeffs); } -static INLINE __m128i convolve_hi_y(const __m128i *const s, +static inline __m128i convolve_hi_y(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); @@ -76,7 +76,7 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s, return convolve(ss, coeffs); } -static INLINE __m128i comp_avg(const __m128i *const data_ref_0, +static inline __m128i comp_avg(const __m128i *const data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt, const int use_dist_wtd_avg) { @@ -99,7 +99,7 @@ static INLINE __m128i comp_avg(const __m128i *const data_ref_0, return res; } -static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned, +static inline __m128i convolve_rounding(const __m128i *const res_unsigned, const __m128i *const offset_const, const __m128i *const round_const, const int round_shift) { @@ -109,7 +109,7 @@ static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned, return res_round; } -static INLINE __m128i highbd_convolve_rounding_sse2( +static inline __m128i highbd_convolve_rounding_sse2( const __m128i *const res_unsigned, const __m128i *const offset_const, const __m128i *const round_const, const int round_shift) { const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const); diff --git a/aom_dsp/x86/convolve_sse4_1.h b/aom_dsp/x86/convolve_sse4_1.h 
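The convolve_sse4_1.h helpers whose diff starts here implement AV1's distance-weighted compound average: two predictions are blended with integer weights that sum to 1 << DIST_PRECISION_BITS (4 in libaom), so the blend reduces to a multiply, an add, a round, and a shift. A minimal scalar sketch of that blend, for illustration only (the function name and the simplified rounding shown are assumptions, not code from this tree):

/* Distance-weighted compound average, scalar sketch. The real SIMD helpers
 * (comp_avg, highbd_comp_avg_sse4_1) additionally carry an
 * intermediate-precision offset that is added before and removed after this
 * blend; that part is omitted here. Illustrative only, not libaom code. */
#include <stdint.h>

#define DIST_PRECISION_BITS 4

static uint16_t dist_wtd_avg_scalar(uint16_t p0, uint16_t p1, int w0, int w1) {
  /* w0 + w1 == (1 << DIST_PRECISION_BITS), e.g. the (9, 7) weight pair */
  const int32_t sum = (int32_t)p0 * w0 + (int32_t)p1 * w1;
  return (uint16_t)((sum + (1 << (DIST_PRECISION_BITS - 1))) >>
                    DIST_PRECISION_BITS);
}

When use_dist_wtd_comp_avg is zero, the same helpers fall back to a plain average of the two predictions, which is the non-weighted path visible in the comp_avg functions above.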
index 33b1b83af8..a0e5f45209 100644 --- a/aom_dsp/x86/convolve_sse4_1.h +++ b/aom_dsp/x86/convolve_sse4_1.h @@ -15,7 +15,7 @@ // Note: // This header file should be put below any x86 intrinsics head file -static INLINE void mult_add_store(CONV_BUF_TYPE *const dst, +static inline void mult_add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, const __m128i *const wt0, const __m128i *const wt1, @@ -31,7 +31,7 @@ static INLINE void mult_add_store(CONV_BUF_TYPE *const dst, _mm_store_si128((__m128i *)dst, d); } -static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, +static inline __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt0, const __m128i *const wt1, diff --git a/aom_dsp/x86/convolve_ssse3.h b/aom_dsp/x86/convolve_ssse3.h index 288468b1f0..79cdd6151b 100644 --- a/aom_dsp/x86/convolve_ssse3.h +++ b/aom_dsp/x86/convolve_ssse3.h @@ -14,7 +14,7 @@ #include <tmmintrin.h> // SSSE3 -static INLINE void shuffle_filter_ssse3(const int16_t *const filter, +static inline void shuffle_filter_ssse3(const int16_t *const filter, __m128i *const f) { const __m128i f_values = _mm_load_si128((const __m128i *)filter); // pack and duplicate the filter values @@ -24,7 +24,7 @@ static INLINE void shuffle_filter_ssse3(const int16_t *const filter, f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); } -static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, +static inline __m128i convolve8_8_ssse3(const __m128i *const s, const __m128i *const f) { // multiply 2 adjacent elements with the filter and add the result const __m128i k_64 = _mm_set1_epi16(1 << 6); diff --git a/aom_dsp/x86/fft_avx2.c b/aom_dsp/x86/fft_avx2.c index 5b3eab1d48..cd05993e88 100644 --- a/aom_dsp/x86/fft_avx2.c +++ b/aom_dsp/x86/fft_avx2.c @@ -20,13 +20,13 @@ extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output, int n); // Generate the 1d forward transforms for float using _mm256 -GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, +GEN_FFT_8(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) -GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, +GEN_FFT_16(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) -GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, +GEN_FFT_32(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) @@ -46,13 +46,13 @@ void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) { } // Generate the 1d inverse transforms for float using _mm256 -GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, +GEN_IFFT_8(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) -GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, +GEN_IFFT_16(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) -GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, +GEN_IFFT_32(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) diff --git a/aom_dsp/x86/fft_sse2.c b/aom_dsp/x86/fft_sse2.c index f73897acec..660b6a6f9d 
100644 --- a/aom_dsp/x86/fft_sse2.c +++ b/aom_dsp/x86/fft_sse2.c @@ -15,7 +15,7 @@ s * PATENTS file, you can obtain it at www.aomedia.org/license/patent. #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" -static INLINE void transpose4x4(const float *A, float *B, const int lda, +static inline void transpose4x4(const float *A, float *B, const int lda, const int ldb) { __m128 row1 = _mm_load_ps(&A[0 * lda]); __m128 row2 = _mm_load_ps(&A[1 * lda]); @@ -111,13 +111,13 @@ void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { } // Generate definitions for 1d transforms using float and __mm128 -GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, +GEN_FFT_4(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps) -GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, +GEN_FFT_8(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) -GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, +GEN_FFT_16(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) -GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, +GEN_FFT_32(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { @@ -141,13 +141,13 @@ void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { } // Generate definitions for 1d inverse transforms using float and mm128 -GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, +GEN_IFFT_4(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps) -GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, +GEN_IFFT_8(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) -GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, +GEN_IFFT_16(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) -GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, +GEN_IFFT_32(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h index 3e418581c6..f8ad3592a7 100644 --- a/aom_dsp/x86/fwd_txfm_sse2.h +++ b/aom_dsp/x86/fwd_txfm_sse2.h @@ -16,7 +16,7 @@ extern "C" { #endif -static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { +static inline __m128i k_madd_epi32(__m128i a, __m128i b) { __m128i buf0, buf1; buf0 = _mm_mul_epu32(a, b); a = _mm_srli_epi64(a, 32); @@ -25,13 +25,13 @@ static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { return _mm_add_epi64(buf0, buf1); } -static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { +static inline __m128i k_packs_epi64(__m128i a, __m128i b) { __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); return _mm_unpacklo_epi64(buf0, buf1); } -static INLINE int check_epi16_overflow_x2(const __m128i *preg0, +static inline int check_epi16_overflow_x2(const __m128i *preg0, 
const __m128i *preg1) { const __m128i max_overflow = _mm_set1_epi16(0x7fff); const __m128i min_overflow = _mm_set1_epi16((short)0x8000); @@ -43,7 +43,7 @@ static INLINE int check_epi16_overflow_x2(const __m128i *preg0, return _mm_movemask_epi8(cmp0); } -static INLINE int check_epi16_overflow_x4(const __m128i *preg0, +static inline int check_epi16_overflow_x4(const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, const __m128i *preg3) { @@ -61,7 +61,7 @@ static INLINE int check_epi16_overflow_x4(const __m128i *preg0, return _mm_movemask_epi8(cmp0); } -static INLINE int check_epi16_overflow_x8( +static inline int check_epi16_overflow_x8( const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, const __m128i *preg6, const __m128i *preg7) { @@ -71,7 +71,7 @@ static INLINE int check_epi16_overflow_x8( return res0 + res1; } -static INLINE int check_epi16_overflow_x12( +static inline int check_epi16_overflow_x12( const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, @@ -83,7 +83,7 @@ static INLINE int check_epi16_overflow_x12( return res0 + res1; } -static INLINE int check_epi16_overflow_x16( +static inline int check_epi16_overflow_x16( const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, @@ -100,7 +100,7 @@ static INLINE int check_epi16_overflow_x16( return res0 + res1; } -static INLINE int check_epi16_overflow_x32( +static inline int check_epi16_overflow_x32( const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, @@ -135,7 +135,7 @@ static INLINE int check_epi16_overflow_x32( return res0 + res1; } -static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { +static inline void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { const __m128i zero = _mm_setzero_si128(); const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); @@ -144,7 +144,7 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { _mm_store_si128((__m128i *)(dst_ptr + 4), out1); } -static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { +static inline void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { const __m128i zero = _mm_setzero_si128(); const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); diff --git a/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/aom_dsp/x86/highbd_adaptive_quantize_avx2.c index a07585f2a5..9463f9fb46 100644 --- a/aom_dsp/x86/highbd_adaptive_quantize_avx2.c +++ b/aom_dsp/x86/highbd_adaptive_quantize_avx2.c @@ -17,7 +17,7 @@ #include "aom_dsp/quantize.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE void highbd_load_b_values_avx2( +static inline void highbd_load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, @@ -31,7 +31,7 @@ static INLINE void highbd_load_b_values_avx2( *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const 
__m128i *)shift_ptr)); } -static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask, +static inline void highbd_update_mask1_avx2(__m256i *cmp_mask, const int16_t *iscan_ptr, int *is_found, __m256i *mask) { __m256i temp_mask = _mm256_setzero_si256(); @@ -43,7 +43,7 @@ static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask, *mask = _mm256_max_epi16(temp_mask, *mask); } -static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, +static inline void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, __m256i *threshold, const int16_t *iscan_ptr, int *is_found, __m256i *mask) { @@ -57,7 +57,7 @@ static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); } -static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, +static inline void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, __m256i *p, const int shift) { __m256i prod_lo = _mm256_mul_epi32(*x, *y); __m256i prod_hi = _mm256_srli_epi64(*x, 32); @@ -71,7 +71,7 @@ static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa); } -static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff, +static inline void highbd_calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, const __m256i *quant, const __m256i *shift, @@ -83,19 +83,19 @@ static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff, highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale); } -static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff, +static inline __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) { return _mm256_mullo_epi32(qcoeff, dequant); } -static INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2( +static inline __m256i highbd_calculate_dqcoeff_log_scale_avx2( __m256i qcoeff, __m256i dequant, const int log_scale) { __m256i abs_coeff = _mm256_abs_epi32(qcoeff); highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale); return _mm256_sign_epi32(abs_coeff, qcoeff); } -static INLINE void highbd_store_coefficients_avx2(__m256i coeff0, +static inline void highbd_store_coefficients_avx2(__m256i coeff0, __m256i coeff1, tran_low_t *coeff_ptr) { _mm256_store_si256((__m256i *)(coeff_ptr), coeff0); diff --git a/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/aom_dsp/x86/highbd_adaptive_quantize_sse2.c index 333e9f6995..d482fc32c7 100644 --- a/aom_dsp/x86/highbd_adaptive_quantize_sse2.c +++ b/aom_dsp/x86/highbd_adaptive_quantize_sse2.c @@ -16,12 +16,12 @@ #include "aom_dsp/quantize.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) { +static inline __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); return _mm_sub_epi64(a, sign); } -static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, +static inline void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, __m128i *p, const int shift) { __m128i sign = _mm_srai_epi32(*y, 31); __m128i sign_lo = _mm_unpacklo_epi32(sign, sign); @@ -43,7 +43,7 @@ static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, *p = _mm_or_si128(prod_lo, prod_hi); } -static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, +static inline void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, const __m128i *quant, const __m128i *shift, const int *log_scale) { @@ -54,7 +54,7 @@ static INLINE 
void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale); } -static INLINE void highbd_update_mask1(__m128i *cmp_mask0, +static inline void highbd_update_mask1(__m128i *cmp_mask0, const int16_t *iscan_ptr, int *is_found, __m128i *mask) { __m128i temp_mask = _mm_setzero_si128(); @@ -67,7 +67,7 @@ static INLINE void highbd_update_mask1(__m128i *cmp_mask0, *mask = _mm_max_epi16(temp_mask, *mask); } -static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, +static inline void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, __m128i *threshold, const int16_t *iscan_ptr, int *is_found, __m128i *mask) { @@ -83,7 +83,7 @@ static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask); } -static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant, +static inline __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant, const int log_scale) { __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31); __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign); diff --git a/aom_dsp/x86/highbd_convolve_avx2.c b/aom_dsp/x86/highbd_convolve_avx2.c index 8a234b8a29..a98e14bda1 100644 --- a/aom_dsp/x86/highbd_convolve_avx2.c +++ b/aom_dsp/x86/highbd_convolve_avx2.c @@ -307,7 +307,7 @@ static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; // ----------------------------------------------------------------------------- // Horizontal Filtering -static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { +static inline void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); @@ -321,7 +321,7 @@ static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { // Note: // Shared by 8x2 and 16x1 block -static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, +static inline void pack_16_pixels(const __m256i *s0, const __m256i *s1, __m256i *x /*x[8]*/) { __m256i pp[8]; pack_pixels(s0, pp); @@ -336,7 +336,7 @@ static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); } -static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { +static inline void pack_8x1_pixels(const uint16_t *src, __m256i *x) { __m256i pp[8]; __m256i s0; s0 = _mm256_loadu_si256((const __m256i *)src); @@ -347,7 +347,7 @@ static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); } -static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, +static inline void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, __m256i *x) { __m256i s0, s1; s0 = _mm256_loadu_si256((const __m256i *)src); @@ -355,7 +355,7 @@ static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, pack_16_pixels(&s0, &s1, x); } -static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { +static inline void pack_16x1_pixels(const uint16_t *src, __m256i *x) { __m256i s0, s1; s0 = _mm256_loadu_si256((const __m256i *)src); s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); @@ -364,7 +364,7 @@ static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { // Note: // Shared by horizontal and vertical filtering -static INLINE void 
pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { +static inline void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { const __m128i h = _mm_loadu_si128((const __m128i *)filter); const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); const __m256i p0 = _mm256_set1_epi32(0x03020100); @@ -377,7 +377,7 @@ static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { f[3] = _mm256_shuffle_epi8(hh, p3); } -static INLINE void pack_filters_4tap(const int16_t *filter, +static inline void pack_filters_4tap(const int16_t *filter, __m256i *f /*f[4]*/) { const __m128i h = _mm_loadu_si128((const __m128i *)filter); const __m256i coeff = _mm256_broadcastsi128_si256(h); @@ -388,7 +388,7 @@ static INLINE void pack_filters_4tap(const int16_t *filter, f[1] = _mm256_shuffle_epi32(coeff, 0xaa); } -static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, +static inline void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, const __m256i *fil /*fil[4]*/, __m256i *y) { __m256i a, a0, a1; @@ -415,7 +415,7 @@ static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, } } -static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, +static inline void store_8x1_pixels(const __m256i *y, const __m256i *mask, uint16_t *dst) { const __m128i a0 = _mm256_castsi256_si128(*y); const __m128i a1 = _mm256_extractf128_si256(*y, 1); @@ -424,7 +424,7 @@ static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, _mm_storeu_si128((__m128i *)dst, res); } -static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, +static inline void store_8x2_pixels(const __m256i *y0, const __m256i *y1, const __m256i *mask, uint16_t *dst, ptrdiff_t pitch) { __m256i a = _mm256_packus_epi32(*y0, *y1); @@ -433,7 +433,7 @@ static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); } -static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, +static inline void store_16x1_pixels(const __m256i *y0, const __m256i *y1, const __m256i *mask, uint16_t *dst) { __m256i a = _mm256_packus_epi32(*y0, *y1); a = _mm256_min_epi16(a, *mask); @@ -661,7 +661,7 @@ static void aom_highbd_filter_block1d16_h4_avx2( // ----------------------------------------------------------------------------- // 2-tap horizontal filtering -static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { +static inline void pack_2t_filter(const int16_t *filter, __m256i *f) { const __m128i h = _mm_loadu_si128((const __m128i *)filter); const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); const __m256i p = _mm256_set1_epi32(0x09080706); @@ -671,7 +671,7 @@ static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { // can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() // the difference is s0/s1 specifies first and second rows or, // first 16 samples and 8-sample shifted 16 samples -static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, +static inline void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, __m256i *sig) { const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); @@ -685,21 +685,21 @@ static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); } -static INLINE void pack_8x2_2t_pixels(const uint16_t *src, +static inline void 
pack_8x2_2t_pixels(const uint16_t *src, const ptrdiff_t pitch, __m256i *sig) { const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); pack_16_2t_pixels(&r0, &r1, sig); } -static INLINE void pack_16x1_2t_pixels(const uint16_t *src, +static inline void pack_16x1_2t_pixels(const uint16_t *src, __m256i *sig /*sig[2]*/) { const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); pack_16_2t_pixels(&r0, &r1, sig); } -static INLINE void pack_8x1_2t_pixels(const uint16_t *src, +static inline void pack_8x1_2t_pixels(const uint16_t *src, __m256i *sig /*sig[2]*/) { const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); @@ -711,7 +711,7 @@ static INLINE void pack_8x1_2t_pixels(const uint16_t *src, } // can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() -static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, +static inline void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, __m256i *y0, __m256i *y1) { const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); __m256i x0 = _mm256_madd_epi16(sig[0], *f); @@ -722,7 +722,7 @@ static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); } -static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, +static inline void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, __m256i *y0) { const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); __m256i x0 = _mm256_madd_epi16(sig[0], *f); @@ -810,7 +810,7 @@ static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { sig[8] = s6; } -static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, +static inline void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { // base + 7th row __m256i s0 = _mm256_castsi128_si256( @@ -825,13 +825,13 @@ static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, sig[8] = s1; } -static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, +static inline void filter_8x9_pixels(const __m256i *sig, const __m256i *f, __m256i *y0, __m256i *y1) { filter_8x1_pixels(sig, f, y0); filter_8x1_pixels(&sig[4], f, y1); } -static INLINE void update_pixels(__m256i *sig) { +static inline void update_pixels(__m256i *sig) { int i; for (i = 0; i < 3; ++i) { sig[i] = sig[i + 1]; @@ -936,7 +936,7 @@ static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, sig[16] = s8; } -static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, +static inline void filter_16x9_pixels(const __m256i *sig, const __m256i *f, __m256i *y0, __m256i *y1) { __m256i res[4]; int i; @@ -952,7 +952,7 @@ static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, } } -static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, +static inline void store_16x2_pixels(const __m256i *y0, const __m256i *y1, const __m256i *mask, uint16_t *dst, ptrdiff_t pitch) { __m256i p = _mm256_min_epi16(*y0, *mask); @@ -1132,7 +1132,7 @@ static void pack_16x2_init(const uint16_t *src, __m256i *sig) { sig[2] = _mm256_loadu_si256((const __m256i *)src); } -static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, +static inline void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, __m256i 
*sig) { // load the next row const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); @@ -1141,7 +1141,7 @@ static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, sig[2] = u; } -static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, +static inline void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, __m256i *y0, __m256i *y1) { filter_16_2t_pixels(sig, f, y0, y1); } @@ -1167,7 +1167,7 @@ static void aom_highbd_filter_block1d16_v2_avx2( } while (height > 0); } -static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { +static inline void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { const __m128i h = _mm_loadu_si128((const __m128i *)filter); const __m128i p = _mm_set1_epi32(0x09080706); f[0] = _mm_shuffle_epi8(h, p); @@ -1177,7 +1177,7 @@ static void pack_8x2_init(const uint16_t *src, __m128i *sig) { sig[2] = _mm_loadu_si128((const __m128i *)src); } -static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, +static inline void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, __m128i *sig) { // load the next row const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); @@ -1186,7 +1186,7 @@ static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, sig[2] = u; } -static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, +static inline void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, __m128i *y0, __m128i *y1) { const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); __m128i x0 = _mm_madd_epi16(sig[0], *f); @@ -1197,7 +1197,7 @@ static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); } -static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, +static inline void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, const __m128i *mask, uint16_t *dst) { __m128i res = _mm_packus_epi32(*y0, *y1); res = _mm_min_epi16(res, *mask); diff --git a/aom_dsp/x86/highbd_intrapred_sse2.c b/aom_dsp/x86/highbd_intrapred_sse2.c index df8f6725de..388d75fabf 100644 --- a/aom_dsp/x86/highbd_intrapred_sse2.c +++ b/aom_dsp/x86/highbd_intrapred_sse2.c @@ -103,7 +103,7 @@ void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); } -static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, +static inline void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, const __m128i *row) { const __m128i val = _mm_unpacklo_epi64(*row, *row); _mm_store_si128((__m128i *)*dst, val); @@ -111,7 +111,7 @@ static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, *dst += stride; } -static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, +static inline void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, const __m128i *row) { const __m128i val = _mm_unpackhi_epi64(*row, *row); _mm_store_si128((__m128i *)(*dst), val); @@ -119,7 +119,7 @@ static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, *dst += stride; } -static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, +static inline void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, const uint16_t *left) { const __m128i left_u16 = _mm_load_si128((const __m128i *)left); const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); @@ -174,7 +174,7 @@ void 
aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, +static inline void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, const __m128i *row) { const __m128i val = _mm_unpacklo_epi64(*row, *row); _mm_store_si128((__m128i *)(*dst), val); @@ -184,7 +184,7 @@ static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, *dst += stride; } -static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, +static inline void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, const __m128i *row) { const __m128i val = _mm_unpackhi_epi64(*row, *row); _mm_store_si128((__m128i *)(*dst), val); @@ -194,7 +194,7 @@ static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, *dst += stride; } -static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, +static inline void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, const uint16_t *left) { const __m128i left_u16 = _mm_load_si128((const __m128i *)left); const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); @@ -246,14 +246,14 @@ void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, // 4x4 -static INLINE __m128i dc_sum_4(const uint16_t *ref) { +static inline __m128i dc_sum_4(const uint16_t *ref) { const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); const __m128i a = _mm_add_epi16(_dcba, _xxdc); return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); } -static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, +static inline void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, const __m128i *dc) { const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); int i; @@ -297,7 +297,7 @@ void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- // 4x8 -static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, +static inline void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, const __m128i *dc) { const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); int i; @@ -307,7 +307,7 @@ static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, } // Shared with DC 8xh -static INLINE __m128i dc_sum_8(const uint16_t *ref) { +static inline __m128i dc_sum_8(const uint16_t *ref) { const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); @@ -351,7 +351,7 @@ void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- // 8xh -static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, +static inline void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, const __m128i *dc) { const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); @@ -364,7 +364,7 @@ static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, // ----------------------------------------------------------------------------- // DC_TOP -static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, +static inline void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, int height, const uint16_t *above) { const __m128i four = _mm_cvtsi32_si128(4); const __m128i sum = dc_sum_8(above); @@ -422,7 +422,7 @@ void 
aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, } // Shared with DC 16xh -static INLINE __m128i dc_sum_16(const uint16_t *ref) { +static inline __m128i dc_sum_16(const uint16_t *ref) { const __m128i sum_lo = dc_sum_8(ref); const __m128i sum_hi = dc_sum_8(ref + 8); return _mm_add_epi16(sum_lo, sum_hi); @@ -442,7 +442,7 @@ void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- // DC_128 -static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, +static inline void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, int height, int bd) { const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); @@ -476,7 +476,7 @@ void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- // 16xh -static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, +static inline void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, const __m128i *dc) { const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); @@ -513,7 +513,7 @@ void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, } // Shared with 32xh -static INLINE __m128i dc_sum_32(const uint16_t *ref) { +static inline __m128i dc_sum_32(const uint16_t *ref) { const __m128i zero = _mm_setzero_si128(); const __m128i sum_a = dc_sum_16(ref); const __m128i sum_b = dc_sum_16(ref + 16); @@ -605,7 +605,7 @@ void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- // 32xh -static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, +static inline void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, const __m128i *dc) { const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c index cc0bcd991d..e7fa53cb98 100644 --- a/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -25,7 +25,7 @@ static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); } -static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, +static inline void get_limit(const uint8_t *bl, const uint8_t *l, const uint8_t *t, int bd, __m128i *blt, __m128i *lt, __m128i *thr, __m128i *t80_out) { const int shift = bd - 8; @@ -43,7 +43,7 @@ static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, *t80_out = _mm_set1_epi16(1 << (bd - 1)); } -static INLINE void get_limit_dual( +static inline void get_limit_dual( const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out, @@ -71,7 +71,7 @@ static INLINE void get_limit_dual( *t80_out = _mm_set1_epi16(1 << (bd - 1)); } -static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, +static inline void load_highbd_pixel(const uint16_t *s, int size, int pitch, __m128i *p, __m128i *q) { int i; for (i = 0; i < size; i++) { @@ -80,7 +80,7 @@ static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, } } -static 
INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, +static inline void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, const __m128i *l, const __m128i *bl, __m128i *mask) { __m128i abs_p0q0 = abs_diff16(p[0], q[0]); @@ -105,7 +105,7 @@ static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, *mask = _mm_cmpeq_epi16(max, zero); // return ~mask } -static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, +static inline void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, __m128i *p1p0, __m128i *q1q0, __m128i *abs_p1p0, __m128i *l, __m128i *bl, __m128i *t, @@ -154,7 +154,7 @@ static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, *mask = _mm_cmpeq_epi16(max, zero); // ~mask } -static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq, +static inline void flat_mask_internal(const __m128i *th, const __m128i *pq, int start, int end, __m128i *flat) { int i; __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]), @@ -172,7 +172,7 @@ static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq, *flat = _mm_cmpeq_epi16(ft, zero); } -static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p, +static inline void flat_mask_internal_dual(const __m128i *th, const __m128i *p, const __m128i *q, int start, int end, __m128i *flat) { int i; @@ -191,7 +191,7 @@ static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p, *flat = _mm_cmpeq_epi16(ft, zero); } -static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, +static inline void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, __m128i *flat2, int bd) { // check the distance 1,2,3 against 0 __m128i th = _mm_set1_epi16(1); @@ -200,7 +200,7 @@ static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, flat_mask_internal(&th, pq, 4, 7, flat2); } -static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p, +static inline void highbd_flat_mask4_dual_sse2(const __m128i *p, const __m128i *q, __m128i *flat, __m128i *flat2, int bd) { // check the distance 1,2,3 against 0 @@ -264,7 +264,7 @@ static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0, *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80); } -static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps, +static inline void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps, __m128i *qs, const __m128i *mask, const __m128i *th, int bd, __m128i *t80) { diff --git a/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/aom_dsp/x86/highbd_quantize_intrin_avx2.c index 5ae3f90153..44918f880e 100644 --- a/aom_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/aom_dsp/x86/highbd_quantize_intrin_avx2.c @@ -15,21 +15,21 @@ #include "aom/aom_integer.h" -static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { +static inline void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); const __m128i dc = _mm_unpacklo_epi16(*p, sign); const __m128i ac = _mm_unpackhi_epi16(*p, sign); *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); } -static INLINE void update_qp(__m256i *qp) { +static inline void update_qp(__m256i *qp) { int i; for (i = 0; i < 5; ++i) { qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); } } -static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr, +static inline void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *dequant_ptr, const int16_t *quant_shift_ptr, 
__m256i *qp, int log_scale) { @@ -59,7 +59,7 @@ static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr, // Note: // *x is vector multiplied by *y which is 16 int32_t parallel multiplication // and right shift 16. The output, 16 int32_t is save in *p. -static INLINE __m256i mm256_mul_shift_epi32(const __m256i *x, +static inline __m256i mm256_mul_shift_epi32(const __m256i *x, const __m256i *y) { __m256i prod_lo = _mm256_mul_epi32(*x, *y); __m256i prod_hi = _mm256_srli_epi64(*x, 32); diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c index 68bc928ecb..9eeef08912 100644 --- a/aom_dsp/x86/highbd_sad_avx2.c +++ b/aom_dsp/x86/highbd_sad_avx2.c @@ -19,7 +19,7 @@ #include "aom_ports/mem.h" // SAD -static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) { +static inline unsigned int get_sad_from_mm256_epi32(const __m256i *v) { // input 8 32-bit summation __m128i lo128, hi128; __m256i u = _mm256_srli_si256(*v, 8); @@ -37,7 +37,7 @@ static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) { return (unsigned int)_mm_cvtsi128_si32(lo128); } -static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r, +static inline void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r, __m256i *sad_acc) { const __m256i zero = _mm256_setzero_si256(); int i; @@ -58,7 +58,7 @@ static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r, } // If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. -static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, +static inline void sad16x4(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, const uint16_t *sec_ptr, __m256i *sad_acc) { __m256i s[4], r[4]; @@ -560,7 +560,7 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, // SAD 4D // Combine 4 __m256i input vectors v to uint32_t result[4] -static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, +static inline void get_4d_sad_from_mm256_epi32(const __m256i *v, uint32_t *res) { __m256i u0, u1, u2, u3; const __m256i mask = _mm256_set1_epi64x(~0u); diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c index ca45c324fd..c04158a316 100644 --- a/aom_dsp/x86/highbd_variance_sse2.c +++ b/aom_dsp/x86/highbd_variance_sse2.c @@ -608,7 +608,7 @@ FNS(sse2) #undef FNS #undef FN -static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, +static inline void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, const __m128i *w0, const __m128i *w1, const __m128i *r, diff --git a/aom_dsp/x86/highbd_variance_sse4.c b/aom_dsp/x86/highbd_variance_sse4.c index 24bf1daebc..3717c5e7fd 100644 --- a/aom_dsp/x86/highbd_variance_sse4.c +++ b/aom_dsp/x86/highbd_variance_sse4.c @@ -17,7 +17,7 @@ #include "aom_dsp/variance.h" #include "aom_dsp/aom_filter.h" -static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, +static inline void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, uint64_t *sse, int64_t *sum) { __m128i u0, u1, u2, u3; diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c index e045eab616..7a41618d16 100644 --- a/aom_dsp/x86/intrapred_avx2.c +++ b/aom_dsp/x86/intrapred_avx2.c @@ -16,7 +16,7 @@ #include "aom_dsp/x86/intrapred_utils.h" #include "aom_dsp/x86/lpf_common_sse2.h" -static INLINE __m256i dc_sum_64(const uint8_t *ref) { +static inline __m256i dc_sum_64(const uint8_t *ref) { const __m256i x0 = _mm256_loadu_si256((const __m256i 
*)ref); const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); const __m256i zero = _mm256_setzero_si256(); @@ -29,7 +29,7 @@ static INLINE __m256i dc_sum_64(const uint8_t *ref) { return _mm256_add_epi16(y0, u0); } -static INLINE __m256i dc_sum_32(const uint8_t *ref) { +static inline __m256i dc_sum_32(const uint8_t *ref) { const __m256i x = _mm256_loadu_si256((const __m256i *)ref); const __m256i zero = _mm256_setzero_si256(); __m256i y = _mm256_sad_epu8(x, zero); @@ -39,7 +39,7 @@ static INLINE __m256i dc_sum_32(const uint8_t *ref) { return _mm256_add_epi16(y, u); } -static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, +static inline void row_store_32xh(const __m256i *r, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; ++i) { _mm256_storeu_si256((__m256i *)dst, *r); @@ -47,7 +47,7 @@ static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, } } -static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1, +static inline void row_store_32x2xh(const __m256i *r0, const __m256i *r1, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; ++i) { @@ -57,7 +57,7 @@ static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1, } } -static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, +static inline void row_store_64xh(const __m256i *r, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; ++i) { _mm256_storeu_si256((__m256i *)dst, *r); @@ -135,7 +135,7 @@ static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff } }; -static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { +static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; r0 = _mm_unpacklo_epi16(x[0], x[1]); @@ -179,7 +179,7 @@ static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { d[7] = _mm_unpackhi_epi64(r5, r7); } -static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { +static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { __m256i w0, w1, w2, w3, ww0, ww1; w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 @@ -200,7 +200,7 @@ static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 } -static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { +static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { __m256i w0, w1, w2, w3, ww0, ww1; w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 @@ -238,7 +238,7 @@ static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 } -static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { +static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { __m256i w0, w1, w2, w3, ww0, ww1; __m256i dd[16]; w0 = _mm256_unpacklo_epi16(x[0], x[1]); @@ -378,7 +378,7 @@ void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, // 0,1,2,3, and 16,17,18,19. The next call would do // 4,5,6,7, and 20,21,22,23. So 4 times of calling // would finish 32 rows. 
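As the comment above notes, horizontal prediction is just a per-row broadcast: each output row is the corresponding left-neighbor pixel repeated across the block width, and the AVX2 code handles eight such rows per call, four calls covering all 32 rows. A scalar reference of the operation, as an illustration only (not code from this tree):

/* Scalar reference for an H (horizontal) intra predictor: row r of the
 * block is left[r] replicated bw times. The SIMD versions replicate with
 * byte shuffles instead of memset. Illustrative sketch, not libaom code. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void h_predictor_scalar(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *left) {
  for (int r = 0; r < bh; ++r) {
    memset(dst, left[r], bw); /* broadcast left[r] across the row */
    dst += stride;
  }
}

Splitting the 32-row job into 8-row groups lets each call build all of its replicated rows in registers from a single load of the left column, which is why the interleaved row order described in the comment falls out of the shuffle pattern rather than being computed per row.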
-static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst, +static inline void h_predictor_32x8line(const __m256i *row, uint8_t *dst, ptrdiff_t stride) { __m256i t[4]; __m256i m = _mm256_setzero_si256(); @@ -712,7 +712,7 @@ void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, // PAETH_PRED // Return 16 16-bit pixels in one row (__m256i) -static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, +static inline __m256i paeth_pred(const __m256i *left, const __m256i *top, const __m256i *topleft) { const __m256i base = _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); @@ -736,7 +736,7 @@ static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, } // Return 16 8-bit pixels in one row (__m128i) -static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, +static inline __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, const __m256i *topleft) { const __m256i p0 = paeth_pred(left, top, topleft); const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); @@ -744,7 +744,7 @@ static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, return _mm256_castsi256_si128(p); } -static INLINE __m256i get_top_vector(const uint8_t *above) { +static inline __m256i get_top_vector(const uint8_t *above) { const __m128i x = _mm_load_si128((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i t0 = _mm_unpacklo_epi8(x, zero); @@ -772,7 +772,7 @@ void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, } } -static INLINE __m256i get_left_vector(const uint8_t *left) { +static inline __m256i get_left_vector(const uint8_t *left) { const __m128i x = _mm_load_si128((const __m128i *)left); return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); } @@ -847,7 +847,7 @@ void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, } // Return 32 8-bit pixels in one row (__m256i) -static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, +static inline __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, const __m256i *top1, const __m256i *topleft) { __m256i p0 = paeth_pred(left, top0, topleft); @@ -4241,7 +4241,7 @@ void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } // z3 functions -static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) { +static inline void transpose16x32_avx2(__m256i *x, __m256i *d) { __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; __m256i w10, w11, w12, w13, w14, w15; diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c index 98d9b88d0c..16ef4717aa 100644 --- a/aom_dsp/x86/intrapred_sse2.c +++ b/aom_dsp/x86/intrapred_sse2.c @@ -13,7 +13,7 @@ #include "aom_dsp/x86/intrapred_x86.h" #include "config/aom_dsp_rtcd.h" -static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, +static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; i += 2) { *(uint32_t *)dst = dc; @@ -23,7 +23,7 @@ static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, } } -static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, +static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < height; ++i) { @@ -32,7 +32,7 @@ static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, } } -static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, +static inline void dc_store_16xh(const 
__m128i *row, int height, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < height; ++i) { @@ -41,7 +41,7 @@ static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, } } -static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, +static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < height; ++i) { @@ -51,7 +51,7 @@ static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, } } -static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, +static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; ++i) { _mm_store_si128((__m128i *)dst, *row); @@ -62,20 +62,20 @@ static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, } } -static INLINE __m128i dc_sum_4(const uint8_t *ref) { +static inline __m128i dc_sum_4(const uint8_t *ref) { __m128i x = _mm_loadl_epi64((__m128i const *)ref); const __m128i zero = _mm_setzero_si128(); x = _mm_unpacklo_epi8(x, zero); return _mm_sad_epu8(x, zero); } -static INLINE __m128i dc_sum_8(const uint8_t *ref) { +static inline __m128i dc_sum_8(const uint8_t *ref) { __m128i x = _mm_loadl_epi64((__m128i const *)ref); const __m128i zero = _mm_setzero_si128(); return _mm_sad_epu8(x, zero); } -static INLINE __m128i dc_sum_64(const uint8_t *ref) { +static inline __m128i dc_sum_64(const uint8_t *ref) { __m128i x0 = _mm_load_si128((__m128i const *)ref); __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); @@ -97,7 +97,7 @@ static INLINE __m128i dc_sum_64(const uint8_t *ref) { #define DC_SHIFT2 16 -static INLINE int divide_using_multiply_shift(int num, int shift1, +static inline int divide_using_multiply_shift(int num, int shift1, int multiplier) { const int interm = num >> shift1; return interm * multiplier >> DC_SHIFT2; @@ -916,7 +916,7 @@ void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 64, dst, stride); } -static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, +static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int height) { const __m128i row0 = _mm_load_si128((__m128i const *)above); const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); @@ -945,7 +945,7 @@ void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, v_predictor_32xh(dst, stride, above, 64); } -static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, +static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int height) { const __m128i row0 = _mm_load_si128((__m128i const *)above); const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); @@ -1091,7 +1091,7 @@ void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, _mm_storel_epi64((__m128i *)dst, row3); } -static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, +static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int count) { (void)above; @@ -1167,7 +1167,7 @@ void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, h_predictor_8x16xc(dst, stride, above, left, 2); } -static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, +static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < h; ++i) { @@ -1176,7 +1176,7 @@ static INLINE void 
h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, } } -static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { +static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) { const __m128i u0 = _mm_shufflelo_epi16(*x, 0); const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); @@ -1188,7 +1188,7 @@ static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { row[3] = _mm_unpacklo_epi64(u3, u3); } -static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { +static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) { const __m128i u0 = _mm_shufflehi_epi16(*x, 0); const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); @@ -1202,7 +1202,7 @@ static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { // Process 16x8, first 4 rows // Use first 8 bytes of left register: xxxxxxxx33221100 -static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, +static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, ptrdiff_t stride) { __m128i row[4]; repeat_low_4pixels(left, row); @@ -1211,7 +1211,7 @@ static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, // Process 16x8, second 4 rows // Use second 8 bytes of left register: 77665544xxxxxxxx -static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, +static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, ptrdiff_t stride) { __m128i row[4]; repeat_high_4pixels(left, row); @@ -1236,7 +1236,7 @@ void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, h_prediction_16x8_2(&left_col_8p, dst, stride); } -static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, +static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int count) { int i = 0; do { @@ -1270,7 +1270,7 @@ void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, h_predictor_16xh(dst, stride, left, 4); } -static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, +static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < h; ++i) { @@ -1282,7 +1282,7 @@ static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, // Process 32x8, first 4 rows // Use first 8 bytes of left register: xxxxxxxx33221100 -static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, +static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, ptrdiff_t stride) { __m128i row[4]; repeat_low_4pixels(left, row); @@ -1291,7 +1291,7 @@ static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, // Process 32x8, second 4 rows // Use second 8 bytes of left register: 77665544xxxxxxxx -static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, +static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, ptrdiff_t stride) { __m128i row[4]; repeat_high_4pixels(left, row); @@ -1330,7 +1330,7 @@ void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, h_prediction_32x8_2(&left_col_8p, dst, stride); } -static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, +static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int height) { int i = height >> 2; do { @@ -1360,7 +1360,7 @@ void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, h_predictor_32xh(dst, stride, left, 64); } -static INLINE void 
h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, +static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int height) { int i = height >> 2; do { diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c index 320e6b893d..20a149ab25 100644 --- a/aom_dsp/x86/intrapred_ssse3.c +++ b/aom_dsp/x86/intrapred_ssse3.c @@ -19,7 +19,7 @@ // PAETH_PRED // Return 8 16-bit pixels in one row -static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, +static inline __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, const __m128i *topleft) { const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); @@ -189,7 +189,7 @@ void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, } // Return 16 8-bit pixels in one row -static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, +static inline __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, const __m128i *top1, const __m128i *topleft) { const __m128i p0 = paeth_8x1_pred(left, top0, topleft); @@ -584,7 +584,7 @@ void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, // pixels[0]: above and below_pred interleave vector // pixels[1]: left vector // pixels[2]: right_pred vector -static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, +static inline void load_pixel_w4(const uint8_t *above, const uint8_t *left, int height, __m128i *pixels) { __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]); if (height == 4) @@ -607,7 +607,7 @@ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, // weight_h[2]: same as [0], second half for height = 16 only // weight_h[3]: same as [1], second half for height = 16 only // weight_w[0]: weights_w and scale - weights_w interleave vector -static INLINE void load_weight_w4(int height, __m128i *weight_h, +static inline void load_weight_w4(int height, __m128i *weight_h, __m128i *weight_w) { const __m128i zero = _mm_setzero_si128(); const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE)); @@ -630,7 +630,7 @@ static INLINE void load_weight_w4(int height, __m128i *weight_h, } } -static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, +static inline void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, const __m128i *ww, int h, uint8_t *dst, ptrdiff_t stride, int second_half) { const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE)); @@ -708,7 +708,7 @@ void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, // pixels[5]: above and below_pred interleave vector, second half // pixels[6]: left vector + 16 // pixels[7]: right_pred vector -static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, +static inline void load_pixel_w8(const uint8_t *above, const uint8_t *left, int height, __m128i *pixels) { const __m128i zero = _mm_setzero_si128(); const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]); @@ -744,7 +744,7 @@ static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, // weight_h[7]: same as [1], offset 24 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half // weight_w[1]: weights_w and scale - weights_w interleave vector, second half -static INLINE void load_weight_w8(int height, __m128i *weight_h, +static inline void load_weight_w8(int height, __m128i *weight_h, __m128i *weight_w) { const __m128i zero = _mm_setzero_si128(); const int we_offset = height < 8 ? 
0 : 4; @@ -786,7 +786,7 @@ static INLINE void load_weight_w8(int height, __m128i *weight_h, } } -static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, +static inline void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, const __m128i *ww, int h, uint8_t *dst, ptrdiff_t stride, int second_half) { const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE)); diff --git a/aom_dsp/x86/intrapred_utils.h b/aom_dsp/x86/intrapred_utils.h index 1cc38f7175..f8e5265d0f 100644 --- a/aom_dsp/x86/intrapred_utils.h +++ b/aom_dsp/x86/intrapred_utils.h @@ -53,7 +53,7 @@ static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = { { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 }, }; -static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) { +static inline void transpose4x16_sse2(__m128i *x, __m128i *d) { __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3; w0 = _mm_unpacklo_epi8(x[0], x[1]); w1 = _mm_unpacklo_epi8(x[2], x[3]); @@ -91,7 +91,7 @@ static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) { d[15] = _mm_srli_si128(d[3], 12); } -static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) { +static inline void transpose16x16_sse2(__m128i *x, __m128i *d) { __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; __m128i w10, w11, w12, w13, w14, w15; diff --git a/aom_dsp/x86/intrapred_x86.h b/aom_dsp/x86/intrapred_x86.h index f0b3ec6614..d545e439d2 100644 --- a/aom_dsp/x86/intrapred_x86.h +++ b/aom_dsp/x86/intrapred_x86.h @@ -16,7 +16,7 @@ #include "aom/aom_integer.h" #include "config/aom_config.h" -static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { +static inline __m128i dc_sum_16_sse2(const uint8_t *ref) { __m128i x = _mm_load_si128((__m128i const *)ref); const __m128i zero = _mm_setzero_si128(); x = _mm_sad_epu8(x, zero); @@ -24,7 +24,7 @@ static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { return _mm_add_epi16(x, high); } -static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { +static inline __m128i dc_sum_32_sse2(const uint8_t *ref) { __m128i x0 = _mm_load_si128((__m128i const *)ref); __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); const __m128i zero = _mm_setzero_si128(); diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c index 5ca896be93..07b91af0e7 100644 --- a/aom_dsp/x86/jnt_variance_ssse3.c +++ b/aom_dsp/x86/jnt_variance_ssse3.c @@ -19,7 +19,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/variance_impl_ssse3.h" -static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, +static inline void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r, void *const result) { __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); diff --git a/aom_dsp/x86/loopfilter_avx2.c b/aom_dsp/x86/loopfilter_avx2.c index bfcde46419..d9bf50e441 100644 --- a/aom_dsp/x86/loopfilter_avx2.c +++ b/aom_dsp/x86/loopfilter_avx2.c @@ -472,7 +472,7 @@ void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p, } } -static INLINE void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p, +static inline void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p, unsigned char *out, int out_p, int is_store_avx2) { const __m128i x0 = _mm_loadu_si128((__m128i *)in0); diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c index 3b3f56c61a..580ff59116 100644 --- a/aom_dsp/x86/loopfilter_sse2.c +++ b/aom_dsp/x86/loopfilter_sse2.c @@ -18,14 +18,14 @@ #include "aom_ports/emmintrin_compat.h" #include "aom_dsp/x86/lpf_common_sse2.h" -static INLINE __m128i 
abs_diff(__m128i a, __m128i b) { +static inline __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } // this function treats its input as 2 parallel 8x4 matrices, transposes each of // them to 4x8 independently while flipping the second matrix horizontally. // Used for 14 taps pq pairs creation -static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, +static inline void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *q0p0, __m128i *q1p1, __m128i *q2p2, __m128i *q3p3, __m128i *q4p4, @@ -84,7 +84,7 @@ static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, // this function treats its input as 2 parallel 8x4 matrices, transposes each of // them independently while flipping the second matrix horizontally. Used for 14 // taps filter pq pairs inverse -static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, +static inline void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, @@ -386,7 +386,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, xx_storel_32(s + 3 * p - 2, d3); } -static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { +static inline void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { xx_storel_32(s - (num + 1) * p, x); xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); } @@ -2099,7 +2099,7 @@ void aom_lpf_vertical_14_dual_sse2( _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3); } -static INLINE __m128i filter_add2_sub2(const __m128i *const total, +static inline __m128i filter_add2_sub2(const __m128i *const total, const __m128i *const a1, const __m128i *const a2, const __m128i *const s1, @@ -2109,7 +2109,7 @@ static INLINE __m128i filter_add2_sub2(const __m128i *const total, return x; } -static INLINE __m128i filter8_mask(const __m128i *const flat, +static inline __m128i filter8_mask(const __m128i *const flat, const __m128i *const other_filt, const __m128i *const f8_lo, const __m128i *const f8_hi) { @@ -2119,7 +2119,7 @@ static INLINE __m128i filter8_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -static INLINE __m128i filter16_mask(const __m128i *const flat, +static inline __m128i filter16_mask(const __m128i *const flat, const __m128i *const other_filt, const __m128i *const f_lo, const __m128i *const f_hi) { diff --git a/aom_dsp/x86/lpf_common_sse2.h b/aom_dsp/x86/lpf_common_sse2.h index 6be06d5227..b5b2c38e49 100644 --- a/aom_dsp/x86/lpf_common_sse2.h +++ b/aom_dsp/x86/lpf_common_sse2.h @@ -19,7 +19,7 @@ #define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) #define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) -static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, +static inline void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *d0, __m128i *d1, @@ -59,7 +59,7 @@ static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx } -static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, +static inline void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3) { @@ -78,7 +78,7 @@ static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 
23 33 xx xx xx xx } -static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, +static inline void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { @@ -99,7 +99,7 @@ static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, // here in and out pointers (x and d) should be different! we don't store their // values inside -static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, +static inline void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, @@ -123,7 +123,7 @@ static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7); } -static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, +static inline void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, @@ -157,7 +157,7 @@ static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 } -static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, +static inline void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, @@ -192,7 +192,7 @@ static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, // here in and out pointers (x and d) should be different! we don't store their // values inside -static INLINE void highbd_transpose8x8_sse2( +static inline void highbd_transpose8x8_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, @@ -203,7 +203,7 @@ static INLINE void highbd_transpose8x8_sse2( // here in and out pointers (x and d arrays) should be different! 
we don't store // their values inside -static INLINE void highbd_transpose8x16_sse2( +static inline void highbd_transpose8x16_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, @@ -216,7 +216,7 @@ static INLINE void highbd_transpose8x16_sse2( } // Low bit depth functions -static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, +static inline void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3) { @@ -249,7 +249,7 @@ static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx } -static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, +static inline void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, @@ -298,7 +298,7 @@ static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx } -static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, +static inline void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, __m128i *d2, @@ -345,7 +345,7 @@ static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, *d3 = _mm_srli_si128(*d2, 8); } -static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, +static inline void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, __m128i *d4d5, @@ -392,7 +392,7 @@ static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 } -static INLINE void transpose16x8_8x16_sse2( +static inline void transpose16x8_8x16_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, @@ -444,7 +444,7 @@ static INLINE void transpose16x8_8x16_sse2( *d7 = _mm_unpackhi_epi64(w7, w15); } -static INLINE void transpose8x16_16x8_sse2( +static inline void transpose8x16_16x8_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, @@ -495,7 +495,7 @@ static INLINE void transpose8x16_16x8_sse2( *d14d15 = _mm_unpackhi_epi64(w7, w15); } -static INLINE void transpose_16x8(unsigned char *in0, unsigned char *in1, +static inline void transpose_16x8(unsigned char *in0, unsigned char *in1, int in_p, unsigned char *out, int out_p) { __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i x8, x9, x10, x11, x12, x13, x14, x15; @@ -564,7 +564,7 @@ static INLINE void transpose_16x8(unsigned char *in0, unsigned char *in1, _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); } -static INLINE void transpose_16x8_to_8x16(unsigned char *src, int in_p, +static inline void transpose_16x8_to_8x16(unsigned char *src, int in_p, unsigned char *dst, int out_p) { // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0 // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1 
@@ -652,7 +652,7 @@ static INLINE void transpose_16x8_to_8x16(unsigned char *src, int in_p, mm_storehu(dst + (15 * out_p), x_s37); } -static INLINE void transpose_8xn(unsigned char *src[], int in_p, +static inline void transpose_8xn(unsigned char *src[], int in_p, unsigned char *dst[], int out_p, int num_8x8_to_transpose) { int idx8x8 = 0; diff --git a/aom_dsp/x86/masked_sad4d_ssse3.c b/aom_dsp/x86/masked_sad4d_ssse3.c index 08847c8154..2e69f957d5 100644 --- a/aom_dsp/x86/masked_sad4d_ssse3.c +++ b/aom_dsp/x86/masked_sad4d_ssse3.c @@ -36,7 +36,7 @@ pred = _mm_packus_epi16(pred_l, pred_r); \ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); -static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride, +static inline void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr[4], int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, diff --git a/aom_dsp/x86/masked_sad_intrin_avx2.c b/aom_dsp/x86/masked_sad_intrin_avx2.c index 9bc79d8022..44cb552114 100644 --- a/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -20,7 +20,7 @@ #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_dsp/x86/masked_sad_intrin_ssse3.h" -static INLINE unsigned int masked_sad32xh_avx2( +static inline unsigned int masked_sad32xh_avx2( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { @@ -68,7 +68,7 @@ static INLINE unsigned int masked_sad32xh_avx2( return sad; } -static INLINE unsigned int masked_sad16xh_avx2( +static inline unsigned int masked_sad16xh_avx2( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { @@ -114,7 +114,7 @@ static INLINE unsigned int masked_sad16xh_avx2( return sad; } -static INLINE unsigned int aom_masked_sad_avx2( +static inline unsigned int aom_masked_sad_avx2( const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, int m, int n) { @@ -197,7 +197,7 @@ MASKSADMXN_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY #if CONFIG_AV1_HIGHBITDEPTH -static INLINE unsigned int highbd_masked_sad8xh_avx2( +static inline unsigned int highbd_masked_sad8xh_avx2( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { @@ -254,7 +254,7 @@ static INLINE unsigned int highbd_masked_sad8xh_avx2( return sad; } -static INLINE unsigned int highbd_masked_sad16xh_avx2( +static inline unsigned int highbd_masked_sad16xh_avx2( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { @@ -312,7 +312,7 @@ static INLINE unsigned int highbd_masked_sad16xh_avx2( return sad; } -static INLINE unsigned int aom_highbd_masked_sad_avx2( +static inline unsigned int aom_highbd_masked_sad_avx2( const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, int m, int n) { diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c index 9fa5b58d10..73d86d32bc 100644 --- a/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -22,7 +22,7 @@ #include 
"aom_dsp/x86/masked_sad_intrin_ssse3.h" // For width a multiple of 16 -static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, +static inline unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, @@ -94,7 +94,7 @@ MASKSADMXN_SSSE3(16, 64) MASKSADMXN_SSSE3(64, 16) #endif // !CONFIG_REALTIME_ONLY -static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, +static inline unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, @@ -230,7 +230,7 @@ unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, #if CONFIG_AV1_HIGHBITDEPTH // For width a multiple of 8 -static INLINE unsigned int highbd_masked_sad_ssse3( +static inline unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height); @@ -289,7 +289,7 @@ HIGHBD_MASKSADMXN_SSSE3(16, 64) HIGHBD_MASKSADMXN_SSSE3(64, 16) #endif // !CONFIG_REALTIME_ONLY -static INLINE unsigned int highbd_masked_sad_ssse3( +static inline unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c index 81c40cdfc0..63937936fd 100644 --- a/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -136,7 +136,7 @@ MASK_SUBPIX_VAR_SSSE3(64, 16) MASK_SUBPIX_VAR_SSSE3(16, 64) #endif // !CONFIG_REALTIME_ONLY -static INLINE __m128i filter_block(const __m128i a, const __m128i b, +static inline __m128i filter_block(const __m128i a, const __m128i b, const __m128i filter) { __m128i v0 = _mm_unpacklo_epi8(a, b); v0 = _mm_maddubs_epi16(v0, filter); @@ -221,7 +221,7 @@ static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, } } -static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0, +static inline __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0, const __m128i *a1, const __m128i *b1, const __m128i *filter) { __m128i v0 = _mm_unpacklo_epi8(*a0, *b0); @@ -395,7 +395,7 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, } } -static INLINE void accumulate_block(const __m128i *src, const __m128i *a, +static inline void accumulate_block(const __m128i *src, const __m128i *a, const __m128i *b, const __m128i *m, __m128i *sum, __m128i *sum_sq) { const __m128i zero = _mm_setzero_si128(); @@ -717,7 +717,7 @@ HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) #endif // !CONFIG_REALTIME_ONLY -static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, +static inline __m128i highbd_filter_block(const __m128i a, const __m128i b, const __m128i filter) { __m128i v0 = _mm_unpacklo_epi16(a, b); v0 = _mm_madd_epi16(v0, filter); @@ -803,7 +803,7 @@ static void highbd_bilinear_filter(const uint16_t *src, int src_stride, } } -static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0, +static inline __m128i highbd_filter_block_2rows(const __m128i *a0, const __m128i *b0, const __m128i *a1, const __m128i *b1, diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.h b/aom_dsp/x86/masked_variance_intrin_ssse3.h index c25e5b8523..78c3bdcfea 100644 --- 
a/aom_dsp/x86/masked_variance_intrin_ssse3.h +++ b/aom_dsp/x86/masked_variance_intrin_ssse3.h @@ -21,7 +21,7 @@ #include "aom_dsp/blend.h" -static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0, +static inline void comp_mask_pred_16_ssse3(const uint8_t *src0, const uint8_t *src1, const uint8_t *mask, uint8_t *dst) { const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); @@ -47,7 +47,7 @@ static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0, _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH)); } -static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, +static inline void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, const uint8_t *src0, int stride0, const uint8_t *src1, int stride1, const uint8_t *mask, diff --git a/aom_dsp/x86/mem_sse2.h b/aom_dsp/x86/mem_sse2.h index 343f12f555..c571358896 100644 --- a/aom_dsp/x86/mem_sse2.h +++ b/aom_dsp/x86/mem_sse2.h @@ -19,34 +19,34 @@ #include "aom/aom_integer.h" -static INLINE int16_t loadu_int16(const void *src) { +static inline int16_t loadu_int16(const void *src) { int16_t v; memcpy(&v, src, sizeof(v)); return v; } -static INLINE int32_t loadu_int32(const void *src) { +static inline int32_t loadu_int32(const void *src) { int32_t v; memcpy(&v, src, sizeof(v)); return v; } -static INLINE int64_t loadu_int64(const void *src) { +static inline int64_t loadu_int64(const void *src) { int64_t v; memcpy(&v, src, sizeof(v)); return v; } -static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) { +static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) { _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); } -static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) { +static inline __m128i loadh_epi64(const void *const src, const __m128i s) { return _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); } -static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, +static inline __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, const int byte_stride) { return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride), loadu_int32((int8_t *)src + 1 * byte_stride), @@ -54,7 +54,7 @@ static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, loadu_int32((int8_t *)src + 3 * byte_stride)); } -static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, +static inline __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, const int byte_stride) { __m128i dst; dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); @@ -62,7 +62,7 @@ static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, return dst; } -static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s, +static inline void store_8bit_8x4_from_16x2(const __m128i *const s, uint8_t *const d, const ptrdiff_t stride) { _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); @@ -71,7 +71,7 @@ static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s, _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); } -static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d, +static inline void store_8bit_4x4(const __m128i *const s, uint8_t *const d, const ptrdiff_t stride) { *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); @@ -79,7 +79,7 @@ static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d, *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); } -static INLINE void store_8bit_4x4_sse2(const __m128i s, 
uint8_t *const d, +static inline void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, const ptrdiff_t stride) { __m128i ss[4]; @@ -90,7 +90,7 @@ static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, store_8bit_4x4(ss, d, stride); } -static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, +static inline void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); @@ -98,13 +98,13 @@ static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); } -static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, +static inline void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { load_8bit_4x4(s + 0 * stride, stride, &d[0]); load_8bit_4x4(s + 4 * stride, stride, &d[4]); } -static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, +static inline void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); @@ -112,7 +112,7 @@ static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); } -static INLINE void loadu_8bit_16x4(const uint8_t *const s, +static inline void loadu_8bit_16x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); @@ -120,13 +120,13 @@ static INLINE void loadu_8bit_16x4(const uint8_t *const s, d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); } -static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, +static inline void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { load_8bit_8x4(s + 0 * stride, stride, &d[0]); load_8bit_8x4(s + 4 * stride, stride, &d[4]); } -static INLINE void load_8bit_16x8(const uint8_t *const s, +static inline void load_8bit_16x8(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); @@ -138,13 +138,13 @@ static INLINE void load_8bit_16x8(const uint8_t *const s, d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); } -static INLINE void loadu_8bit_16x8(const uint8_t *const s, +static inline void loadu_8bit_16x8(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); } -static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, +static inline void store_8bit_8x8(const __m128i *const s, uint8_t *const d, const ptrdiff_t stride) { _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); @@ -156,7 +156,7 @@ static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); } -static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, +static inline void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, const ptrdiff_t stride) { _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); _mm_storeu_si128((__m128i *)(d + 1 * 
stride), s[1]); diff --git a/aom_dsp/x86/obmc_intrinsic_sse4.h b/aom_dsp/x86/obmc_intrinsic_sse4.h index 0962e75e04..6083262e3b 100644 --- a/aom_dsp/x86/obmc_intrinsic_sse4.h +++ b/aom_dsp/x86/obmc_intrinsic_sse4.h @@ -17,7 +17,7 @@ #include "aom_dsp/x86/obmc_intrinsic_ssse3.h" #include "aom_dsp/x86/synonyms.h" -static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, +static inline void obmc_variance_w4(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, const int h) { diff --git a/aom_dsp/x86/obmc_intrinsic_ssse3.h b/aom_dsp/x86/obmc_intrinsic_ssse3.h index 8a4af4bc54..44dec9987c 100644 --- a/aom_dsp/x86/obmc_intrinsic_ssse3.h +++ b/aom_dsp/x86/obmc_intrinsic_ssse3.h @@ -16,13 +16,13 @@ #include "config/aom_config.h" -static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { +static inline int32_t xx_hsum_epi32_si32(__m128i v_d) { v_d = _mm_hadd_epi32(v_d, v_d); v_d = _mm_hadd_epi32(v_d, v_d); return _mm_cvtsi128_si32(v_d); } -static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { +static inline int64_t xx_hsum_epi64_si64(__m128i v_q) { v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); #if AOM_ARCH_X86_64 return _mm_cvtsi128_si64(v_q); @@ -35,7 +35,7 @@ static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { #endif } -static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { +static inline int64_t xx_hsum_epi32_si64(__m128i v_d) { const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); @@ -43,7 +43,7 @@ static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { } // This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) -static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { +static inline __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); const __m128i v_tmp_d = diff --git a/aom_dsp/x86/obmc_sad_avx2.c b/aom_dsp/x86/obmc_sad_avx2.c index fcedecf5e2..0cab468771 100644 --- a/aom_dsp/x86/obmc_sad_avx2.c +++ b/aom_dsp/x86/obmc_sad_avx2.c @@ -26,7 +26,7 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre, +static inline unsigned int obmc_sad_w4_avx2(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, @@ -68,7 +68,7 @@ static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre, return xx_hsum_epi32_si32(v_sad_d_0); } -static INLINE unsigned int obmc_sad_w8n_avx2( +static inline unsigned int obmc_sad_w8n_avx2( const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int width, const int height) { const int pre_step = pre_stride - width; @@ -149,7 +149,7 @@ OBMCSADWXH(64, 16) //////////////////////////////////////////////////////////////////////////////// #if CONFIG_AV1_HIGHBITDEPTH -static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, +static inline unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, @@ -193,7 +193,7 @@ static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, return xx_hsum_epi32_si32(v_sad_d_0); } -static INLINE unsigned int hbd_obmc_sad_w8n_avx2( +static inline unsigned int hbd_obmc_sad_w8n_avx2( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const 
int32_t *mask, const int width, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); diff --git a/aom_dsp/x86/obmc_variance_avx2.c b/aom_dsp/x86/obmc_variance_avx2.c index e33238556a..27e56bf388 100644 --- a/aom_dsp/x86/obmc_variance_avx2.c +++ b/aom_dsp/x86/obmc_variance_avx2.c @@ -26,7 +26,7 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, +static inline void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, const int w, const int h) { @@ -81,7 +81,7 @@ static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); } -static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, +static inline void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, const int w, const int h) { diff --git a/aom_dsp/x86/obmc_variance_sse4.c b/aom_dsp/x86/obmc_variance_sse4.c index 8e55ed0695..aa266e4ff8 100644 --- a/aom_dsp/x86/obmc_variance_sse4.c +++ b/aom_dsp/x86/obmc_variance_sse4.c @@ -28,7 +28,7 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, +static inline void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, const int w, const int h) { @@ -159,7 +159,7 @@ OBMC_SUBPIX_VAR(64, 16) // High bit-depth //////////////////////////////////////////////////////////////////////////////// #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void hbd_obmc_variance_w4( +static inline void hbd_obmc_variance_w4( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); @@ -198,7 +198,7 @@ static INLINE void hbd_obmc_variance_w4( *sse = xx_hsum_epi32_si32(v_sse_d); } -static INLINE void hbd_obmc_variance_w8n( +static inline void hbd_obmc_variance_w8n( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, const int h) { @@ -250,7 +250,7 @@ static INLINE void hbd_obmc_variance_w8n( *sse += xx_hsum_epi32_si64(v_sse_d); } -static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride, +static inline void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { @@ -265,7 +265,7 @@ static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)sse64; } -static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, +static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { @@ -291,7 +291,7 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } -static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, +static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, 
int w, int h, unsigned int *sse, int *sum) { diff --git a/aom_dsp/x86/quantize_avx2.c b/aom_dsp/x86/quantize_avx2.c index ef9a0fdb62..3fbbc1f396 100644 --- a/aom_dsp/x86/quantize_avx2.c +++ b/aom_dsp/x86/quantize_avx2.c @@ -14,7 +14,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, +static inline void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, __m256i *quant, const int16_t *dequant_ptr, @@ -49,13 +49,13 @@ static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, *shift = _mm256_permute4x64_epi64(*shift, 0x54); } -static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { +static inline __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr); const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); return _mm256_packs_epi32(coeff1, coeff2); } -static INLINE void store_coefficients_avx2(__m256i coeff_vals, +static inline void store_coefficients_avx2(__m256i coeff_vals, tran_low_t *coeff_ptr) { __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); @@ -97,7 +97,7 @@ static AOM_FORCE_INLINE __m256i quantize_b_logscale0_16( return v_nz_mask; } -static INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, +static inline __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, __m256i v_mask) { const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); @@ -106,7 +106,7 @@ static INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, return _mm256_max_epi16(v_eobmax, v_nz_iscan); } -static INLINE int16_t accumulate_eob256(__m256i eob256) { +static inline int16_t accumulate_eob256(__m256i eob256) { const __m128i eob_lo = _mm256_castsi256_si128(eob256); const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); __m128i eob = _mm_max_epi16(eob_lo, eob_hi); diff --git a/aom_dsp/x86/quantize_ssse3.c b/aom_dsp/x86/quantize_ssse3.c index ce7adca17b..fc794e5574 100644 --- a/aom_dsp/x86/quantize_ssse3.c +++ b/aom_dsp/x86/quantize_ssse3.c @@ -19,7 +19,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round, +static inline void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round, const __m128i quant, const __m128i *shift) { __m128i tmp, qcoeff, tmp1; @@ -33,7 +33,7 @@ static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round, *coeff = _mm_or_si128(tmp, tmp1); } -static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff, +static inline void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff, const __m128i dequant, const __m128i zero, tran_low_t *dqcoeff) { diff --git a/aom_dsp/x86/quantize_x86.h b/aom_dsp/x86/quantize_x86.h index a795e281cc..217989e7ab 100644 --- a/aom_dsp/x86/quantize_x86.h +++ b/aom_dsp/x86/quantize_x86.h @@ -13,7 +13,7 @@ #include "aom/aom_integer.h" -static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, +static inline void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant, @@ -27,17 +27,17 @@ static INLINE 
void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, } // With ssse3 and later abs() and sign() are preferred. -static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { +static inline __m128i invert_sign_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); return _mm_sub_epi16(a, sign); } -static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { +static inline __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); return _mm_sub_epi32(a, sign); } -static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, +static inline void calculate_qcoeff(__m128i *coeff, const __m128i round, const __m128i quant, const __m128i shift) { __m128i tmp, qcoeff; qcoeff = _mm_adds_epi16(*coeff, round); @@ -46,7 +46,7 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, *coeff = _mm_mulhi_epi16(qcoeff, shift); } -static INLINE void calculate_qcoeff_log_scale(__m128i *coeff, +static inline void calculate_qcoeff_log_scale(__m128i *coeff, const __m128i round, const __m128i quant, const __m128i *shift, @@ -62,11 +62,11 @@ static INLINE void calculate_qcoeff_log_scale(__m128i *coeff, *coeff = _mm_or_si128(tmp, tmp1); } -static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { +static inline __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { return _mm_mullo_epi16(qcoeff, dequant); } -static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff, +static inline void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff, __m128i dequant, const __m128i zero, tran_low_t *dqcoeff, @@ -95,7 +95,7 @@ static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff, // Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing // to zbin to add 1 to the index in 'scan'. 
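As background for the scan_for_eob()/accumulate_eob() pair that follows: a compare produces an all-ones (-1) mask for live coefficients, and subtracting that mask from the scan index is how the vector code adds 1 without a separate constant. A minimal scalar sketch of the same computation, with an illustrative helper name that is not part of this patch:

#include <stdint.h>

static int16_t scan_for_eob_scalar(const int16_t *qcoeff,
                                   const int16_t *iscan, int n) {
  int16_t eob = 0;
  for (int i = 0; i < n; ++i) {
    // Live lane: iscan[i] - (-1) == iscan[i] + 1; a dead lane contributes 0,
    // mirroring the _mm_andnot_si128/_mm_max_epi16 steps in the SIMD version.
    const int16_t candidate = qcoeff[i] ? (int16_t)(iscan[i] + 1) : 0;
    if (candidate > eob) eob = candidate;
  }
  return eob;
}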
-static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, +static inline __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, const __m128i zbin_mask0, const __m128i zbin_mask1, const int16_t *scan_ptr, const int index, @@ -113,7 +113,7 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, return _mm_max_epi16(eob0, eob1); } -static INLINE int16_t accumulate_eob(__m128i eob) { +static inline int16_t accumulate_eob(__m128i eob) { __m128i eob_shuffled; eob_shuffled = _mm_shuffle_epi32(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); @@ -124,14 +124,14 @@ static INLINE int16_t accumulate_eob(__m128i eob) { return _mm_extract_epi16(eob, 1); } -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { +static inline __m128i load_coefficients(const tran_low_t *coeff_ptr) { assert(sizeof(tran_low_t) == 4); const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr)); const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); return _mm_packs_epi32(coeff1, coeff2); } -static INLINE void store_coefficients(__m128i coeff_vals, +static inline void store_coefficients(__m128i coeff_vals, tran_low_t *coeff_ptr) { assert(sizeof(tran_low_t) == 4); @@ -144,7 +144,7 @@ static INLINE void store_coefficients(__m128i coeff_vals, _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); } -static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1, +static inline void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1, const int16_t *iscan_ptr, int *is_found, __m128i *mask) { __m128i all_zero; @@ -161,7 +161,7 @@ static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1, *mask = _mm_max_epi16(temp_mask, *mask); } -static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, +static inline void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, __m128i *threshold, const int16_t *iscan_ptr, int *is_found, __m128i *mask) { __m128i zero = _mm_setzero_si128(); @@ -187,7 +187,7 @@ static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask); } -static INLINE int calculate_non_zero_count(__m128i mask) { +static inline int calculate_non_zero_count(__m128i mask) { __m128i mask0, mask1; int non_zero_count = 0; mask0 = _mm_unpackhi_epi64(mask, mask); diff --git a/aom_dsp/x86/sad_avx2.c b/aom_dsp/x86/sad_avx2.c index 9a6c24a6e3..f19ff05774 100644 --- a/aom_dsp/x86/sad_avx2.c +++ b/aom_dsp/x86/sad_avx2.c @@ -14,7 +14,7 @@ #include "aom_ports/mem.h" -static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, +static inline unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { int i; @@ -42,7 +42,7 @@ static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, return res; } -static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, +static inline unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { int i; diff --git a/aom_dsp/x86/sse_avx2.c b/aom_dsp/x86/sse_avx2.c index 6bcca06990..35d3445ae9 100644 --- a/aom_dsp/x86/sse_avx2.c +++ b/aom_dsp/x86/sse_avx2.c @@ -19,7 +19,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" -static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, +static inline void sse_w32_avx2(__m256i *sum, const uint8_t *a, const uint8_t *b) { const __m256i v_a0 = yy_loadu_256(a); const __m256i v_b0 = yy_loadu_256(b); @@ -34,7 +34,7 @@ 
static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); } -static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { +static inline int64_t summary_all_avx2(const __m256i *sum_all) { int64_t sum; __m256i zero = _mm256_setzero_si256(); const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); @@ -48,7 +48,7 @@ static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { +static inline void summary_32_avx2(const __m256i *sum32, __m256i *sum) { const __m256i sum0_4x64 = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); const __m256i sum1_4x64 = @@ -57,7 +57,7 @@ static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { *sum = _mm256_add_epi64(*sum, sum_4x64); } -static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { +static inline int64_t summary_4x64_avx2(const __m256i sum_4x64) { int64_t sum; const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), _mm256_extracti128_si256(sum_4x64, 1)); @@ -68,7 +68,7 @@ static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { } #endif -static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, +static inline void sse_w4x4_avx2(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, __m256i *sum) { const __m128i v_a0 = xx_loadl_32(a); const __m128i v_a1 = xx_loadl_32(a + a_stride); @@ -88,7 +88,7 @@ static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } -static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, +static inline void sse_w8x2_avx2(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, __m256i *sum) { const __m128i v_a0 = xx_loadl_64(a); const __m128i v_a1 = xx_loadl_64(a + a_stride); @@ -218,7 +218,7 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, +static inline void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, const uint16_t *b) { const __m256i v_a_w = yy_loadu_256(a); const __m256i v_b_w = yy_loadu_256(b); @@ -226,7 +226,7 @@ static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } -static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, +static inline void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, int a_stride, const uint16_t *b, int b_stride) { const __m128i v_a0 = xx_loadl_64(a); @@ -245,7 +245,7 @@ static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } -static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, +static inline void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, int a_stride, const uint16_t *b, int b_stride) { const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); diff --git a/aom_dsp/x86/sse_sse4.c b/aom_dsp/x86/sse_sse4.c index e875d56f8f..219441ae09 100644 --- a/aom_dsp/x86/sse_sse4.c +++ b/aom_dsp/x86/sse_sse4.c @@ -19,7 +19,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" -static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { +static inline int64_t summary_all_sse4(const __m128i *sum_all) { int64_t sum; const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); const __m128i sum1 = 
_mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); @@ -30,7 +30,7 @@ static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { +static inline void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); *sum64 = _mm_add_epi64(sum0, *sum64); @@ -38,7 +38,7 @@ static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { } #endif -static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, +static inline void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, const uint8_t *b) { const __m128i v_a0 = xx_loadu_128(a); const __m128i v_b0 = xx_loadu_128(b); @@ -52,7 +52,7 @@ static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); } -static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, +static inline void sse4x2_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, __m128i *sum) { const __m128i v_a0 = xx_loadl_32(a); const __m128i v_a1 = xx_loadl_32(a + a_stride); @@ -64,7 +64,7 @@ static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); } -static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, +static inline void sse8_sse4_1(const uint8_t *a, const uint8_t *b, __m128i *sum) { const __m128i v_a0 = xx_loadl_64(a); const __m128i v_b0 = xx_loadl_64(b); @@ -179,7 +179,7 @@ int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, +static inline void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, int a_stride, const uint16_t *b, int b_stride) { const __m128i v_a0 = xx_loadl_64(a); @@ -192,7 +192,7 @@ static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); } -static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, +static inline void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, const uint16_t *b) { const __m128i v_a_w = xx_loadu_128(a); const __m128i v_b_w = xx_loadu_128(b); diff --git a/aom_dsp/x86/subtract_avx2.c b/aom_dsp/x86/subtract_avx2.c index 4684206cd4..fec954f65e 100644 --- a/aom_dsp/x86/subtract_avx2.c +++ b/aom_dsp/x86/subtract_avx2.c @@ -12,7 +12,7 @@ #include "config/aom_dsp_rtcd.h" -static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, +static inline void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, const uint8_t *pred_ptr) { __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr)); __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr)); @@ -27,7 +27,7 @@ static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, _mm256_permute2x128_si256(diff0, diff1, 0x31)); } -static INLINE void subtract_block_16xn_avx2( +static inline void subtract_block_16xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -43,7 +43,7 @@ static INLINE void subtract_block_16xn_avx2( } } -static INLINE void subtract_block_32xn_avx2( +static inline void subtract_block_32xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const 
uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -54,7 +54,7 @@ static INLINE void subtract_block_32xn_avx2( } } -static INLINE void subtract_block_64xn_avx2( +static inline void subtract_block_64xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -66,7 +66,7 @@ static INLINE void subtract_block_64xn_avx2( } } -static INLINE void subtract_block_128xn_avx2( +static inline void subtract_block_128xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c index fe3a435cf4..1f8ef656ed 100644 --- a/aom_dsp/x86/sum_squares_sse2.c +++ b/aom_dsp/x86/sum_squares_sse2.c @@ -17,12 +17,12 @@ #include "aom_dsp/x86/sum_squares_sse2.h" #include "config/aom_dsp_rtcd.h" -static INLINE __m128i xx_loadh_64(__m128i a, const void *b) { +static inline __m128i xx_loadh_64(__m128i a, const void *b) { const __m128d ad = _mm_castsi128_pd(a); return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b)); } -static INLINE uint64_t xx_cvtsi128_si64(__m128i a) { +static inline uint64_t xx_cvtsi128_si64(__m128i a) { #if AOM_ARCH_X86_64 return (uint64_t)_mm_cvtsi128_si64(a); #else @@ -34,7 +34,7 @@ static INLINE uint64_t xx_cvtsi128_si64(__m128i a) { #endif } -static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { +static inline __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride); const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride); const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride); diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h index ddaa4fea5f..0f829821a9 100644 --- a/aom_dsp/x86/synonyms.h +++ b/aom_dsp/x86/synonyms.h @@ -28,46 +28,46 @@ // Loads and stores to do away with the tedium of casting the address // to the right type. 
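The xx_load*/xx_store* wrappers that follow exist so callers are not tempted to cast byte pointers to wider types: a direct *(const int *)a dereference can be misaligned and violates strict aliasing, while a memcpy into a local is well defined and still compiles down to a single load. A standalone sketch of the idiom, under an illustrative name, with the same shape as xx_loadl_32 below:

#include <string.h>
#include <emmintrin.h>

static __m128i load_32_to_xmm(const void *a) {
  int val;
  memcpy(&val, a, sizeof(val));   // aliasing- and alignment-safe 4-byte read
  return _mm_cvtsi32_si128(val);  // move the scalar into the low xmm lane
}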
-static INLINE __m128i xx_loadl_32(const void *a) { +static inline __m128i xx_loadl_32(const void *a) { int val; memcpy(&val, a, sizeof(val)); return _mm_cvtsi32_si128(val); } -static INLINE __m128i xx_loadl_64(const void *a) { +static inline __m128i xx_loadl_64(const void *a) { return _mm_loadl_epi64((const __m128i *)a); } -static INLINE __m128i xx_load_128(const void *a) { +static inline __m128i xx_load_128(const void *a) { return _mm_load_si128((const __m128i *)a); } -static INLINE __m128i xx_loadu_128(const void *a) { +static inline __m128i xx_loadu_128(const void *a) { return _mm_loadu_si128((const __m128i *)a); } // Load 64 bits from each of hi and low, and pack into an SSE register // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate // the strict aliasing rule, this takes a different approach -static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { +static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) { return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo), _mm_loadl_epi64((const __m128i *)hi)); } -static INLINE void xx_storel_32(void *const a, const __m128i v) { +static inline void xx_storel_32(void *const a, const __m128i v) { const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); } -static INLINE void xx_storel_64(void *const a, const __m128i v) { +static inline void xx_storel_64(void *const a, const __m128i v) { _mm_storel_epi64((__m128i *)a, v); } -static INLINE void xx_store_128(void *const a, const __m128i v) { +static inline void xx_store_128(void *const a, const __m128i v) { _mm_store_si128((__m128i *)a, v); } -static INLINE void xx_storeu_128(void *const a, const __m128i v) { +static inline void xx_storeu_128(void *const a, const __m128i v) { _mm_storeu_si128((__m128i *)a, v); } @@ -77,39 +77,39 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) { // // This is useful for rearranging filter kernels for use with the _mm_madd_epi16 // instruction -static INLINE __m128i xx_set2_epi16(int16_t a, int16_t b) { +static inline __m128i xx_set2_epi16(int16_t a, int16_t b) { return _mm_setr_epi16(a, b, a, b, a, b, a, b); } -static INLINE __m128i xx_round_epu16(__m128i v_val_w) { +static inline __m128i xx_round_epu16(__m128i v_val_w) { return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); } -static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { +static inline __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1); return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); } -static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { +static inline __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); return _mm_srli_epi32(v_tmp_d, bits); } -static INLINE __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) { +static inline __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d); return _mm_srai_epi16(v_tmp_d, bits); } // This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits) -static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { +static inline __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); return _mm_srai_epi32(v_tmp_d, bits); } -static 
INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { +static inline __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); const __m128i v_tmp_d = diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h index 53f5028acc..20e6a4b23a 100644 --- a/aom_dsp/x86/synonyms_avx2.h +++ b/aom_dsp/x86/synonyms_avx2.h @@ -27,19 +27,19 @@ // Loads and stores to do away with the tedium of casting the address // to the right type. -static INLINE __m256i yy_load_256(const void *a) { +static inline __m256i yy_load_256(const void *a) { return _mm256_load_si256((const __m256i *)a); } -static INLINE __m256i yy_loadu_256(const void *a) { +static inline __m256i yy_loadu_256(const void *a) { return _mm256_loadu_si256((const __m256i *)a); } -static INLINE void yy_store_256(void *const a, const __m256i v) { +static inline void yy_store_256(void *const a, const __m256i v) { _mm256_store_si256((__m256i *)a, v); } -static INLINE void yy_storeu_256(void *const a, const __m256i v) { +static inline void yy_storeu_256(void *const a, const __m256i v) { _mm256_storeu_si256((__m256i *)a, v); } @@ -49,21 +49,21 @@ static INLINE void yy_storeu_256(void *const a, const __m256i v) { // // This is useful for rearranging filter kernels for use with the _mm_madd_epi16 // instruction -static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) { +static inline __m256i yy_set2_epi16(int16_t a, int16_t b) { return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b); } // Some compilers don't have _mm256_set_m128i defined in immintrin.h. We // therefore define an equivalent function using a different intrinsic. // ([ hi ], [ lo ]) -> [ hi ][ lo ] -static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { +static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) { return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } // This behaves similarly to _mm256_set_epi64x(), but avoids undefined // sanitizer warnings when loading values from unaligned buffers using // `*(int64_t *)val`. 
-static INLINE __m256i yy_loadu_4x64(const void *e3, const void *e2, +static inline __m256i yy_loadu_4x64(const void *e3, const void *e2, const void *e1, const void *e0) { __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0)); __m128d v01 = _mm_loadh_pd(v0, (const double *)e1); @@ -76,18 +76,18 @@ static INLINE __m256i yy_loadu_4x64(const void *e3, const void *e2, return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01)); } -static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { +static inline __m256i yy_loadu2_128(const void *hi, const void *lo) { __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); return yy_set_m128i(mhi, mlo); } -static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) { +static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) { _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a)); } -static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { +static inline __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); } diff --git a/aom_dsp/x86/transpose_sse2.h b/aom_dsp/x86/transpose_sse2.h index dbf476f4cc..31cc37e1ed 100644 --- a/aom_dsp/x86/transpose_sse2.h +++ b/aom_dsp/x86/transpose_sse2.h @@ -16,7 +16,7 @@ #include "config/aom_config.h" -static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { +static inline __m128i transpose_8bit_4x4(const __m128i *const in) { // Unpack 8 bit elements. Goes from: // in[0]: 00 01 02 03 // in[1]: 10 11 12 13 @@ -33,7 +33,7 @@ static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { return _mm_unpacklo_epi16(a0, a1); } -static INLINE void transpose_8bit_8x8(const __m128i *const in, +static inline void transpose_8bit_8x8(const __m128i *const in, __m128i *const out) { // Unpack 8 bit elements. Goes from: // in[0]: 00 01 02 03 04 05 06 07 @@ -93,7 +93,7 @@ static INLINE void transpose_8bit_8x8(const __m128i *const in, out[7] = _mm_unpackhi_epi64(c3, c3); } -static INLINE void transpose_16bit_4x4(const __m128i *const in, +static inline void transpose_16bit_4x4(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. Goes from: // in[0]: 00 01 02 03 XX XX XX XX @@ -121,7 +121,7 @@ static INLINE void transpose_16bit_4x4(const __m128i *const in, out[3] = _mm_srli_si128(out[2], 8); } -static INLINE void transpose_16bit_4x8(const __m128i *const in, +static inline void transpose_16bit_4x8(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. Goes from: // in[0]: 00 01 02 03 XX XX XX XX @@ -163,7 +163,7 @@ static INLINE void transpose_16bit_4x8(const __m128i *const in, out[3] = _mm_unpackhi_epi64(b2, b3); } -static INLINE void transpose_16bit_8x4(const __m128i *const in, +static inline void transpose_16bit_8x4(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. Goes from: // in[0]: 00 01 02 03 04 05 06 07 @@ -211,7 +211,7 @@ static INLINE void transpose_16bit_8x4(const __m128i *const in, out[7] = _mm_unpackhi_epi64(b6, zeros); } -static INLINE void transpose_16bit_8x8(const __m128i *const in, +static inline void transpose_16bit_8x8(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. 
Goes from: // in[0]: 00 01 02 03 04 05 06 07 @@ -278,7 +278,7 @@ static INLINE void transpose_16bit_8x8(const __m128i *const in, } // Transpose in-place -static INLINE void transpose_16bit_16x16(__m128i *const left, +static inline void transpose_16bit_16x16(__m128i *const left, __m128i *const right) { __m128i tbuf[8]; transpose_16bit_8x8(left, left); @@ -296,7 +296,7 @@ static INLINE void transpose_16bit_16x16(__m128i *const left, left[15] = tbuf[7]; } -static INLINE void transpose_32bit_4x4(const __m128i *const in, +static inline void transpose_32bit_4x4(const __m128i *const in, __m128i *const out) { // Unpack 32 bit elements. Goes from: // in[0]: 00 01 02 03 @@ -325,7 +325,7 @@ static INLINE void transpose_32bit_4x4(const __m128i *const in, out[3] = _mm_unpackhi_epi64(a2, a3); } -static INLINE void transpose_32bit_4x4x2(const __m128i *const in, +static inline void transpose_32bit_4x4x2(const __m128i *const in, __m128i *const out) { // Unpack 32 bit elements. Goes from: // in[0]: 00 01 02 03 @@ -373,7 +373,7 @@ static INLINE void transpose_32bit_4x4x2(const __m128i *const in, out[7] = _mm_unpackhi_epi64(a6, a7); } -static INLINE void transpose_32bit_8x4(const __m128i *const in, +static inline void transpose_32bit_8x4(const __m128i *const in, __m128i *const out) { // Unpack 32 bit elements. Goes from: // in[0]: 00 01 02 03 diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h index d4c1bc54ec..5ad2dd2299 100644 --- a/aom_dsp/x86/txfm_common_avx2.h +++ b/aom_dsp/x86/txfm_common_avx2.h @@ -20,12 +20,12 @@ extern "C" { #endif -static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { +static inline __m256i pair_set_w16_epi16(int16_t a, int16_t b) { return _mm256_set1_epi32( (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))); } -static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1, +static inline void btf_16_w16_avx2(const __m256i w0, const __m256i w1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); @@ -49,21 +49,21 @@ static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1, *in1 = _mm256_packs_epi32(d0, d1); } -static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) { +static inline void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) { const __m256i _in0 = *in0; const __m256i _in1 = *in1; *in0 = _mm256_adds_epi16(_in0, _in1); *in1 = _mm256_subs_epi16(_in0, _in1); } -static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) { +static inline void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) { const __m256i _in0 = *in0; const __m256i _in1 = *in1; *in0 = _mm256_add_epi32(_in0, _in1); *in1 = _mm256_sub_epi32(_in0, _in1); } -static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, +static inline void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, __m256i in0, __m256i in1) { const __m256i _in0 = in0; const __m256i _in1 = in1; @@ -71,7 +71,7 @@ static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, *out1 = _mm256_subs_epi16(_in0, _in1); } -static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, +static inline void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, __m256i in0, __m256i in1) { const __m256i _in0 = in0; const __m256i _in1 = in1; @@ -79,11 +79,11 @@ static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, *out1 = _mm256_sub_epi32(_in0, _in1); } -static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) { +static inline __m256i 
load_16bit_to_16bit_avx2(const int16_t *a) { return _mm256_load_si256((const __m256i *)a); } -static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, +static inline void load_buffer_16bit_to_16bit_avx2(const int16_t *in, int stride, __m256i *out, int out_size) { for (int i = 0; i < out_size; ++i) { @@ -91,7 +91,7 @@ static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, } } -static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, +static inline void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, int stride, __m256i *out, int out_size) { @@ -100,13 +100,13 @@ static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, } } -static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { +static inline __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); return _mm256_permute4x64_epi64(b, 0xD8); } -static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, +static inline void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, int stride, __m256i *out, int out_size) { for (int i = 0; i < out_size; ++i) { @@ -114,7 +114,7 @@ static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, } } -static INLINE void transpose2_8x8_avx2(const __m256i *const in, +static inline void transpose2_8x8_avx2(const __m256i *const in, __m256i *const out) { __m256i t[16], u[16]; // (1st, 2nd) ==> (lo, hi) @@ -154,7 +154,7 @@ static INLINE void transpose2_8x8_avx2(const __m256i *const in, } } -static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, +static inline void transpose_16bit_16x16_avx2(const __m256i *const in, __m256i *const out) { __m256i t[16]; @@ -195,7 +195,7 @@ static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, transpose2_8x8_avx2(&t[8], &out[8]); } -static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in, +static inline void transpose_16bit_16x8_avx2(const __m256i *const in, __m256i *const out) { const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]); const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]); @@ -225,13 +225,13 @@ static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in, out[7] = _mm256_unpackhi_epi64(b6, b7); } -static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { +static inline void flip_buf_avx2(__m256i *in, __m256i *out, int size) { for (int i = 0; i < size; ++i) { out[size - i - 1] = in[i]; } } -static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { +static inline void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { if (bit < 0) { bit = -bit; __m256i round = _mm256_set1_epi16(1 << (bit - 1)); @@ -246,14 +246,14 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { } } -static INLINE __m256i round_shift_32_avx2(__m256i vec, int bit) { +static inline __m256i round_shift_32_avx2(__m256i vec, int bit) { __m256i tmp, round; round = _mm256_set1_epi32(1 << (bit - 1)); tmp = _mm256_add_epi32(vec, round); return _mm256_srai_epi32(tmp, bit); } -static INLINE void round_shift_array_32_avx2(__m256i *input, __m256i *output, +static inline void round_shift_array_32_avx2(__m256i *input, __m256i *output, const int size, const int bit) { if (bit > 0) { int i; @@ -268,7 +268,7 @@ static INLINE void round_shift_array_32_avx2(__m256i *input, __m256i *output, } } -static INLINE void 
round_shift_rect_array_32_avx2(__m256i *input, +static inline void round_shift_rect_array_32_avx2(__m256i *input, __m256i *output, const int size, const int bit, const int val) { @@ -290,14 +290,14 @@ static INLINE void round_shift_rect_array_32_avx2(__m256i *input, } } -static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { +static inline __m256i scale_round_avx2(const __m256i a, const int scale) { const __m256i scale_rounding = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); const __m256i b = _mm256_madd_epi16(a, scale_rounding); return _mm256_srai_epi32(b, NewSqrt2Bits); } -static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, +static inline void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, int32_t *const b) { const __m256i one = _mm256_set1_epi16(1); const __m256i a_lo = _mm256_unpacklo_epi16(a, one); @@ -310,7 +310,7 @@ static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, _mm256_store_si256((__m256i *)(b + 64), temp); } -static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2( +static inline void store_rect_buffer_16bit_to_32bit_w8_avx2( const __m256i *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { @@ -318,7 +318,7 @@ static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2( } } -static INLINE void pack_reg(const __m128i *in1, const __m128i *in2, +static inline void pack_reg(const __m128i *in1, const __m128i *in2, __m256i *out) { out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1); out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1); @@ -330,7 +330,7 @@ static INLINE void pack_reg(const __m128i *in1, const __m128i *in2, out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1); } -static INLINE void extract_reg(const __m256i *in, __m128i *out1) { +static inline void extract_reg(const __m256i *in, __m128i *out1) { out1[0] = _mm256_castsi256_si128(in[0]); out1[1] = _mm256_castsi256_si128(in[1]); out1[2] = _mm256_castsi256_si128(in[2]); diff --git a/aom_dsp/x86/txfm_common_sse2.h b/aom_dsp/x86/txfm_common_sse2.h index 96276f470b..b8bcf8731e 100644 --- a/aom_dsp/x86/txfm_common_sse2.h +++ b/aom_dsp/x86/txfm_common_sse2.h @@ -20,7 +20,7 @@ _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))) // Reverse the 8 16 bit words in __m128i -static INLINE __m128i mm_reverse_epi16(const __m128i x) { +static inline __m128i mm_reverse_epi16(const __m128i x) { const __m128i a = _mm_shufflelo_epi16(x, 0x1b); const __m128i b = _mm_shufflehi_epi16(a, 0x1b); return _mm_shuffle_epi32(b, 0x4e); diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c index b470135d99..59d170c7bd 100644 --- a/aom_dsp/x86/variance_avx2.c +++ b/aom_dsp/x86/variance_avx2.c @@ -16,17 +16,17 @@ #include "aom_dsp/x86/masked_variance_intrin_ssse3.h" #include "aom_dsp/x86/synonyms.h" -static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) { +static inline __m128i mm256_add_hi_lo_epi16(const __m256i val) { return _mm_add_epi16(_mm256_castsi256_si128(val), _mm256_extractf128_si256(val, 1)); } -static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) { +static inline __m128i mm256_add_hi_lo_epi32(const __m256i val) { return _mm_add_epi32(_mm256_castsi256_si128(val), _mm256_extractf128_si256(val, 1)); } -static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, +static inline void variance_kernel_avx2(const __m256i src, const __m256i ref, __m256i *const sse, __m256i 
*const sum) { const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1) @@ -46,7 +46,7 @@ static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); } -static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, +static inline int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, unsigned int *const sse) { // extract the low lane and add it to the high lane const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse); @@ -63,7 +63,7 @@ static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, } // handle pixels (<= 512) -static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum, +static inline int variance_final_512_avx2(__m256i vsse, __m256i vsum, unsigned int *const sse) { // extract the low lane and add it to the high lane const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); @@ -73,7 +73,7 @@ static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum, } // handle 1024 pixels (32x32, 16x64, 64x16) -static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, +static inline int variance_final_1024_avx2(__m256i vsse, __m256i vsum, unsigned int *const sse) { // extract the low lane and add it to the high lane const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); @@ -83,7 +83,7 @@ static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); } -static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { +static inline __m256i sum_to_32bit_avx2(const __m256i sum) { const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); const __m256i sum_hi = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); @@ -91,14 +91,14 @@ static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { } // handle 2048 pixels (32x64, 64x32) -static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum, +static inline int variance_final_2048_avx2(__m256i vsse, __m256i vsum, unsigned int *const sse) { vsum = sum_to_32bit_avx2(vsum); const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); } -static INLINE void variance16_kernel_avx2( +static inline void variance16_kernel_avx2( const uint8_t *const src, const int src_stride, const uint8_t *const ref, const int ref_stride, __m256i *const sse, __m256i *const sum) { const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); @@ -110,7 +110,7 @@ static INLINE void variance16_kernel_avx2( variance_kernel_avx2(s, r, sse, sum); } -static INLINE void variance32_kernel_avx2(const uint8_t *const src, +static inline void variance32_kernel_avx2(const uint8_t *const src, const uint8_t *const ref, __m256i *const sse, __m256i *const sum) { @@ -119,7 +119,7 @@ static INLINE void variance32_kernel_avx2(const uint8_t *const src, variance_kernel_avx2(s, r, sse, sum); } -static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, +static inline void variance16_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, __m256i *const vsum) { @@ -132,7 +132,7 @@ static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, } } -static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, +static inline void variance32_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const 
vsse, __m256i *const vsum) { @@ -145,7 +145,7 @@ static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, } } -static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, +static inline void variance64_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, __m256i *const vsum) { @@ -159,7 +159,7 @@ static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, } } -static INLINE void variance128_avx2(const uint8_t *src, const int src_stride, +static inline void variance128_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, __m256i *const vsum) { @@ -234,21 +234,21 @@ unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, return *sse; } -static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { +static inline __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { const __m256i d = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { +static inline __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { const __m256i d = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } #endif // CONFIG_AV1_HIGHBITDEPTH -static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, +static inline void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a, uint8_t *comp_pred) { const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); @@ -411,7 +411,7 @@ void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, +static inline __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a) { const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); @@ -807,7 +807,7 @@ uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, } } -static INLINE void calc_sum_sse_wd32_avx2(const uint8_t *src, +static inline void calc_sum_sse_wd32_avx2(const uint8_t *src, const uint8_t *ref, __m256i set_one_minusone, __m256i sse_8x16[2], @@ -827,7 +827,7 @@ static INLINE void calc_sum_sse_wd32_avx2(const uint8_t *src, sum_8x16[1] = _mm256_add_epi16(sum_8x16[1], diff1); } -static INLINE __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16, +static inline __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16, unsigned int *tot_sse, int *tot_sum) { // s00 s01 s10 s11 s20 s21 s30 s31 const __m256i sse_results = _mm256_hadd_epi32(sse_hx16[0], sse_hx16[1]); @@ -862,7 +862,7 @@ static INLINE __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16, return sum_sse_order_add; } -static INLINE void get_var_sse_sum_8x8_quad_avx2( +static inline void get_var_sse_sum_8x8_quad_avx2( const uint8_t *src, int src_stride, const uint8_t *ref, const int ref_stride, const int h, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) { @@ -901,7 +901,7 @@ static INLINE void get_var_sse_sum_8x8_quad_avx2( _mm_storeu_si128((__m128i *)var8x8, variance_8x8); } -static INLINE void get_var_sse_sum_16x16_dual_avx2( +static inline void 
get_var_sse_sum_16x16_dual_avx2( const uint8_t *src, int src_stride, const uint8_t *ref, const int ref_stride, const int h, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) { diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c index 25f240abca..9d6a238a2f 100644 --- a/aom_dsp/x86/variance_sse2.c +++ b/aom_dsp/x86/variance_sse2.c @@ -35,38 +35,38 @@ unsigned int aom_get_mb_ss_sse2(const int16_t *src) { return (unsigned int)_mm_cvtsi128_si32(vsum); } -static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { +static inline __m128i load4x2_sse2(const uint8_t *const p, const int stride) { const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride)); const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride)); return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128()); } -static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) { +static inline __m128i load8_8to16_sse2(const uint8_t *const p) { const __m128i p0 = _mm_loadl_epi64((const __m128i *)p); return _mm_unpacklo_epi8(p0, _mm_setzero_si128()); } -static INLINE void load16_8to16_sse2(const uint8_t *const p, __m128i *out) { +static inline void load16_8to16_sse2(const uint8_t *const p, __m128i *out) { const __m128i p0 = _mm_loadu_si128((const __m128i *)p); out[0] = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); // lower 8 values out[1] = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); // upper 8 values } // Accumulate 4 32bit numbers in val to 1 32bit number -static INLINE unsigned int add32x4_sse2(__m128i val) { +static inline unsigned int add32x4_sse2(__m128i val) { val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); return (unsigned int)_mm_cvtsi128_si32(val); } // Accumulate 8 16bit in sum to 4 32bit number -static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { +static inline __m128i sum_to_32bit_sse2(const __m128i sum) { const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); return _mm_add_epi32(sum_lo, sum_hi); } -static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref, +static inline void variance_kernel_sse2(const __m128i src, const __m128i ref, __m128i *const sse, __m128i *const sum) { const __m128i diff = _mm_sub_epi16(src, ref); @@ -77,7 +77,7 @@ static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref, // Can handle 128 pixels' diff sum (such as 8x16 or 16x8) // Slightly faster than variance_final_256_pel_sse2() // diff sum of 128 pixels can still fit in 16bit integer -static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, +static inline void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, unsigned int *const sse, int *const sum) { *sse = add32x4_sse2(vsse); @@ -89,7 +89,7 @@ static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, } // Can handle 256 pixels' diff sum (such as 16x16) -static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, +static inline void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, unsigned int *const sse, int *const sum) { *sse = add32x4_sse2(vsse); @@ -101,7 +101,7 @@ static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, } // Can handle 512 pixels' diff sum (such as 16x32 or 32x16) -static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, +static inline void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, 
unsigned int *const sse, int *const sum) { *sse = add32x4_sse2(vsse); @@ -113,7 +113,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, } // Can handle 1024 pixels' diff sum (such as 32x32) -static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, +static inline void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, unsigned int *const sse, int *const sum) { *sse = add32x4_sse2(vsse); @@ -122,7 +122,7 @@ static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, *sum = (int)add32x4_sse2(vsum); } -static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, +static inline void variance4_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { @@ -139,7 +139,7 @@ static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, } } -static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, +static inline void variance8_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { @@ -156,7 +156,7 @@ static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, } } -static INLINE void variance16_kernel_sse2(const uint8_t *const src, +static inline void variance16_kernel_sse2(const uint8_t *const src, const uint8_t *const ref, __m128i *const sse, __m128i *const sum) { @@ -172,7 +172,7 @@ static INLINE void variance16_kernel_sse2(const uint8_t *const src, variance_kernel_sse2(src1, ref1, sse, sum); } -static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, +static inline void variance16_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { @@ -186,7 +186,7 @@ static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, } } -static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, +static inline void variance32_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { @@ -202,7 +202,7 @@ static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, } } -static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, +static inline void variance64_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { @@ -219,7 +219,7 @@ static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, } } -static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, +static inline void variance128_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { @@ -404,7 +404,7 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, +static inline __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, const __m128i s1, const __m128i a) { const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); diff --git a/aom_mem/aom_mem.h b/aom_mem/aom_mem.h index cab6afe894..c061ef71b4 100644 --- a/aom_mem/aom_mem.h +++ b/aom_mem/aom_mem.h @@ -37,7 +37,7 @@ void *aom_malloc(size_t size); void *aom_calloc(size_t num, size_t size); void aom_free(void 
*memblk); -static INLINE void *aom_memset16(void *dest, int val, size_t length) { +static inline void *aom_memset16(void *dest, int val, size_t length) { size_t i; uint16_t *dest16 = (uint16_t *)dest; for (i = 0; i < length; i++) *dest16++ = val; diff --git a/aom_ports/aarch64_cpudetect.c b/aom_ports/aarch64_cpudetect.c index 47b4135fb3..cfab17b77c 100644 --- a/aom_ports/aarch64_cpudetect.c +++ b/aom_ports/aarch64_cpudetect.c @@ -36,7 +36,7 @@ static int arm_get_cpu_caps(void) { // sysctlbyname() parameter documentation for instruction set characteristics: // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics -static INLINE bool have_feature(const char *feature) { +static inline bool have_feature(const char *feature) { int64_t feature_present = 0; size_t size = sizeof(feature_present); if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { diff --git a/aom_ports/aom_timer.h b/aom_ports/aom_timer.h index a521af2038..6b4c9850af 100644 --- a/aom_ports/aom_timer.h +++ b/aom_ports/aom_timer.h @@ -56,7 +56,7 @@ struct aom_usec_timer { #endif }; -static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) { +static inline void aom_usec_timer_start(struct aom_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->begin); #else @@ -64,7 +64,7 @@ static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) { #endif } -static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) { +static inline void aom_usec_timer_mark(struct aom_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->end); #else @@ -72,7 +72,7 @@ static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) { #endif } -static INLINE int64_t aom_usec_timer_elapsed(struct aom_usec_timer *t) { +static inline int64_t aom_usec_timer_elapsed(struct aom_usec_timer *t) { #if defined(_WIN32) LARGE_INTEGER freq, diff; @@ -99,11 +99,11 @@ struct aom_usec_timer { void *dummy; }; -static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) { (void)t; } +static inline void aom_usec_timer_start(struct aom_usec_timer *t) { (void)t; } -static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) { (void)t; } +static inline void aom_usec_timer_mark(struct aom_usec_timer *t) { (void)t; } -static INLINE int aom_usec_timer_elapsed(struct aom_usec_timer *t) { +static inline int aom_usec_timer_elapsed(struct aom_usec_timer *t) { (void)t; return 0; } diff --git a/aom_ports/bitops.h b/aom_ports/bitops.h index 9a5d6684f6..cac529eb27 100644 --- a/aom_ports/bitops.h +++ b/aom_ports/bitops.h @@ -39,21 +39,21 @@ extern "C" { // use GNU builtins where available. 
#if defined(__GNUC__) && \ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) -static INLINE int get_msb(unsigned int n) { +static inline int get_msb(unsigned int n) { assert(n != 0); return 31 ^ __builtin_clz(n); } #elif defined(USE_MSC_INTRINSICS) #pragma intrinsic(_BitScanReverse) -static INLINE int get_msb(unsigned int n) { +static inline int get_msb(unsigned int n) { unsigned long first_set_bit; assert(n != 0); _BitScanReverse(&first_set_bit, n); return first_set_bit; } #else -static INLINE int get_msb(unsigned int n) { +static inline int get_msb(unsigned int n) { int log = 0; unsigned int value = n; @@ -72,13 +72,13 @@ static INLINE int get_msb(unsigned int n) { #if defined(__GNUC__) && \ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) -static INLINE int aom_clzll(uint64_t n) { return __builtin_clzll(n); } +static inline int aom_clzll(uint64_t n) { return __builtin_clzll(n); } #elif defined(USE_MSC_INTRINSICS) #if defined(_M_X64) || defined(_M_ARM64) #pragma intrinsic(_BitScanReverse64) #endif -static INLINE int aom_clzll(uint64_t n) { +static inline int aom_clzll(uint64_t n) { assert(n != 0); unsigned long first_set_bit; // NOLINT(runtime/int) #if defined(_M_X64) || defined(_M_ARM64) @@ -101,7 +101,7 @@ static INLINE int aom_clzll(uint64_t n) { } #undef USE_MSC_INTRINSICS #else -static INLINE int aom_clzll(uint64_t n) { +static inline int aom_clzll(uint64_t n) { assert(n != 0); int res = 0; diff --git a/aom_ports/x86.h b/aom_ports/x86.h index 7a45b4e2c2..1d2acefc59 100644 --- a/aom_ports/x86.h +++ b/aom_ports/x86.h @@ -118,7 +118,7 @@ typedef enum { // NaCl has no support for xgetbv or the raw opcode. #if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__)) -static INLINE uint64_t xgetbv(void) { +static inline uint64_t xgetbv(void) { const uint32_t ecx = 0; uint32_t eax, edx; // Use the raw opcode for xgetbv for compatibility with older toolchains. @@ -132,7 +132,7 @@ static INLINE uint64_t xgetbv(void) { #include <immintrin.h> #define xgetbv() _xgetbv(0) #elif defined(_MSC_VER) && defined(_M_IX86) -static INLINE uint64_t xgetbv(void) { +static inline uint64_t xgetbv(void) { uint32_t eax_, edx_; __asm { xor ecx, ecx // ecx = 0 @@ -171,7 +171,7 @@ static INLINE uint64_t xgetbv(void) { #define BIT(n) (1u << (n)) #endif -static INLINE int x86_simd_caps(void) { +static inline int x86_simd_caps(void) { unsigned int flags = 0; unsigned int mask = ~0u; unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; @@ -248,7 +248,7 @@ static INLINE int x86_simd_caps(void) { // If you are timing a large function (CPU time > a couple of seconds), use // x86_readtsc64 to read the timestamp counter in a 64-bit integer. The // out-of-order leakage that can occur is minimal compared to total runtime. -static INLINE unsigned int x86_readtsc(void) { +static inline unsigned int x86_readtsc(void) { #if defined(__GNUC__) && __GNUC__ unsigned int tsc; __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :); @@ -266,7 +266,7 @@ static INLINE unsigned int x86_readtsc(void) { #endif } // 64-bit CPU cycle counter -static INLINE uint64_t x86_readtsc64(void) { +static inline uint64_t x86_readtsc64(void) { #if defined(__GNUC__) && __GNUC__ uint32_t hi, lo; __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); @@ -285,7 +285,7 @@ static INLINE uint64_t x86_readtsc64(void) { } // 32-bit CPU cycle counter with a partial fence against out-of-order execution. 
-static INLINE unsigned int x86_readtscp(void) { +static inline unsigned int x86_readtscp(void) { #if defined(__GNUC__) && __GNUC__ unsigned int tscp; __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); @@ -306,7 +306,7 @@ static INLINE unsigned int x86_readtscp(void) { #endif } -static INLINE unsigned int x86_tsc_start(void) { +static inline unsigned int x86_tsc_start(void) { unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; // This call should not be removed. See function notes above. cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); @@ -318,7 +318,7 @@ static INLINE unsigned int x86_tsc_start(void) { return x86_readtsc(); } -static INLINE unsigned int x86_tsc_end(void) { +static inline unsigned int x86_tsc_end(void) { uint32_t v = x86_readtscp(); unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; // This call should not be removed. See function notes above. @@ -378,7 +378,7 @@ static unsigned short x87_get_control_word(void) { } #endif -static INLINE unsigned int x87_set_double_precision(void) { +static inline unsigned int x87_set_double_precision(void) { unsigned int mode = x87_get_control_word(); // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1 // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf diff --git a/aom_util/aom_pthread.h b/aom_util/aom_pthread.h index 2021f13f5c..89c0c96fc9 100644 --- a/aom_util/aom_pthread.h +++ b/aom_util/aom_pthread.h @@ -62,31 +62,31 @@ typedef CONDITION_VARIABLE pthread_cond_t; #endif #define THREAD_EXIT_SUCCESS 0 -static INLINE int pthread_attr_init(pthread_attr_t *attr) { +static inline int pthread_attr_init(pthread_attr_t *attr) { (void)attr; return 0; } -static INLINE int pthread_attr_destroy(pthread_attr_t *attr) { +static inline int pthread_attr_destroy(pthread_attr_t *attr) { (void)attr; return 0; } -static INLINE int pthread_attr_getstacksize(const pthread_attr_t *attr, +static inline int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize) { (void)attr; (void)stacksize; return EINVAL; } -static INLINE int pthread_attr_setstacksize(pthread_attr_t *attr, +static inline int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize) { (void)attr; (void)stacksize; return EINVAL; } -static INLINE int pthread_create(pthread_t *const thread, +static inline int pthread_create(pthread_t *const thread, const pthread_attr_t *attr, unsigned int(__stdcall *start)(void *), void *arg) { @@ -107,7 +107,7 @@ static INLINE int pthread_create(pthread_t *const thread, return 0; } -static INLINE int pthread_join(pthread_t thread, void **value_ptr) { +static inline int pthread_join(pthread_t thread, void **value_ptr) { (void)value_ptr; return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) != WAIT_OBJECT_0 || @@ -115,56 +115,56 @@ static INLINE int pthread_join(pthread_t thread, void **value_ptr) { } // Mutex -static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, +static inline int pthread_mutex_init(pthread_mutex_t *const mutex, void *mutexattr) { (void)mutexattr; InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); return 0; } -static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { +static inline int pthread_mutex_trylock(pthread_mutex_t *const mutex) { return TryEnterCriticalSection(mutex) ? 
0 : EBUSY; } -static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { +static inline int pthread_mutex_lock(pthread_mutex_t *const mutex) { EnterCriticalSection(mutex); return 0; } -static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { +static inline int pthread_mutex_unlock(pthread_mutex_t *const mutex) { LeaveCriticalSection(mutex); return 0; } -static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { +static inline int pthread_mutex_destroy(pthread_mutex_t *const mutex) { DeleteCriticalSection(mutex); return 0; } // Condition -static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { +static inline int pthread_cond_destroy(pthread_cond_t *const condition) { (void)condition; return 0; } -static INLINE int pthread_cond_init(pthread_cond_t *const condition, +static inline int pthread_cond_init(pthread_cond_t *const condition, void *cond_attr) { (void)cond_attr; InitializeConditionVariable(condition); return 0; } -static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { +static inline int pthread_cond_signal(pthread_cond_t *const condition) { WakeConditionVariable(condition); return 0; } -static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { +static inline int pthread_cond_broadcast(pthread_cond_t *const condition) { WakeAllConditionVariable(condition); return 0; } -static INLINE int pthread_cond_wait(pthread_cond_t *const condition, +static inline int pthread_cond_wait(pthread_cond_t *const condition, pthread_mutex_t *const mutex) { int ok; ok = SleepConditionVariableCS(condition, mutex, INFINITE); diff --git a/aom_util/endian_inl.h b/aom_util/endian_inl.h index 17090cab01..b16f956fc3 100644 --- a/aom_util/endian_inl.h +++ b/aom_util/endian_inl.h @@ -64,7 +64,7 @@ #define HAVE_BUILTIN_BSWAP64 #endif -static INLINE uint16_t BSwap16(uint16_t x) { +static inline uint16_t BSwap16(uint16_t x) { #if defined(HAVE_BUILTIN_BSWAP16) return __builtin_bswap16(x); #elif defined(_MSC_VER) @@ -75,7 +75,7 @@ static INLINE uint16_t BSwap16(uint16_t x) { #endif // HAVE_BUILTIN_BSWAP16 } -static INLINE uint32_t BSwap32(uint32_t x) { +static inline uint32_t BSwap32(uint32_t x) { #if defined(HAVE_BUILTIN_BSWAP32) return __builtin_bswap32(x); #elif defined(__i386__) || defined(__x86_64__) @@ -89,7 +89,7 @@ static INLINE uint32_t BSwap32(uint32_t x) { #endif // HAVE_BUILTIN_BSWAP32 } -static INLINE uint64_t BSwap64(uint64_t x) { +static inline uint64_t BSwap64(uint64_t x) { #if defined(HAVE_BUILTIN_BSWAP64) return __builtin_bswap64(x); #elif defined(__x86_64__) diff --git a/apps/aomdec.c b/apps/aomdec.c index 144cacac7e..59fed36a5a 100644 --- a/apps/aomdec.c +++ b/apps/aomdec.c @@ -118,7 +118,7 @@ static const arg_def_t *all_args[] = { #if CONFIG_LIBYUV // Returns 0 on success and returns -1 on failure. 
-static INLINE int libyuv_scale(const aom_image_t *src, aom_image_t *dst, +static inline int libyuv_scale(const aom_image_t *src, aom_image_t *dst, FilterModeEnum mode) { if (src->fmt != dst->fmt) { fprintf(stderr, diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index ce6bbe9618..4471bfee47 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -561,7 +561,7 @@ struct aom_codec_alg_priv { bool monochrome_on_init; }; -static INLINE int gcd(int64_t a, int b) { +static inline int gcd(int64_t a, int b) { int remainder; while (b > 0) { remainder = (int)(a % b); @@ -3017,7 +3017,7 @@ static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi, return flags; } -static INLINE int get_src_border_in_pixels(AV1_COMP *cpi, BLOCK_SIZE sb_size) { +static inline int get_src_border_in_pixels(AV1_COMP *cpi, BLOCK_SIZE sb_size) { if (cpi->oxcf.mode != REALTIME || av1_is_resize_needed(&cpi->oxcf)) return cpi->oxcf.border_in_pixels; diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c index 77ec4c5da6..77769bd1d5 100644 --- a/av1/av1_dx_iface.c +++ b/av1/av1_dx_iface.c @@ -506,7 +506,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { return AOM_CODEC_OK; } -static INLINE void check_resync(aom_codec_alg_priv_t *const ctx, +static inline void check_resync(aom_codec_alg_priv_t *const ctx, const AV1Decoder *const pbi) { // Clear resync flag if worker got a key frame or intra only frame. if (ctx->need_resync == 1 && pbi->need_resync == 0 && diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c index e8a565d535..2a28c301f8 100644 --- a/av1/common/alloccommon.c +++ b/av1/common/alloccommon.c @@ -60,7 +60,7 @@ void av1_free_ref_frame_buffers(BufferPool *pool) { pool->num_frame_bufs = 0; } -static INLINE void free_cdef_linebuf_conditional( +static inline void free_cdef_linebuf_conditional( AV1_COMMON *const cm, const size_t *new_linebuf_size) { CdefInfo *cdef_info = &cm->cdef_info; for (int plane = 0; plane < MAX_MB_PLANE; plane++) { @@ -71,7 +71,7 @@ static INLINE void free_cdef_linebuf_conditional( } } -static INLINE void free_cdef_bufs_conditional(AV1_COMMON *const cm, +static inline void free_cdef_bufs_conditional(AV1_COMMON *const cm, uint16_t **colbuf, uint16_t **srcbuf, const size_t *new_colbuf_size, @@ -89,7 +89,7 @@ static INLINE void free_cdef_bufs_conditional(AV1_COMMON *const cm, } } -static INLINE void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) { +static inline void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) { aom_free(*srcbuf); *srcbuf = NULL; for (int plane = 0; plane < MAX_MB_PLANE; plane++) { @@ -98,7 +98,7 @@ static INLINE void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) { } } -static INLINE void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt, +static inline void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt, const int num_mi_rows) { if (*cdef_row_mt == NULL) return; #if CONFIG_MULTITHREAD @@ -145,7 +145,7 @@ void av1_free_cdef_buffers(AV1_COMMON *const cm, } } -static INLINE void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf, +static inline void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf, const int num_planes) { CdefInfo *cdef_info = &cm->cdef_info; for (int plane = 0; plane < num_planes; plane++) { @@ -155,7 +155,7 @@ static INLINE void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf, } } -static INLINE void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf, +static inline void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf, uint16_t **srcbuf, const int 
num_planes) { CdefInfo *cdef_info = &cm->cdef_info; if (*srcbuf == NULL) @@ -169,7 +169,7 @@ static INLINE void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf, } } -static INLINE void alloc_cdef_row_sync(AV1_COMMON *const cm, +static inline void alloc_cdef_row_sync(AV1_COMMON *const cm, AV1CdefRowSync **cdef_row_mt, const int num_mi_rows) { if (*cdef_row_mt != NULL) return; diff --git a/av1/common/arm/av1_convolve_horiz_rs_neon.c b/av1/common/arm/av1_convolve_horiz_rs_neon.c index 53353927a3..0ef5f9d342 100644 --- a/av1/common/arm/av1_convolve_horiz_rs_neon.c +++ b/av1/common/arm/av1_convolve_horiz_rs_neon.c @@ -20,7 +20,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/resize.h" -static INLINE uint8x8_t convolve8_4(const int16x4_t s0, const int16x4_t s1, +static inline uint8x8_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, @@ -40,7 +40,7 @@ static INLINE uint8x8_t convolve8_4(const int16x4_t s0, const int16x4_t s1, return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS); } -static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c index 2ba7e86874..6822ae79b6 100644 --- a/av1/common/arm/av1_convolve_scale_neon.c +++ b/av1/common/arm/av1_convolve_scale_neon.c @@ -24,7 +24,7 @@ #include "av1/common/convolve.h" #include "av1/common/filter.h" -static INLINE int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, @@ -46,7 +46,7 @@ static INLINE int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1, return vshrn_n_s32(sum, ROUND0_BITS); } -static INLINE int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1, +static inline int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -68,7 +68,7 @@ static INLINE int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1, return vshrq_n_s16(sum, ROUND0_BITS - 1); } -static INLINE void convolve_horiz_scale_8tap_neon(const uint8_t *src, +static inline void convolve_horiz_scale_8tap_neon(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter, @@ -191,7 +191,7 @@ static INLINE void convolve_horiz_scale_8tap_neon(const uint8_t *src, } } -static INLINE int16x4_t convolve6_4_h(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve6_4_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t filter, @@ -211,7 +211,7 @@ static INLINE int16x4_t convolve6_4_h(const int16x4_t s0, const int16x4_t s1, return vshrn_n_s32(sum, ROUND0_BITS); } -static INLINE int16x8_t convolve6_8_h(const int16x8_t s0, const int16x8_t s1, +static inline int16x8_t convolve6_8_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t filter, @@ 
-232,7 +232,7 @@ static INLINE int16x8_t convolve6_8_h(const int16x8_t s0, const int16x8_t s1, return vshrq_n_s16(sum, ROUND0_BITS - 1); } -static INLINE void convolve_horiz_scale_6tap_neon(const uint8_t *src, +static inline void convolve_horiz_scale_6tap_neon(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter, @@ -351,7 +351,7 @@ static INLINE void convolve_horiz_scale_6tap_neon(const uint8_t *src, } } -static INLINE void convolve_horiz_scale_2_8tap_neon( +static inline void convolve_horiz_scale_2_8tap_neon( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter) { const int bd = 8; @@ -492,7 +492,7 @@ static INLINE void convolve_horiz_scale_2_8tap_neon( } } -static INLINE void convolve_horiz_scale_2_6tap_neon( +static inline void convolve_horiz_scale_2_6tap_neon( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter) { const int bd = 8; diff --git a/av1/common/arm/av1_convolve_scale_neon_dotprod.c b/av1/common/arm/av1_convolve_scale_neon_dotprod.c index a66512af96..1551d3aec7 100644 --- a/av1/common/arm/av1_convolve_scale_neon_dotprod.c +++ b/av1/common/arm/av1_convolve_scale_neon_dotprod.c @@ -34,7 +34,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = { }; // clang-format on -static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, +static inline int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const int8x8_t filter, const int32x4_t horiz_const) { @@ -56,7 +56,7 @@ static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, return vshrn_n_s32(sum, ROUND0_BITS - 1); } -static INLINE int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, +static inline int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const uint8x8_t s4, const uint8x8_t s5, const uint8x8_t s6, const uint8x8_t s7, @@ -88,7 +88,7 @@ static INLINE int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } -static INLINE void convolve_horiz_scale_neon_dotprod( +static inline void convolve_horiz_scale_neon_dotprod( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter, const int subpel_x_qn, const int x_step_qn) { @@ -188,7 +188,7 @@ static INLINE void convolve_horiz_scale_neon_dotprod( } } -static INLINE int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, +static inline int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, const int8x8_t filters, const int32x4_t horiz_const, const uint8x16x2_t permute_tbl) { @@ -209,7 +209,7 @@ static INLINE int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, return vshrn_n_s32(sum, ROUND0_BITS - 1); } -static INLINE int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], +static inline int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], const int8x8_t filters, const int32x4_t horiz_const, const uint8x16x2_t permute_tbl) { @@ -239,7 +239,7 @@ static INLINE int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } -static INLINE void convolve_horiz_scale_2_neon_dotprod( +static inline void convolve_horiz_scale_2_neon_dotprod( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter) { const int bd = 8; diff --git a/av1/common/arm/av1_convolve_scale_neon_i8mm.c 
b/av1/common/arm/av1_convolve_scale_neon_i8mm.c index 7970b3645a..eb5092b3af 100644 --- a/av1/common/arm/av1_convolve_scale_neon_i8mm.c +++ b/av1/common/arm/av1_convolve_scale_neon_i8mm.c @@ -34,7 +34,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = { }; // clang-format on -static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, +static inline int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const int8x8_t filter, const int32x4_t horiz_const) { @@ -52,7 +52,7 @@ static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, return vshrn_n_s32(sum, ROUND0_BITS - 1); } -static INLINE int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, +static inline int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const uint8x8_t s4, const uint8x8_t s5, const uint8x8_t s6, const uint8x8_t s7, @@ -78,7 +78,7 @@ static INLINE int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } -static INLINE void convolve_horiz_scale_neon_i8mm(const uint8_t *src, +static inline void convolve_horiz_scale_neon_i8mm(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter, @@ -175,7 +175,7 @@ static INLINE void convolve_horiz_scale_neon_i8mm(const uint8_t *src, } } -static INLINE int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, +static inline int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, const int8x8_t filters, const int32x4_t horiz_const, const uint8x16x2_t permute_tbl) { @@ -192,7 +192,7 @@ static INLINE int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, return vshrn_n_s32(sum, ROUND0_BITS - 1); } -static INLINE int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], +static inline int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], const int8x8_t filters, const int32x4_t horiz_const, const uint8x16x2_t permute_tbl) { @@ -219,7 +219,7 @@ static INLINE int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } -static INLINE void convolve_horiz_scale_2_neon_i8mm( +static inline void convolve_horiz_scale_2_neon_i8mm( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter) { const int bd = 8; diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c index 8188a06e17..5a30834f7d 100644 --- a/av1/common/arm/av1_inv_txfm_neon.c +++ b/av1/common/arm/av1_inv_txfm_neon.c @@ -55,7 +55,7 @@ static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = { { av1_idct64, NULL, NULL }, }; -static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, +static inline void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, uint8_t *output, int stride, int flipud, const int height) { @@ -70,7 +70,7 @@ static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, } } -static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred, +static inline uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred, int16x8_t res0, int16x8_t res1) { int16x8_t temp_output[2]; @@ -84,7 +84,7 @@ static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred, return temp_output_8q; } -static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in, +static inline void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in, uint8_t *output, int stride, int flipud, int height) { uint8x16_t temp_output_8q; @@ -98,14 +98,14 @@ 
static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in, } } -static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, +static inline void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, int value) { for (int i = 0; i < size; i++) { a[i] = vdupq_n_s16((int16_t)value); } } -static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0, +static inline void btf_16_lane_0_1_neon(const int16x8_t in0, const int16x8_t in1, const int16x4_t c, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0[2], s1[2]; @@ -130,7 +130,7 @@ static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0, *t1 = vcombine_s16(v1[0], v1[1]); } -static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0, +static inline void btf_16_lane_1_0_neon(const int16x8_t in0, const int16x8_t in1, const int16x4_t c, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0[2], s1[2]; @@ -155,7 +155,7 @@ static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0, *t1 = vcombine_s16(v1[0], v1[1]); } -static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0, +static inline void btf_16_lane_2_3_neon(const int16x8_t in0, const int16x8_t in1, const int16x4_t c, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0[2], s1[2]; @@ -180,7 +180,7 @@ static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0, *t1 = vcombine_s16(v1[0], v1[1]); } -static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1, +static inline void btf_16_neon(const int16x8_t in0, int16_t coef1, int16_t coef2, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0_l, s0_h, s1_l, s1_h; int16x4_t v0[2], v1[2]; @@ -199,7 +199,7 @@ static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1, *t1 = vcombine_s16(v1[0], v1[1]); } -static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0, +static inline void btf_16_lane_3_2_neon(const int16x8_t in0, const int16x8_t in1, const int16x4_t c, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0[2], s1[2]; @@ -224,7 +224,7 @@ static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0, *t1 = vcombine_s16(v1[0], v1[1]); } -static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { +static inline void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { int32x4_t t0[2], t1[2]; int16x4_t v0[2], v1[2]; @@ -248,7 +248,7 @@ static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { x[1] = vcombine_s16(v1[0], v1[1]); } -static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, +static inline int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, const int16_t c2, const int16_t c3) { int16x4_t val = vdup_n_s16(c0); val = vset_lane_s16(c1, val, 1); @@ -257,7 +257,7 @@ static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, return val; } -static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out, +static inline void iadst8_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); @@ -330,7 +330,7 @@ static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out, out[7] = vqnegq_s16(x[1]); } -static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, +static inline void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], @@ -382,7 +382,7 @@ static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, out[7] = vqnegq_s16(x[1]); } -static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { +static inline void 
idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[8], step2[8]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], @@ -420,7 +420,7 @@ static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { out[7] = vqsubq_s16(step1[0], step2[7]); } -static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out, +static inline void idct8_low1_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; @@ -456,7 +456,7 @@ static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { } } -static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) { +static inline void flip_buf_ud_neon(int16x8_t *input, int size) { int16x8_t temp[8]; for (int i = 0; i < size; ++i) { temp[i] = input[size - 1 - i]; @@ -466,7 +466,7 @@ static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) { } } -static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, +static inline void load_buffer_32bit_to_16bit_neon(const int32_t *input, int stride, int16x8_t *const a, int out_size) { @@ -480,7 +480,7 @@ static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, 4 * 5793 }; -static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, +static inline void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, int txw_idx, int8_t size, int bit) { const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]); @@ -497,7 +497,7 @@ static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, } } -static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, +static inline void round_shift_for_rect(int16x8_t *input, int16x8_t *output, int size) { int32x4_t out_low, out_high; int16x4_t low, high; @@ -513,7 +513,7 @@ static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, } } -static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out, +static inline void idct16_low1_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; @@ -546,7 +546,7 @@ static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out, out[15] = step1; } -static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { +static inline void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[16], step2[16]; @@ -667,7 +667,7 @@ static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { out[15] = vqsubq_s16(step2[0], step2[15]); } -static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out, +static inline void idct16_low8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[16], step2[16]; @@ -777,7 +777,7 @@ static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out, out[15] = vqsubq_s16(step2[0], step2[15]); } -static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out, +static inline void iadst16_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); @@ -934,7 +934,7 @@ static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out, out[15] = vqnegq_s16(x[1]); } -static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, +static 
inline void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], @@ -1025,7 +1025,7 @@ static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, out[15] = vqnegq_s16(x[1]); } -static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, +static inline void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); @@ -1167,7 +1167,7 @@ static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, out[15] = vqnegq_s16(x[1]); } -static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { +static inline void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; @@ -1450,7 +1450,7 @@ static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { out[31] = vqsubq_s16(step2[0], step2[31]); } -static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out, +static inline void idct32_low1_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; @@ -1506,7 +1506,7 @@ static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out, out[31] = step1; } -static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out, +static inline void idct32_low8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; @@ -1752,7 +1752,7 @@ static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out, out[31] = vqsubq_s16(step2[0], step2[31]); } -static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out, +static inline void idct32_low16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; @@ -2014,7 +2014,7 @@ static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out, out[30] = vqsubq_s16(step2[1], step2[30]); out[31] = vqsubq_s16(step2[0], step2[31]); } -static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, +static inline void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], @@ -2083,7 +2083,7 @@ static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, step1[63] = vqaddq_s16(step2[63], step2[48]); } -static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, +static inline void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], @@ -2148,7 +2148,7 @@ static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, step2[63] = step1[63]; } -static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out, +static inline void idct64_low32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step2[64], step1[64]; @@ -2628,7 +2628,7 @@ static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out, out[63] = vqsubq_s16(step2[0], step2[63]); } -static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out, +static inline void idct64_low1_neon(int16x8_t *input, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); 
int16x8_t step1; @@ -2717,7 +2717,7 @@ static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out, out[63] = step1; } -static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out, +static inline void idct64_low8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step2[64], step1[64]; @@ -3094,7 +3094,7 @@ static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out, out[63] = vqsubq_s16(step2[0], step2[63]); } -static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out, +static inline void idct64_low16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step2[64], step1[64]; @@ -3570,7 +3570,7 @@ static const transform_neon { NULL, NULL, NULL, NULL } } }; -static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, +static inline void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -3624,7 +3624,7 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, } } -static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( +static inline void lowbd_inv_txfm2d_add_v_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { int16x8_t a[16 * 2]; @@ -3691,7 +3691,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( } } -static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( +static inline void lowbd_inv_txfm2d_add_h_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { int16x8_t a[16 * 2]; @@ -3749,7 +3749,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( } } -static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, +static inline void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int eob) { (void)eob; @@ -4075,7 +4075,7 @@ static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, } } -static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( +static inline void lowbd_inv_txfm2d_add_no_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { int16x8_t a[64 * 8]; @@ -4147,7 +4147,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( } } -static INLINE void lowbd_inv_txfm2d_add_universe_neon( +static inline void lowbd_inv_txfm2d_add_universe_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { switch (tx_type) { diff --git a/av1/common/arm/av1_inv_txfm_neon.h b/av1/common/arm/av1_inv_txfm_neon.h index 1fb9066876..b457cbc9da 100644 --- a/av1/common/arm/av1_inv_txfm_neon.h +++ b/av1/common/arm/av1_inv_txfm_neon.h @@ -116,7 +116,7 @@ static int eob_fill[32] = { 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, }; -static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, +static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { if (eob == 1) { *eobx = 0; @@ -131,7 +131,7 @@ static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, *eoby = eobxy >> 8; } -static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, +static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { eob -= 1; const int txfm_size_row = tx_size_high[tx_size]; @@ -140,7 +140,7 @@ static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, *eoby = 
(eob >= eoby_max) ? eoby_max : eob_fill[eob]; }
-static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
 TX_SIZE tx_size, int eob) { eob -= 1; const int txfm_size_col = tx_size_wide[tx_size];
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 8320c71523..be6df922e0 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -80,7 +80,7 @@ void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
 // This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
 // (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
 // and const2.
-static INLINE uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
+static inline uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
 int16x8_t partialb, uint32x4_t const1, uint32x4_t const2) {
@@ -146,7 +146,7 @@ static INLINE uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
 // two of them to compute each half of the new configuration, and pad the empty
 // spaces with zeros. Similar shifting is done for other directions, except
 // direction 6, which is straightforward as it's the vertical direction.
-static INLINE uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
+static inline uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
 uint32_t cost[4]) { const int16x8_t zero = vdupq_n_s16(0);
@@ -226,7 +226,7 @@ static INLINE uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
 return costs[0]; }
-static INLINE uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala,
+static inline uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala,
 int16x8_t partialb, int16x8_t partialc, uint32x4_t const0) {
@@ -448,7 +448,7 @@ void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2, }
 // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
-static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
+static inline int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
 unsigned int threshold, int adjdamp) { uint16x8_t diff = vabdq_u16(a, b); const uint16x8_t a_gt_b = vcgtq_u16(a, b);
@@ -458,7 +458,7 @@ static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
 return vbslq_s16(a_gt_b, clip, vnegq_s16(clip)); }
-static INLINE void primary_filter(uint16x8_t s, uint16x8_t tap[4],
+static inline void primary_filter(uint16x8_t s, uint16x8_t tap[4],
 const int *pri_taps, int pri_strength, int pri_damping, int16x8_t *sum) { // Near taps
@@ -476,7 +476,7 @@ static INLINE void primary_filter(uint16x8_t s, uint16x8_t tap[4],
 *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]); }
-static INLINE void secondary_filter(uint16x8_t s, uint16x8_t tap[8],
+static inline void secondary_filter(uint16x8_t s, uint16x8_t tap[8],
 const int *sec_taps, int sec_strength, int sec_damping, int16x8_t *sum) { // Near taps
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index 1d44aeaec4..c1763ff8b7 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -15,28 +15,28 @@ #include "av1/common/cfl.h"
-static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
+static inline void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
 int16x8_t sub) { vst1q_s16(dst + offset, vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub)); }
-static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) { return
vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset)); }
 // Load half of a vector and duplicate it in the other half.
-static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) {
+static inline uint8x8_t vldh_dup_u8(const uint8_t *ptr) {
 return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr)); }
 // Store half of a vector.
-static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) {
+static inline void vsth_u16(uint16_t *ptr, uint16x4_t val) {
 vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u16(val), 0); }
 // Store half of a vector.
-static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
+static inline void vsth_u8(uint8_t *ptr, uint8x8_t val) {
 vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u8(val), 0); }
@@ -253,7 +253,7 @@ static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input, CFL_GET_SUBSAMPLE_FUNCTION(neon)
-static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
+static inline void subtract_average_neon(const uint16_t *src, int16_t *dst,
 int width, int height, int round_offset, const int num_pel_log2) {
@@ -407,7 +407,7 @@ static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) { return veorq_s16(vaddq_s16(a, mask), mask); }
-static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3,
+static inline int16x4_t predict_w4(const int16_t *pred_buf_q3,
 int16x4_t alpha_sign, int abs_alpha_q12, int16x4_t dc) { const int16x4_t ac_q3 = vld1_s16(pred_buf_q3);
@@ -416,7 +416,7 @@ static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3,
 return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc); }
-static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3,
+static inline int16x8_t predict_w8(const int16_t *pred_buf_q3,
 int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3);
@@ -425,7 +425,7 @@ static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3,
 return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc); }
-static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
+static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
 int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2
@@ -444,7 +444,7 @@ static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
 return result; }
-static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
+static inline int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
 int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { // vld4q_s16 interleaves, which is not useful for prediction.
vst1q_s16_x4 @@ -471,7 +471,7 @@ static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3, return result; } -static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, +static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3, int width, int height) { const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; @@ -515,15 +515,15 @@ static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, CFL_PREDICT_FN(neon, lbd) #if CONFIG_AV1_HIGHBITDEPTH -static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) { +static inline uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) { return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0))); } -static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) { +static inline uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) { return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0))); } -static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) { +static inline uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) { uint16x8x2_t result; result.val[0] = vreinterpretq_u16_s16( vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); @@ -532,7 +532,7 @@ static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) { return result; } -static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) { +static inline uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) { uint16x8x4_t result; result.val[0] = vreinterpretq_u16_s16( vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); @@ -545,7 +545,7 @@ static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) { return result; } -static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, +static inline void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd, int width, int height) { diff --git a/av1/common/arm/compound_convolve_neon.c b/av1/common/arm/compound_convolve_neon.c index cae9b4bc8f..c0cd25d09d 100644 --- a/av1/common/arm/compound_convolve_neon.c +++ b/av1/common/arm/compound_convolve_neon.c @@ -18,7 +18,7 @@ #include "config/aom_config.h" #include "config/av1_rtcd.h" -static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t x_filter, const int16x4_t horiz_const) { @@ -32,7 +32,7 @@ static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, return vshr_n_s16(sum, ROUND0_BITS - 1); } -static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, +static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -55,7 +55,7 @@ static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, return vshrq_n_s16(sum, ROUND0_BITS - 1); } -static INLINE void dist_wtd_convolve_2d_horiz_neon( +static inline void dist_wtd_convolve_2d_horiz_neon( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; @@ -294,7 +294,7 @@ void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, } } -static INLINE void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon( +static inline void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon( const uint8_t *src, int src_stride, uint8_t *dst8, int 
dst8_stride, int w, int h, ConvolveParams *conv_params) { assert(w % 4 == 0); @@ -384,7 +384,7 @@ static INLINE void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon( } } -static INLINE void dist_wtd_convolve_2d_copy_avg_neon( +static inline void dist_wtd_convolve_2d_copy_avg_neon( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, ConvolveParams *conv_params) { assert(w % 4 == 0); @@ -470,7 +470,7 @@ static INLINE void dist_wtd_convolve_2d_copy_avg_neon( } } -static INLINE void dist_wtd_convolve_2d_copy_neon(const uint8_t *src, +static inline void dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, int w, int h, ConvolveParams *conv_params) { assert(w % 4 == 0); @@ -551,7 +551,7 @@ void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, } } -static INLINE uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, +static inline uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t x_filter, const int16x4_t round_offset) { @@ -565,7 +565,7 @@ static INLINE uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, return vreinterpret_u16_s16(res); } -static INLINE uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, +static inline uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -588,7 +588,7 @@ static INLINE uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, return vreinterpretq_u16_s16(res); } -static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon( +static inline void dist_wtd_convolve_x_dist_wtd_avg_neon( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { @@ -805,7 +805,7 @@ static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon( } } -static INLINE void dist_wtd_convolve_x_avg_neon( +static inline void dist_wtd_convolve_x_avg_neon( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { @@ -1015,7 +1015,7 @@ static INLINE void dist_wtd_convolve_x_avg_neon( } } -static INLINE void dist_wtd_convolve_x_neon( +static inline void dist_wtd_convolve_x_neon( const uint8_t *src, int src_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { @@ -1209,7 +1209,7 @@ void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, } } -static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, +static inline uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter, @@ -1230,7 +1230,7 @@ static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, return vreinterpret_u16_s16(res); } -static INLINE uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, +static inline uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filter, @@ -1251,7 +1251,7 @@ static INLINE uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, return vreinterpretq_u16_s16(res); } -static INLINE void 
dist_wtd_convolve_y_6tap_dist_wtd_avg_neon( +static inline void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon( const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, const int dst8_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { @@ -1471,7 +1471,7 @@ static INLINE void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon( } } -static INLINE void dist_wtd_convolve_y_6tap_avg_neon( +static inline void dist_wtd_convolve_y_6tap_avg_neon( const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, const int dst8_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { @@ -1684,7 +1684,7 @@ static INLINE void dist_wtd_convolve_y_6tap_avg_neon( } } -static INLINE void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr, +static inline void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr, int src_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { @@ -1854,7 +1854,7 @@ static INLINE void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr, } } -static INLINE uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, +static inline uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, @@ -1877,7 +1877,7 @@ static INLINE uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, return vreinterpret_u16_s16(res); } -static INLINE uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, +static inline uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -1900,7 +1900,7 @@ static INLINE uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, return vreinterpretq_u16_s16(res); } -static INLINE void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon( +static inline void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon( const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, const int dst8_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { @@ -2181,7 +2181,7 @@ static INLINE void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon( } } -static INLINE void dist_wtd_convolve_y_8tap_avg_neon( +static inline void dist_wtd_convolve_y_8tap_avg_neon( const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, const int dst8_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { @@ -2455,7 +2455,7 @@ static INLINE void dist_wtd_convolve_y_8tap_avg_neon( } } -static INLINE void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr, +static inline void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr, int src_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { diff --git a/av1/common/arm/compound_convolve_neon.h b/av1/common/arm/compound_convolve_neon.h index c72e1680b2..e9837f0316 100644 --- a/av1/common/arm/compound_convolve_neon.h +++ b/av1/common/arm/compound_convolve_neon.h @@ -17,7 +17,7 @@ #include "av1/common/enums.h" #include "av1/common/filter.h" -static INLINE void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0, +static inline void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t round_offset, @@ -34,7 +34,7 @@ static INLINE void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0, *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS); } -static INLINE void compute_basic_avg_4x1(uint16x4_t 
dd0, uint16x4_t d0, +static inline void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0, const int16x4_t round_offset, uint8x8_t *d0_u8) { uint16x4_t avg0 = vhadd_u16(dd0, d0); @@ -46,7 +46,7 @@ static INLINE void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0, *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS); } -static INLINE void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0, +static inline void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x8_t round_offset, @@ -64,7 +64,7 @@ static INLINE void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0, *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); } -static INLINE void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0, +static inline void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0, const int16x8_t round_offset, uint8x8_t *d0_u8) { uint16x8_t avg0 = vhaddq_u16(dd0, d0); @@ -74,7 +74,7 @@ static INLINE void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0, *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); } -static INLINE void compute_dist_wtd_avg_4x4( +static inline void compute_dist_wtd_avg_4x4( uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3, uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, @@ -103,7 +103,7 @@ static INLINE void compute_dist_wtd_avg_4x4( *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS); } -static INLINE void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1, +static inline void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3, uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3, @@ -124,7 +124,7 @@ static INLINE void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1, *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS); } -static INLINE void compute_dist_wtd_avg_8x4( +static inline void compute_dist_wtd_avg_8x4( uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3, uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, @@ -170,7 +170,7 @@ static INLINE void compute_dist_wtd_avg_8x4( *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS); } -static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1, +static inline void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3, uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, @@ -193,10 +193,10 @@ static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1, *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS); } -static INLINE uint16x4_t -convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x8_t y_filter, const int32x4_t offset_const) { +static inline uint16x4_t convolve6_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter, const int32x4_t offset_const) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -212,10 +212,10 @@ convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); } -static INLINE uint16x8_t -convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, 
const int16x8_t s5, - const int16x8_t y_filter, const int32x4_t offset_const) { +static inline uint16x8_t convolve6_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, const int32x4_t offset_const) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -240,7 +240,7 @@ convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); } -static INLINE void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( +static inline void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; @@ -396,7 +396,7 @@ static INLINE void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( } } -static INLINE void dist_wtd_convolve_2d_vert_6tap_avg_neon( +static inline void dist_wtd_convolve_2d_vert_6tap_avg_neon( int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; @@ -546,7 +546,7 @@ static INLINE void dist_wtd_convolve_2d_vert_6tap_avg_neon( } } -static INLINE void dist_wtd_convolve_2d_vert_6tap_neon( +static inline void dist_wtd_convolve_2d_vert_6tap_neon( int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; @@ -662,11 +662,11 @@ static INLINE void dist_wtd_convolve_2d_vert_6tap_neon( } } -static INLINE uint16x4_t -convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, - const int16x8_t y_filter, const int32x4_t offset_const) { +static inline uint16x4_t convolve8_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t offset_const) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -683,11 +683,11 @@ convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); } -static INLINE uint16x8_t -convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - const int16x8_t y_filter, const int32x4_t offset_const) { +static inline uint16x8_t convolve8_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + const int32x4_t offset_const) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); @@ -715,7 +715,7 @@ convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); } -static INLINE void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( +static inline void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; 
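// ----------------------------------------------------------------------------
// Illustrative scalar sketch, not part of the patch: the function name and
// argument layout below are hypothetical, and COMPOUND_ROUND1_BITS is assumed
// to be 7 as in the AV1 filter headers. It models what the convolve6/8_*_2d_v
// helpers renamed above compute per output pixel: a multiply-accumulate seeded
// with offset_const, then the rounding, unsigned-saturating narrowing that
// vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS) performs per lane.
#include <stdint.h>
#ifndef COMPOUND_ROUND1_BITS
#define COMPOUND_ROUND1_BITS 7
#endif
static inline uint16_t compound_convolve8_v_scalar(const int16_t *s, int stride,
                                                   const int16_t *y_filter,
                                                   int32_t offset_const) {
  // offset_const keeps the accumulator non-negative in practice, so the plain
  // right shift below matches the intrinsic's rounding behavior.
  int32_t sum = offset_const;
  for (int k = 0; k < 8; ++k) {
    sum += (int32_t)s[k * stride] * y_filter[k];  // vmull/vmlal_lane_s16
  }
  sum = (sum + (1 << (COMPOUND_ROUND1_BITS - 1))) >> COMPOUND_ROUND1_BITS;
  if (sum < 0) sum = 0;          // unsigned saturation, as vqrshrun_n_s32 does
  if (sum > 65535) sum = 65535;
  return (uint16_t)sum;
}
// ----------------------------------------------------------------------------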
@@ -879,7 +879,7 @@ static INLINE void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( } } -static INLINE void dist_wtd_convolve_2d_vert_8tap_avg_neon( +static inline void dist_wtd_convolve_2d_vert_8tap_avg_neon( int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; @@ -1037,7 +1037,7 @@ static INLINE void dist_wtd_convolve_2d_vert_8tap_avg_neon( } } -static INLINE void dist_wtd_convolve_2d_vert_8tap_neon( +static inline void dist_wtd_convolve_2d_vert_8tap_neon( int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; diff --git a/av1/common/arm/compound_convolve_neon_dotprod.c b/av1/common/arm/compound_convolve_neon_dotprod.c index f7261d783f..dcf8da1869 100644 --- a/av1/common/arm/compound_convolve_neon_dotprod.c +++ b/av1/common/arm/compound_convolve_neon_dotprod.c @@ -23,7 +23,7 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, +static inline int16x4_t convolve4_4_2d_h(uint8x16_t samples, const int8x8_t x_filter, const int32x4_t correction, const uint8x16_t range_limit, @@ -43,7 +43,7 @@ static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, return vshrn_n_s32(sum, ROUND0_BITS - 1); } -static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, +static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t x_filter, const int32x4_t correction, const uint8x16_t range_limit, @@ -76,7 +76,7 @@ static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, vshrn_n_s32(sum[1], ROUND0_BITS - 1)); } -static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod( +static inline void dist_wtd_convolve_2d_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; @@ -255,7 +255,7 @@ void av1_dist_wtd_convolve_2d_neon_dotprod( } } -static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples, +static inline uint16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t x_filter, const int32x4_t correction, const uint8x16_t range_limit, @@ -275,7 +275,7 @@ static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples, return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); } -static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples, +static inline uint16x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t x_filter, const int32x4_t correction, const uint8x16_t range_limit, @@ -309,7 +309,7 @@ static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples, return vreinterpretq_u16_s16(res); } -static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( +static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { @@ -432,7 +432,7 @@ static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( } } -static INLINE void dist_wtd_convolve_x_avg_neon_dotprod( +static inline void dist_wtd_convolve_x_avg_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { @@ -551,7 +551,7 @@ static INLINE void dist_wtd_convolve_x_avg_neon_dotprod( 
} } -static INLINE void dist_wtd_convolve_x_neon_dotprod( +static inline void dist_wtd_convolve_x_neon_dotprod( const uint8_t *src, int src_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { diff --git a/av1/common/arm/compound_convolve_neon_i8mm.c b/av1/common/arm/compound_convolve_neon_i8mm.c index 9801ad8ce9..65f48958f1 100644 --- a/av1/common/arm/compound_convolve_neon_i8mm.c +++ b/av1/common/arm/compound_convolve_neon_i8mm.c @@ -23,7 +23,7 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, +static inline int16x4_t convolve4_4_2d_h(uint8x16_t samples, const int8x8_t x_filter, const uint8x16_t permute_tbl, const int32x4_t horiz_const) { @@ -38,7 +38,7 @@ static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, return vshrn_n_s32(sum, ROUND0_BITS - 1); } -static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, +static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t x_filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { @@ -66,7 +66,7 @@ static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, vshrn_n_s32(sum[1], ROUND0_BITS - 1)); } -static INLINE void dist_wtd_convolve_2d_horiz_neon_i8mm( +static inline void dist_wtd_convolve_2d_horiz_neon_i8mm( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; @@ -230,7 +230,7 @@ void av1_dist_wtd_convolve_2d_neon_i8mm( } } -static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples, +static inline uint16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t x_filter, const uint8x16_t permute_tbl, const int32x4_t round_offset) { @@ -245,7 +245,7 @@ static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples, return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); } -static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples, +static inline uint16x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t x_filter, const uint8x16x3_t permute_tbl, const int32x4_t round_offset) { @@ -274,7 +274,7 @@ static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples, return vreinterpretq_u16_s16(res); } -static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( +static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { @@ -390,7 +390,7 @@ static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( } } -static INLINE void dist_wtd_convolve_x_avg_neon_i8mm( +static inline void dist_wtd_convolve_x_avg_neon_i8mm( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { @@ -502,7 +502,7 @@ static INLINE void dist_wtd_convolve_x_avg_neon_i8mm( } } -static INLINE void dist_wtd_convolve_x_neon_i8mm( +static inline void dist_wtd_convolve_x_neon_i8mm( const uint8_t *src, int src_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c index 35aa8122f4..bc7c71128d 100644 --- a/av1/common/arm/convolve_neon.c +++ b/av1/common/arm/convolve_neon.c @@ -24,7 +24,7 @@ #include 
"av1/common/filter.h" #include "av1/common/arm/convolve_neon.h" -static INLINE int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, @@ -53,7 +53,7 @@ static INLINE int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1, return vqrshrn_n_s32(sum, FILTER_BITS); } -static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr, +static inline void convolve_x_sr_12tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, const int dst_stride, int w, int h, const int16_t *x_filter_ptr) { @@ -188,7 +188,7 @@ static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr, #endif // AOM_ARCH_AARCH64 } -static INLINE uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, int16x8_t horiz_const) { @@ -201,7 +201,7 @@ static INLINE uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void convolve_x_sr_4tap_neon(const uint8_t *src_ptr, +static inline void convolve_x_sr_4tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, const int dst_stride, int w, int h, const int16_t *x_filter_ptr) { @@ -279,7 +279,7 @@ static INLINE void convolve_x_sr_4tap_neon(const uint8_t *src_ptr, } } -static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -460,7 +460,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } } -static INLINE uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter) { int16x8_t sum = vmulq_lane_s16(s0, filter, 0); @@ -472,7 +472,7 @@ static INLINE uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void convolve_y_sr_4tap_neon(const uint8_t *src, +static inline void convolve_y_sr_4tap_neon(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, int w, int h, const int16_t *filter_y) { @@ -557,7 +557,7 @@ static INLINE void convolve_y_sr_4tap_neon(const uint8_t *src, } } -static INLINE int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter_0_7) { @@ -575,7 +575,7 @@ static INLINE int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, return sum; } -static INLINE uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filters) { @@ -593,7 +593,7 @@ static INLINE uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr, +static inline void 
convolve_y_sr_6tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t y_filter) { @@ -732,7 +732,7 @@ static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr, } } -static INLINE int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, @@ -752,7 +752,7 @@ static INLINE int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, return sum; } -static INLINE uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -773,7 +773,7 @@ static INLINE uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void convolve_y_sr_8tap_neon(const uint8_t *src_ptr, +static inline void convolve_y_sr_8tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t y_filter) { @@ -925,7 +925,7 @@ static INLINE void convolve_y_sr_8tap_neon(const uint8_t *src_ptr, } } -static INLINE int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, @@ -957,7 +957,7 @@ static INLINE int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1, return sum; } -static INLINE uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -989,7 +989,7 @@ static INLINE uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE void convolve_y_sr_12tap_neon(const uint8_t *src_ptr, +static inline void convolve_y_sr_12tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { @@ -1157,13 +1157,13 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } } -static INLINE int16x4_t -convolve12_4_2d_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, - const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, - const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, - const int32x4_t horiz_const) { +static inline int16x4_t convolve12_4_2d_h( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, + const int32x4_t horiz_const) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); @@ -1184,7 +1184,7 @@ convolve12_4_2d_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, return vshrn_n_s32(sum, 
ROUND0_BITS); } -static INLINE void convolve_2d_sr_horiz_12tap_neon( +static inline void convolve_2d_sr_horiz_12tap_neon( const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11) { @@ -1307,7 +1307,7 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon( } while (--h != 0); } -static INLINE int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1, +static inline int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, const int16x8_t horiz_const) { @@ -1319,7 +1319,7 @@ static INLINE int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1, return vshrq_n_s16(sum, ROUND0_BITS - 1); } -static INLINE void convolve_2d_sr_horiz_4tap_neon( +static inline void convolve_2d_sr_horiz_4tap_neon( const uint8_t *src, ptrdiff_t src_stride, int16_t *dst, ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) { const int bd = 8; @@ -1425,7 +1425,7 @@ static INLINE void convolve_2d_sr_horiz_4tap_neon( } } -static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, +static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -1448,7 +1448,7 @@ static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, return vshrq_n_s16(sum, ROUND0_BITS - 1); } -static INLINE void convolve_2d_sr_horiz_8tap_neon( +static inline void convolve_2d_sr_horiz_8tap_neon( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { const int bd = 8; diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h index b86d5739bf..206f3ba205 100644 --- a/av1/common/arm/convolve_neon.h +++ b/av1/common/arm/convolve_neon.h @@ -20,12 +20,12 @@ #include "av1/common/convolve.h" #include "av1/common/filter.h" -static INLINE int32x4_t -convolve12_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, - const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, - const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { +static inline int32x4_t convolve12_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); @@ -45,13 +45,13 @@ convolve12_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, return sum; } -static INLINE uint8x8_t -convolve12_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, - const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, - const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, - const int16x8_t sub_const) { +static inline uint8x8_t convolve12_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const 
int16x8_t s6, const int16x8_t s7, const int16x8_t s8, + const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, + const int16x8_t sub_const) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); @@ -89,7 +89,7 @@ convolve12_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, return vqmovun_s16(res); } -static INLINE void convolve_2d_sr_vert_12tap_neon( +static inline void convolve_2d_sr_vert_12tap_neon( int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { const int bd = 8; @@ -199,7 +199,7 @@ static INLINE void convolve_2d_sr_vert_12tap_neon( } } -static INLINE int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, @@ -219,7 +219,7 @@ static INLINE int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); } -static INLINE uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, @@ -254,7 +254,7 @@ static INLINE uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, return vqmovun_s16(res); } -static INLINE void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr, +static inline void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, @@ -376,7 +376,7 @@ static INLINE void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr, } } -static INLINE int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter) { @@ -393,7 +393,7 @@ static INLINE int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); } -static INLINE uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filter, @@ -423,7 +423,7 @@ static INLINE uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, return vqmovun_s16(res); } -static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr, +static inline void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, @@ -536,7 +536,7 @@ static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr, } } -static INLINE int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1, +static inline int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t y_filter) { int32x4_t sum = vmull_lane_s16(s0, y_filter, 0); @@ -547,7 +547,7 @@ static INLINE int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1, return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); } -static INLINE uint8x8_t 
convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1, +static inline uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t y_filter, const int16x8_t sub_const) { @@ -569,7 +569,7 @@ static INLINE uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1, return vqmovun_s16(res); } -static INLINE void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr, +static inline void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index 6e2a703065..8d0d9294da 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c @@ -36,7 +36,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, +static inline int16x4_t convolve12_4_x(uint8x16_t samples, const int8x16_t filter, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. @@ -67,7 +67,7 @@ static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, return vqrshrn_n_s32(sum, FILTER_BITS); } -static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], +static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2], const int8x16_t filter, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. @@ -110,7 +110,7 @@ static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], return vqmovun_s16(sum_s16); } -static INLINE void convolve_x_sr_12tap_neon_dotprod( +static inline void convolve_x_sr_12tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { // The no-op filter should never be used here. @@ -173,7 +173,7 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( } } -static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, +static inline int16x4_t convolve4_4_x(const uint8x16_t samples, const int8x8_t filters, const uint8x16_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. @@ -198,7 +198,7 @@ static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, return vmovn_s32(sum); } -static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, +static inline uint8x8_t convolve4_8_x(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. @@ -229,7 +229,7 @@ static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void convolve_x_sr_4tap_neon_dotprod( +static inline void convolve_x_sr_4tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) { const int16x4_t x_filter = vld1_s16(filter_x + 2); @@ -289,7 +289,7 @@ static INLINE void convolve_x_sr_4tap_neon_dotprod( } } -static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, +static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. 
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
index 6e2a703065..8d0d9294da 100644
--- a/av1/common/arm/convolve_neon_dotprod.c
+++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -36,7 +36,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
   3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
 };
 
-static INLINE int16x4_t convolve12_4_x(uint8x16_t samples,
+static inline int16x4_t convolve12_4_x(uint8x16_t samples,
                                        const int8x16_t filter,
                                        const uint8x16x3_t permute_tbl) {
   // Transform sample range to [-128, 127] for 8-bit signed dot product.
@@ -67,7 +67,7 @@ static INLINE int16x4_t convolve12_4_x(uint8x16_t samples,
   return vqrshrn_n_s32(sum, FILTER_BITS);
 }
 
-static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
+static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2],
                                        const int8x16_t filter,
                                        const uint8x16x3_t permute_tbl) {
   // Transform sample range to [-128, 127] for 8-bit signed dot product.
@@ -110,7 +110,7 @@ static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
   return vqmovun_s16(sum_s16);
 }
 
-static INLINE void convolve_x_sr_12tap_neon_dotprod(
+static inline void convolve_x_sr_12tap_neon_dotprod(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
     int h, const int16_t *x_filter_ptr) {
   // The no-op filter should never be used here.
@@ -173,7 +173,7 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod(
   }
 }
 
-static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples,
+static inline int16x4_t convolve4_4_x(const uint8x16_t samples,
                                       const int8x8_t filters,
                                       const uint8x16_t permute_tbl) {
   // Transform sample range to [-128, 127] for 8-bit signed dot product.
@@ -198,7 +198,7 @@ static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples,
   return vmovn_s32(sum);
 }
 
-static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples,
+static inline uint8x8_t convolve4_8_x(const uint8x16_t samples,
                                       const int8x8_t filters,
                                       const uint8x16x2_t permute_tbl) {
   // Transform sample range to [-128, 127] for 8-bit signed dot product.
@@ -229,7 +229,7 @@ static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples,
   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
-static INLINE void convolve_x_sr_4tap_neon_dotprod(
+static inline void convolve_x_sr_4tap_neon_dotprod(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) {
   const int16x4_t x_filter = vld1_s16(filter_x + 2);
@@ -289,7 +289,7 @@ static INLINE void convolve_x_sr_4tap_neon_dotprod(
   }
 }
 
-static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
+static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
                                       const uint8x16x3_t permute_tbl) {
   // Transform sample range to [-128, 127] for 8-bit signed dot product.
   int8x16_t samples_128 =
@@ -387,7 +387,7 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride,
   } while (h != 0);
 }
 
-static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+static inline void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                         int8x8_t a3, int8x16_t *b) {
   // Transpose 8-bit elements and concatenate result rows as follows:
   // a0: 00, 01, 02, 03, XX, XX, XX, XX
@@ -411,7 +411,7 @@ static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
   *b = vreinterpretq_s8_s16(a0123);
 }
 
-static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+static inline void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                         int8x8_t a3, int8x16_t *b0,
                                         int8x16_t *b1) {
   // Transpose 8-bit elements and concatenate result rows as follows:
@@ -438,7 +438,7 @@ static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
   *b1 = vreinterpretq_s8_s16(a0123.val[1]);
 }
 
-static INLINE int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1,
+static inline int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1,
                                        const int8x16_t s2,
                                        const int8x8_t filters_0_7,
                                        const int8x8_t filters_4_11) {
@@ -453,7 +453,7 @@ static INLINE int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1,
   return vqmovn_s32(sum);
 }
 
-static INLINE uint8x8_t convolve12_8_y(
+static inline uint8x8_t convolve12_8_y(
     const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo,
     const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi,
     const int8x8_t filters_0_7, const int8x8_t filters_4_11) {
@@ -474,7 +474,7 @@ static INLINE uint8x8_t convolve12_8_y(
   return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
-static INLINE void convolve_y_sr_12tap_neon_dotprod(
+static inline void convolve_y_sr_12tap_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr) {
   // The no-op filter should never be used here.
@@ -669,7 +669,7 @@ static INLINE void convolve_y_sr_12tap_neon_dotprod(
   }
 }
 
-static INLINE int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1,
+static inline int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1,
                                       const int8x8_t filters) {
   // The sample range transform and permutation are performed by the caller.
   // Accumulate into 128 << FILTER_BITS to account for range transform.
@@ -681,7 +681,7 @@ static INLINE int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1,
   return vqmovn_s32(sum);
 }
 
-static INLINE uint8x8_t convolve8_8_y(const int8x16_t s0_lo,
+static inline uint8x8_t convolve8_8_y(const int8x16_t s0_lo,
                                       const int8x16_t s0_hi,
                                       const int8x16_t s1_lo,
                                       const int8x16_t s1_hi,
@@ -701,7 +701,7 @@ static INLINE uint8x8_t convolve8_8_y(const int8x16_t s0_lo,
   return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
-static INLINE void convolve_y_sr_8tap_neon_dotprod(
+static inline void convolve_y_sr_8tap_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *y_filter_ptr) {
   const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr));
@@ -887,7 +887,7 @@ void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride,
                                     y_filter_ptr);
 }
 
-static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
+static inline int16x4_t convolve12_4_2d_h(uint8x16_t samples,
                                           const int8x16_t filters,
                                           const int32x4_t horiz_const,
                                           const uint8x16x3_t permute_tbl) {
@@ -912,7 +912,7 @@ static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
   return vshrn_n_s32(sum, ROUND0_BITS);
 }
 
-static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
+static inline int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
                                           const int8x16_t filters,
                                           const int32x4_t correction,
                                           const uint8x16x3_t permute_tbl) {
@@ -947,7 +947,7 @@ static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
                       vshrn_n_s32(sum4567, ROUND0_BITS));
 }
 
-static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod(
+static inline void convolve_2d_sr_horiz_12tap_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
     const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
     const int16x4_t x_filter_8_11) {
@@ -1046,7 +1046,7 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod(
   }
 }
 
-static INLINE int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
+static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
                                          const int8x8_t filters,
                                          const uint8x16_t permute_tbl,
                                          const int32x4_t correction) {
@@ -1065,7 +1065,7 @@ static INLINE int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
   return vshrn_n_s32(sum, ROUND0_BITS - 1);
 }
 
-static INLINE int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
+static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
                                          const int8x8_t filters,
                                          const uint8x16x2_t permute_tbl,
                                          const int32x4_t correction) {
@@ -1089,7 +1089,7 @@ static INLINE int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
                       vshrn_n_s32(sum4567, ROUND0_BITS - 1));
 }
 
-static INLINE void convolve_2d_sr_horiz_4tap_neon_dotprod(
+static inline void convolve_2d_sr_horiz_4tap_neon_dotprod(
     const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
     ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
   const int bd = 8;
@@ -1181,7 +1181,7 @@ static INLINE void convolve_2d_sr_horiz_4tap_neon_dotprod(
   }
 }
 
-static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples,
                                          const int8x8_t filters,
                                          const int32x4_t correction,
                                          const uint8x16x3_t permute_tbl) {
@@ -1210,7 +1210,7 @@ static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
                       vshrn_n_s32(sum4567, ROUND0_BITS - 1));
 }
 
-static INLINE void convolve_2d_sr_horiz_8tap_neon_dotprod(
+static inline void convolve_2d_sr_horiz_8tap_neon_dotprod(
     const uint8_t *src, int src_stride, int16_t *im_block, int im_stride,
     int w, int im_h, const int16_t *x_filter_ptr) {
   const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
@@ -1276,7 +1276,7 @@ static INLINE void convolve_2d_sr_horiz_8tap_neon_dotprod(
   } while (--height != 0);
 }
 
-static INLINE void convolve_2d_sr_6tap_neon_dotprod(
+static inline void convolve_2d_sr_6tap_neon_dotprod(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
     int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
@@ -1350,7 +1350,7 @@ static INLINE void convolve_2d_sr_6tap_neon_dotprod(
   } while (w != 0);
 }
 
-static INLINE void convolve_2d_sr_4tap_neon_dotprod(
+static inline void convolve_2d_sr_4tap_neon_dotprod(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
     int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
   const int bd = 8;
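The dotprod kernels above repeatedly mention "Transform sample range to
[-128, 127] for 8-bit signed dot product". The scalar sketch below
(illustration only) shows why that works: SDOT only accepts signed 8-bit
inputs, so unsigned pixels are biased down by 128 and the constant
128 * sum(filter) is added back; when the filter taps sum to
1 << FILTER_BITS, that correction is simply 128 << FILTER_BITS, known at
compile time.

#include <stdint.h>

static int32_t dot_with_range_transform(const uint8_t *x, const int8_t *f,
                                        int taps, int32_t correction) {
  int32_t sum = correction;  // 128 * sum(f), folded in ahead of time.
  for (int k = 0; k < taps; k++) {
    sum += (int32_t)(x[k] - 128) * f[k];  // Signed 8-bit multiply-accumulate.
  }
  return sum;  // Equals the true dot product of x and f.
}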
diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c
index 2d31054f89..dd4a34e0b0 100644
--- a/av1/common/arm/convolve_neon_i8mm.c
+++ b/av1/common/arm/convolve_neon_i8mm.c
@@ -31,7 +31,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
   3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
 };
 
-static INLINE int16x4_t convolve12_4_x(uint8x16_t samples[2],
+static inline int16x4_t convolve12_4_x(uint8x16_t samples[2],
                                        const int8x16_t filter[2],
                                        const uint8x16_t permute_tbl,
                                        const int32x4_t horiz_const) {
@@ -49,7 +49,7 @@ static INLINE int16x4_t convolve12_4_x(uint8x16_t samples[2],
   return vqrshrn_n_s32(sum, FILTER_BITS);
 }
 
-static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
+static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2],
                                        const int8x16_t filter[2],
                                        const uint8x16x2_t permute_tbl,
                                        const int32x4_t horiz_const) {
@@ -76,7 +76,7 @@ static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
   return vqmovun_s16(sum_s16);
 }
 
-static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src,
+static inline void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src,
                                                  int src_stride, uint8_t *dst,
                                                  int dst_stride, int w, int h,
                                                  const int16_t *x_filter_ptr) {
@@ -157,7 +157,7 @@ static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src,
   }
 }
 
-static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
+static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
                                       const uint8x16x3_t permute_tbl,
                                       const int32x4_t horiz_const) {
   // Permute samples ready for dot product.
@@ -179,7 +179,7 @@ static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
   return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
 }
 
-static INLINE void convolve_x_sr_8tap_neon_i8mm(
+static inline void convolve_x_sr_8tap_neon_i8mm(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x,
     const int32x4_t horiz_const) {
@@ -213,7 +213,7 @@ static INLINE void convolve_x_sr_8tap_neon_i8mm(
   } while (height != 0);
 }
 
-static INLINE uint8x8_t convolve6_8_x(uint8x16_t samples,
+static inline uint8x8_t convolve6_8_x(uint8x16_t samples,
                                       const int8x16_t filter,
                                       const uint8x16x2_t permute_tbl,
                                       const int32x4_t horiz_const) {
@@ -233,7 +233,7 @@ static INLINE uint8x8_t convolve6_8_x(uint8x16_t samples,
   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
-static INLINE void convolve_x_sr_6tap_neon_i8mm(
+static inline void convolve_x_sr_6tap_neon_i8mm(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x,
     const int32x4_t horiz_const) {
@@ -271,7 +271,7 @@ static INLINE void convolve_x_sr_6tap_neon_i8mm(
   } while (height != 0);
 }
 
-static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples,
+static inline int16x4_t convolve4_4_x(const uint8x16_t samples,
                                       const int8x8_t filters,
                                       const uint8x16_t permute_tbl,
                                       const int32x4_t horiz_const) {
@@ -285,7 +285,7 @@ static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples,
   return vmovn_s32(sum);
 }
 
-static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples,
+static inline uint8x8_t convolve4_8_x(const uint8x16_t samples,
                                       const int8x8_t filters,
                                       const uint8x16x2_t permute_tbl,
                                       const int32x4_t horiz_const) {
@@ -305,7 +305,7 @@ static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples,
   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
-static INLINE void convolve_x_sr_4tap_neon_i8mm(
+static inline void convolve_x_sr_4tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x,
    const int32x4_t horiz_const) {
@@ -412,7 +412,7 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride,
                                  x_filter_ptr, horiz_const);
 }
 
-static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+static inline void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
                                         uint8x8_t a2, uint8x8_t a3,
                                         uint8x16_t *b) {
   // Transpose 8-bit elements and concatenate result rows as follows:
@@ -437,7 +437,7 @@ static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
   *b = vreinterpretq_u8_u16(a0123);
 }
 
-static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+static inline void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
                                         uint8x8_t a2, uint8x8_t a3,
                                         uint8x16_t *b0, uint8x16_t *b1) {
   // Transpose 8-bit elements and concatenate result rows as follows:
@@ -464,7 +464,7 @@ static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
   *b1 = vreinterpretq_u8_u16(a0123.val[1]);
 }
 
-static INLINE int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1,
+static inline int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1,
                                        const uint8x16_t s2,
                                        const int8x8_t filters_0_7,
                                        const int8x8_t filters_4_11) {
@@ -476,7 +476,7 @@ static INLINE int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1,
   return vqmovn_s32(sum);
 }
 
-static INLINE uint8x8_t convolve12_8_y(
+static inline uint8x8_t convolve12_8_y(
     const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo,
     const uint8x16_t s1_hi, const uint8x16_t s2_lo, const uint8x16_t s2_hi,
     const int8x8_t filters_0_7, const int8x8_t filters_4_11) {
@@ -493,7 +493,7 @@ static INLINE uint8x8_t convolve12_8_y(
   return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
-static INLINE void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr,
+static inline void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr,
                                                  int src_stride,
                                                  uint8_t *dst_ptr,
                                                  int dst_stride, int w, int h,
@@ -656,7 +656,7 @@ static INLINE void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr,
   }
 }
 
-static INLINE int16x4_t convolve8_4_y(const uint8x16_t s0, const uint8x16_t s1,
+static inline int16x4_t convolve8_4_y(const uint8x16_t s0, const uint8x16_t s1,
                                       const int8x8_t filters) {
   int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0);
   sum = vusdotq_lane_s32(sum, s1, filters, 1);
@@ -665,7 +665,7 @@ static INLINE int16x4_t convolve8_4_y(const uint8x16_t s0, const uint8x16_t s1,
   return vqmovn_s32(sum);
 }
 
-static INLINE uint8x8_t convolve8_8_y(const uint8x16_t s0_lo,
+static inline uint8x8_t convolve8_8_y(const uint8x16_t s0_lo,
                                       const uint8x16_t s0_hi,
                                       const uint8x16_t s1_lo,
                                       const uint8x16_t s1_hi,
@@ -681,7 +681,7 @@ static INLINE uint8x8_t convolve8_8_y(const uint8x16_t s0_lo,
   return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
-static INLINE void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr,
+static inline void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
@@ -842,7 +842,7 @@ void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride,
                                  y_filter_ptr);
 }
 
-static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples,
                                          const int8x8_t filters,
                                          const uint8x16x3_t permute_tbl,
                                          const int32x4_t horiz_const) {
@@ -868,7 +868,7 @@ static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
                       vshrn_n_s32(sum4567, ROUND0_BITS - 1));
 }
 
-static INLINE void convolve_2d_sr_horiz_8tap_neon_i8mm(
+static inline void convolve_2d_sr_horiz_8tap_neon_i8mm(
     const uint8_t *src, int src_stride, int16_t *im_block, int im_stride,
     int w, int im_h, const int16_t *x_filter_ptr) {
   // Filter values are even, so halve to reduce intermediate precision reqs.
@@ -931,7 +931,7 @@ static INLINE void convolve_2d_sr_horiz_8tap_neon_i8mm(
   } while (--height != 0);
 }
 
-static INLINE int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
+static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
                                          const int8x8_t filters,
                                          const uint8x16_t permute_tbl,
                                          const int32x4_t horiz_const) {
@@ -945,7 +945,7 @@ static INLINE int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
   return vshrn_n_s32(sum, ROUND0_BITS - 1);
 }
 
-static INLINE int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
+static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
                                          const int8x8_t filters,
                                          const uint8x16x2_t permute_tbl,
                                          const int32x4_t horiz_const) {
@@ -966,7 +966,7 @@ static INLINE int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
                       vshrn_n_s32(sum4567, ROUND0_BITS - 1));
 }
 
-static INLINE void convolve_2d_sr_horiz_4tap_neon_i8mm(
+static inline void convolve_2d_sr_horiz_4tap_neon_i8mm(
     const uint8_t *src, int src_stride, int16_t *dst, int dst_stride,
     int width, int height, const int16_t *filter_x) {
   const int bd = 8;
@@ -1054,7 +1054,7 @@ static INLINE void convolve_2d_sr_horiz_4tap_neon_i8mm(
   }
 }
 
-static INLINE int16x4_t convolve6_4_2d_h(uint8x16_t samples,
+static inline int16x4_t convolve6_4_2d_h(uint8x16_t samples,
                                          const int8x16_t filter,
                                          const uint8x16_t permute_tbl,
                                          const int32x4_t horiz_const) {
@@ -1070,7 +1070,7 @@ static INLINE int16x4_t convolve6_4_2d_h(uint8x16_t samples,
   return vshrn_n_s32(sum, ROUND0_BITS - 1);
 }
 
-static INLINE int16x8_t convolve6_8_2d_h(uint8x16_t samples,
+static inline int16x8_t convolve6_8_2d_h(uint8x16_t samples,
                                          const int8x16_t filter,
                                          const uint8x16x2_t permute_tbl,
                                          const int32x4_t horiz_const) {
@@ -1091,7 +1091,7 @@ static INLINE int16x8_t convolve6_8_2d_h(uint8x16_t samples,
                       vshrn_n_s32(sum4567, ROUND0_BITS - 1));
 }
 
-static INLINE void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src,
+static inline void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src,
                                                  int src_stride, uint8_t *dst,
                                                  int dst_stride, int w, int h,
                                                  const int16_t *x_filter_ptr,
@@ -1169,7 +1169,7 @@ static INLINE void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src,
   } while (w != 0);
 }
 
-static INLINE void convolve_2d_sr_6tap_4tap_neon_i8mm(
+static inline void convolve_2d_sr_6tap_4tap_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
   const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
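Unlike the dotprod file, the i8mm kernels above call vusdotq_lane_s32, the
mixed-sign USDOT dot product, so no [-128, 127] range transform or correction
constant is needed. A minimal sketch (illustration only; requires AArch64
with the i8mm extension) of the call shape used in convolve8_4_y:

#include <arm_neon.h>

static int32x4_t usdot_sketch(uint8x16_t samples, int8x8_t filters) {
  // Unsigned samples are multiplied by signed filter taps directly.
  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0);
  return vusdotq_lane_s32(sum, samples, filters, 1);
}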
diff --git a/av1/common/arm/convolve_neon_i8mm.h b/av1/common/arm/convolve_neon_i8mm.h
index fcbdd2bf5c..71b74612b5 100644
--- a/av1/common/arm/convolve_neon_i8mm.h
+++ b/av1/common/arm/convolve_neon_i8mm.h
@@ -36,7 +36,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = {
   // clang-format on
 };
 
-static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples[2],
+static inline int16x4_t convolve12_4_2d_h(uint8x16_t samples[2],
                                           const int8x16_t filter[2],
                                           const uint8x16_t permute_tbl,
                                           int32x4_t horiz_const) {
@@ -55,7 +55,7 @@ static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples[2],
   return vshrn_n_s32(sum, ROUND0_BITS);
 }
 
-static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
+static inline int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
                                           const int8x16_t filter[2],
                                           const uint8x16x2_t permute_tbl,
                                           const int32x4_t horiz_const) {
@@ -81,7 +81,7 @@ static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
                       vshrn_n_s32(sum4567, ROUND0_BITS));
 }
 
-static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm(
+static inline void convolve_2d_sr_horiz_12tap_neon_i8mm(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int w, int h, const int16_t *x_filter_ptr) {
   // The no-op filter should never be used here.
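Note how the 2D horizontal helpers above end in a plain vshrn_n_s32 rather
than a rounding shift: the rounding term is folded into horiz_const before
the kernel runs. A scalar sketch (illustration only; the exact composition
of horiz_const varies by kernel, but it typically combines the intermediate
range offset with the first-pass rounding bias):

#include <stdint.h>

static int16_t first_pass_intermediate(int32_t dot, int32_t horiz_const) {
  // horiz_const already contains the +(1 << (ROUND0_BITS - 1)) rounding
  // term, so a truncating shift is sufficient. ROUND0_BITS is 3 in libaom.
  return (int16_t)((dot + horiz_const) >> 3);
}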
diff --git a/av1/common/arm/convolve_scale_neon.h b/av1/common/arm/convolve_scale_neon.h
index c164ff3854..757049459f 100644
--- a/av1/common/arm/convolve_scale_neon.h
+++ b/av1/common/arm/convolve_scale_neon.h
@@ -21,7 +21,7 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
 
-static INLINE int16x4_t compound_convolve8_4_v(
+static inline int16x4_t compound_convolve8_4_v(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
@@ -42,7 +42,7 @@ static INLINE int16x4_t compound_convolve8_4_v(
   return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS);
 }
 
-static INLINE int16x8_t compound_convolve8_8_v(
+static inline int16x8_t compound_convolve8_8_v(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
     const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
@@ -76,7 +76,7 @@ static INLINE int16x8_t compound_convolve8_8_v(
   return vcombine_s16(res0, res1);
 }
 
-static INLINE void compound_convolve_vert_scale_8tap_neon(
+static inline void compound_convolve_vert_scale_8tap_neon(
     const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
   const int bd = 8;
@@ -139,7 +139,7 @@ static INLINE void compound_convolve_vert_scale_8tap_neon(
   }
 }
 
-static INLINE void compound_avg_convolve_vert_scale_8tap_neon(
+static inline void compound_avg_convolve_vert_scale_8tap_neon(
     const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
     uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
     int subpel_y_qn, int y_step_qn) {
@@ -225,7 +225,7 @@ static INLINE void compound_avg_convolve_vert_scale_8tap_neon(
   }
 }
 
-static INLINE void compound_dist_wtd_convolve_vert_scale_8tap_neon(
+static inline void compound_dist_wtd_convolve_vert_scale_8tap_neon(
     const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
     uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
     ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) {
@@ -334,7 +334,7 @@ static INLINE void compound_dist_wtd_convolve_vert_scale_8tap_neon(
   }
 }
 
-static INLINE uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1,
+static inline uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x4_t s6, const int16x4_t s7,
@@ -358,7 +358,7 @@ static INLINE uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1,
   return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0)));
 }
 
-static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1,
+static inline uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1,
                                       const int16x8_t s2, const int16x8_t s3,
                                       const int16x8_t s4, const int16x8_t s5,
                                       const int16x8_t s6, const int16x8_t s7,
@@ -393,7 +393,7 @@ static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1,
   return vqmovun_s16(vcombine_s16(res0, res1));
 }
 
-static INLINE void convolve_vert_scale_8tap_neon(
+static inline void convolve_vert_scale_8tap_neon(
     const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
     int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
   const int bd = 8;
@@ -477,7 +477,7 @@ static INLINE void convolve_vert_scale_8tap_neon(
   }
 }
 
-static INLINE int16x4_t compound_convolve6_4_v(
+static inline int16x4_t compound_convolve6_4_v(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     const int16x8_t filter, const int32x4_t offset_const) {
@@ -496,7 +496,7 @@ static INLINE int16x4_t compound_convolve6_4_v(
   return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS);
 }
 
-static INLINE int16x8_t compound_convolve6_8_v(
+static inline int16x8_t compound_convolve6_8_v(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
     const int16x8_t filter, const int32x4_t offset_const) {
@@ -526,7 +526,7 @@ static INLINE int16x8_t compound_convolve6_8_v(
   return vcombine_s16(res0, res1);
 }
 
-static INLINE void compound_convolve_vert_scale_6tap_neon(
+static inline void compound_convolve_vert_scale_6tap_neon(
     const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
   const int bd = 8;
@@ -589,7 +589,7 @@ static INLINE void compound_convolve_vert_scale_6tap_neon(
   }
 }
 
-static INLINE void compound_avg_convolve_vert_scale_6tap_neon(
+static inline void compound_avg_convolve_vert_scale_6tap_neon(
     const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
     uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
     int subpel_y_qn, int y_step_qn) {
@@ -675,7 +675,7 @@ static INLINE void compound_avg_convolve_vert_scale_6tap_neon(
   }
 }
 
-static INLINE void compound_dist_wtd_convolve_vert_scale_6tap_neon(
+static inline void compound_dist_wtd_convolve_vert_scale_6tap_neon(
     const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
     uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
     ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) {
@@ -784,7 +784,7 @@ static INLINE void compound_dist_wtd_convolve_vert_scale_6tap_neon(
   }
 }
 
-static INLINE uint8x8_t convolve6_4_v(const int16x4_t s0, const int16x4_t s1,
+static inline uint8x8_t convolve6_4_v(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x8_t filter,
@@ -806,7 +806,7 @@ static INLINE uint8x8_t convolve6_4_v(const int16x4_t s0, const int16x4_t s1,
   return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0)));
 }
 
-static INLINE uint8x8_t convolve6_8_v(const int16x8_t s0, const int16x8_t s1,
+static inline uint8x8_t convolve6_8_v(const int16x8_t s0, const int16x8_t s1,
                                       const int16x8_t s2, const int16x8_t s3,
                                       const int16x8_t s4, const int16x8_t s5,
                                       const int16x8_t filter,
@@ -837,7 +837,7 @@ static INLINE uint8x8_t convolve6_8_v(const int16x8_t s0, const int16x8_t s1,
   return vqmovun_s16(vcombine_s16(res0, res1));
 }
 
-static INLINE void convolve_vert_scale_6tap_neon(
+static inline void convolve_vert_scale_6tap_neon(
    const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
   const int bd = 8;
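The compound helpers in this header narrow with
vshrn_n_s32(sum, COMPOUND_ROUND1_BITS) after adding offset_const. A scalar
sketch of that step (illustration only, assuming libaom's
COMPOUND_ROUND1_BITS of 7):

#include <stdint.h>

static uint16_t compound_second_round(int32_t sum, int32_t offset_const) {
  // offset_const keeps the CONV_BUF_TYPE intermediate non-negative, so the
  // truncating narrow cannot wrap.
  return (uint16_t)((sum + offset_const) >> 7);
}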
diff --git a/av1/common/arm/convolve_sve2.c b/av1/common/arm/convolve_sve2.c
index 8875f86581..536f4414b2 100644
--- a/av1/common/arm/convolve_sve2.c
+++ b/av1/common/arm/convolve_sve2.c
@@ -26,7 +26,7 @@
 #include "av1/common/arm/highbd_convolve_sve2.h"
 #include "av1/common/arm/convolve_neon_i8mm.h"
 
-static INLINE int32x4_t highbd_convolve12_4_2d_v(int16x8_t s0[2],
+static inline int32x4_t highbd_convolve12_4_2d_v(int16x8_t s0[2],
                                                  int16x8_t s1[2],
                                                  int16x8_t s2[2],
                                                  int16x8_t filter_0_7,
@@ -42,7 +42,7 @@ static INLINE int32x4_t highbd_convolve12_4_2d_v(int16x8_t s0[2],
   return vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
 }
 
-static INLINE void convolve_2d_sr_vert_12tap_sve2(
+static inline void convolve_2d_sr_vert_12tap_sve2(
    const int16_t *src_ptr, int src_stride, uint8_t *dst_ptr,
    const int dst_stride, int w, int h, const int16x8_t y_filter_0_7,
    const int16x8_t y_filter_4_11) {
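The SVE2 vertical kernel above accumulates 16-bit products into 64-bit lanes
and only narrows via vmovn_s64 at the end. A scalar sketch of the same idea
(illustration only):

#include <stdint.h>

static int32_t dot_s16_wide(const int16_t *s, const int16_t *f, int taps) {
  int64_t sum = 0;  // 64-bit accumulation: no overflow for 12 taps of
                    // high-bitdepth intermediates.
  for (int k = 0; k < taps; k++) sum += (int64_t)s[k] * f[k];
  return (int32_t)sum;  // Narrow once, after all taps are accumulated.
}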
diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c
index c6fc7642ce..36d113cfcf 100644
--- a/av1/common/arm/highbd_compound_convolve_neon.c
+++ b/av1/common/arm/highbd_compound_convolve_neon.c
@@ -23,7 +23,7 @@
 #include "av1/common/arm/highbd_compound_convolve_neon.h"
 #include "av1/common/arm/highbd_convolve_neon.h"
 
-static INLINE uint16x4_t highbd_12_convolve6_4(
+static inline uint16x4_t highbd_12_convolve6_4(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     const int16x8_t filter, const int32x4_t offset) {
@@ -41,10 +41,10 @@ static INLINE uint16x4_t highbd_12_convolve6_4(
   return vqshrun_n_s32(sum, ROUND0_BITS + 2);
 }
 
-static INLINE uint16x4_t
-highbd_convolve6_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-                   const int16x8_t filter, const int32x4_t offset) {
+static inline uint16x4_t highbd_convolve6_4(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x8_t filter, const int32x4_t offset) {
   // Values at indices 0 and 7 of y_filter are zero.
   const int16x4_t filter_0_3 = vget_low_s16(filter);
   const int16x4_t filter_4_7 = vget_high_s16(filter);
@@ -59,7 +59,7 @@ highbd_convolve6_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
   return vqshrun_n_s32(sum, ROUND0_BITS);
 }
 
-static INLINE uint16x8_t highbd_12_convolve6_8(
+static inline uint16x8_t highbd_12_convolve6_8(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
     const int16x8_t filter, const int32x4_t offset) {
@@ -85,10 +85,10 @@ static INLINE uint16x8_t highbd_12_convolve6_8(
                       vqshrun_n_s32(sum1, ROUND0_BITS + 2));
 }
 
-static INLINE uint16x8_t
-highbd_convolve6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                   const int16x8_t filter, const int32x4_t offset) {
+static inline uint16x8_t highbd_convolve6_8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t filter, const int32x4_t offset) {
   // Values at indices 0 and 7 of y_filter are zero.
   const int16x4_t filter_0_3 = vget_low_s16(filter);
   const int16x4_t filter_4_7 = vget_high_s16(filter);
@@ -110,7 +110,7 @@ highbd_convolve6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
   return vcombine_u16(vqshrun_n_s32(sum0, 3), vqshrun_n_s32(sum1, ROUND0_BITS));
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_x_6tap_neon(
+static inline void highbd_12_dist_wtd_convolve_x_6tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *x_filter_ptr, const int offset) {
   const int32x4_t offset_vec = vdupq_n_s32(offset);
@@ -156,7 +156,7 @@ static INLINE void highbd_12_dist_wtd_convolve_x_6tap_neon(
   } while (height != 0);
 }
 
-static INLINE void highbd_dist_wtd_convolve_x_6tap_neon(
+static inline void highbd_dist_wtd_convolve_x_6tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *x_filter_ptr, const int offset) {
   const int32x4_t offset_vec = vdupq_n_s32(offset);
@@ -202,7 +202,7 @@ static INLINE void highbd_dist_wtd_convolve_x_6tap_neon(
   } while (height != 0);
 }
 
-static INLINE uint16x4_t highbd_12_convolve8_4(
+static inline uint16x4_t highbd_12_convolve8_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
@@ -222,11 +222,11 @@ static INLINE uint16x4_t highbd_12_convolve8_4(
   return vqshrun_n_s32(sum, ROUND0_BITS + 2);
 }
 
-static INLINE uint16x4_t
-highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-                   const int16x4_t s6, const int16x4_t s7,
-                   const int16x8_t filter, const int32x4_t offset) {
+static inline uint16x4_t highbd_convolve8_4(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
+    const int32x4_t offset) {
   const int16x4_t filter_0_3 = vget_low_s16(filter);
   const int16x4_t filter_4_7 = vget_high_s16(filter);
 
@@ -242,7 +242,7 @@ highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
   return vqshrun_n_s32(sum, ROUND0_BITS);
 }
 
-static INLINE uint16x8_t highbd_12_convolve8_8(
+static inline uint16x8_t highbd_12_convolve8_8(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
     const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
@@ -272,11 +272,11 @@ static INLINE uint16x8_t highbd_12_convolve8_8(
                       vqshrun_n_s32(sum1, ROUND0_BITS + 2));
 }
 
-static INLINE uint16x8_t
-highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                   const int16x8_t s6, const int16x8_t s7,
-                   const int16x8_t filter, const int32x4_t offset) {
+static inline uint16x8_t highbd_convolve8_8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
+    const int32x4_t offset) {
   const int16x4_t filter_0_3 = vget_low_s16(filter);
   const int16x4_t filter_4_7 = vget_high_s16(filter);
 
@@ -302,7 +302,7 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
                       vqshrun_n_s32(sum1, ROUND0_BITS));
 }
 
-static INLINE uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
+static inline uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
                                                  const int16x4_t x_filter,
                                                  const int32x4_t offset) {
   int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
@@ -313,7 +313,7 @@ static INLINE uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
   return vqshrun_n_s32(sum, 5);
 }
 
-static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
+static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
                                               const int16x4_t x_filter,
                                               const int32x4_t offset) {
   int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
@@ -324,7 +324,7 @@ static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
   return vqshrun_n_s32(sum, ROUND0_BITS);
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_x_neon(
+static inline void highbd_12_dist_wtd_convolve_x_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *x_filter_ptr, const int offset) {
   const int32x4_t offset_vec = vdupq_n_s32(offset);
@@ -399,7 +399,7 @@ static INLINE void highbd_12_dist_wtd_convolve_x_neon(
   }
 }
 
-static INLINE void highbd_dist_wtd_convolve_x_neon(
+static inline void highbd_dist_wtd_convolve_x_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
   const int32x4_t offset_vec = vdupq_n_s32(offset);
@@ -553,7 +553,7 @@ void av1_highbd_dist_wtd_convolve_x_neon(
   }
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_y_6tap_neon(
+static inline void highbd_12_dist_wtd_convolve_y_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
@@ -632,7 +632,7 @@ static INLINE void highbd_12_dist_wtd_convolve_y_6tap_neon(
   }
 }
 
-static INLINE void highbd_dist_wtd_convolve_y_6tap_neon(
+static inline void highbd_dist_wtd_convolve_y_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
@@ -711,7 +711,7 @@ static INLINE void highbd_dist_wtd_convolve_y_6tap_neon(
   }
 }
 
-static INLINE uint16x4_t highbd_12_convolve4_4(
+static inline uint16x4_t highbd_12_convolve4_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
   int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0);
@@ -722,7 +722,7 @@ static INLINE uint16x4_t highbd_12_convolve4_4(
   return vqshrun_n_s32(sum, ROUND0_BITS + 2);
 }
 
-static INLINE uint16x8_t highbd_12_convolve4_8(
+static inline uint16x8_t highbd_12_convolve4_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
   int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0);
@@ -739,7 +739,7 @@ static INLINE uint16x8_t highbd_12_convolve4_8(
                       vqshrun_n_s32(sum1, ROUND0_BITS + 2));
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_y_4tap_neon(
+static inline void highbd_12_dist_wtd_convolve_y_4tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
   const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
@@ -816,7 +816,7 @@ static INLINE void highbd_12_dist_wtd_convolve_y_4tap_neon(
   }
 }
 
-static INLINE uint16x4_t highbd_convolve4_4(
+static inline uint16x4_t highbd_convolve4_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
   int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0);
@@ -827,7 +827,7 @@ static INLINE uint16x4_t highbd_convolve4_4(
   return vqshrun_n_s32(sum, ROUND0_BITS);
 }
 
-static INLINE uint16x8_t highbd_convolve4_8(
+static inline uint16x8_t highbd_convolve4_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
   int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0);
@@ -844,7 +844,7 @@ static INLINE uint16x8_t highbd_convolve4_8(
                       vqshrun_n_s32(sum1, ROUND0_BITS));
 }
 
-static INLINE void highbd_dist_wtd_convolve_y_4tap_neon(
+static inline void highbd_dist_wtd_convolve_y_4tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
   const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
@@ -917,7 +917,7 @@ static INLINE void highbd_dist_wtd_convolve_y_4tap_neon(
   }
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon(
+static inline void highbd_12_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
@@ -999,7 +999,7 @@ static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon(
     } while (w != 0);
   }
 }
-static INLINE void highbd_dist_wtd_convolve_y_8tap_neon(
+static inline void highbd_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
@@ -1180,7 +1180,7 @@ void av1_highbd_dist_wtd_convolve_y_neon(
   }
 }
 
-static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
+static inline void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
                                        uint16_t *dst_ptr, int dst_stride,
                                        int w, int h, const int round_bits,
                                        const int offset) {
@@ -1260,7 +1260,7 @@ void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
   }
 }
 
-static INLINE uint16x4_t highbd_convolve6_4_2d_v(
+static inline uint16x4_t highbd_convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t offset) {
@@ -1278,7 +1278,7 @@ static INLINE uint16x4_t highbd_convolve6_4_2d_v(
   return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
 }
 
-static INLINE uint16x8_t highbd_convolve6_8_2d_v(
+static inline uint16x8_t highbd_convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t offset) {
@@ -1304,7 +1304,7 @@ static INLINE uint16x8_t highbd_convolve6_8_2d_v(
                       vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
 }
 
-static INLINE void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
+static inline void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
@@ -1383,7 +1383,7 @@ static INLINE void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
   }
 }
 
-static INLINE uint16x4_t highbd_convolve8_4_2d_v(
+static inline uint16x4_t highbd_convolve8_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
@@ -1403,7 +1403,7 @@ static INLINE uint16x4_t highbd_convolve8_4_2d_v(
   return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
 }
 
-static INLINE uint16x8_t highbd_convolve8_8_2d_v(
+static inline uint16x8_t highbd_convolve8_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
@@ -1433,7 +1433,7 @@ static INLINE uint16x8_t highbd_convolve8_8_2d_v(
                       vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
 }
 
-static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
+static inline void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
@@ -1516,7 +1516,7 @@ static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
   }
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
+static inline void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
   // The smallest block height is 4, and the horizontal convolution needs to
@@ -1586,7 +1586,7 @@ static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
   } while (--height != 0);
 }
 
-static INLINE void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
+static inline void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
   // The smallest block height is 4, and the horizontal convolution needs to
@@ -1656,7 +1656,7 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
   } while (--height != 0);
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_neon(
+static inline void highbd_12_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
   // The smallest block height is 4, and the horizontal convolution needs to
@@ -1768,7 +1768,7 @@ static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_neon(
   }
 }
 
-static INLINE void highbd_dist_wtd_convolve_2d_horiz_neon(
+static inline void highbd_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
   // The smallest block height is 4, and the horizontal convolution needs to
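The offsets threaded through the kernels above collapse to compile-time
constants once the bitdepth is fixed, which is why the highbd_12_* variants
exist. A scalar sketch of the compound offset computation (illustration
only, using libaom's FILTER_BITS = 7, ROUND0_BITS = 3 and
COMPOUND_ROUND1_BITS = 7):

#include <stdint.h>

static int32_t compound_round_offset(int bd) {
  const int offset_bits = bd + 2 * 7 /* FILTER_BITS */ - 3 /* ROUND0_BITS */;
  // With bd == 12 this is a constant the compiler can fold, so the
  // specialized 12-bit helpers need no bd parameter at all.
  return (1 << (offset_bits - 7 /* COMPOUND_ROUND1_BITS */)) +
         (1 << (offset_bits - 7 - 1));
}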
diff --git a/av1/common/arm/highbd_compound_convolve_neon.h b/av1/common/arm/highbd_compound_convolve_neon.h
index ae5a43be01..badaf7b1b4 100644
--- a/av1/common/arm/highbd_compound_convolve_neon.h
+++ b/av1/common/arm/highbd_compound_convolve_neon.h
@@ -21,7 +21,7 @@
 
 #define ROUND_SHIFT 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS
 
-static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr,
+static inline void highbd_12_comp_avg_neon(const uint16_t *src_ptr,
                                            int src_stride, uint16_t *dst_ptr,
                                            int dst_stride, int w, int h,
                                            ConvolveParams *conv_params) {
@@ -85,7 +85,7 @@ static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr,
   }
 }
 
-static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
+static inline void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
                                         uint16_t *dst_ptr, int dst_stride,
                                         int w, int h,
                                         ConvolveParams *conv_params,
@@ -150,7 +150,7 @@ static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
   }
 }
 
-static INLINE void highbd_12_dist_wtd_comp_avg_neon(
+static inline void highbd_12_dist_wtd_comp_avg_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, ConvolveParams *conv_params) {
   const int offset_bits = 12 + 2 * FILTER_BITS - ROUND0_BITS - 2;
@@ -221,7 +221,7 @@ static INLINE void highbd_12_dist_wtd_comp_avg_neon(
   }
 }
 
-static INLINE void highbd_dist_wtd_comp_avg_neon(
+static inline void highbd_dist_wtd_comp_avg_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, ConvolveParams *conv_params, const int bd) {
   const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
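For reference, a scalar sketch (illustration only) of the per-pixel math in
the comp_avg helpers above: average the two compound predictions, remove the
accumulated offset, apply the final ROUND_SHIFT rounding, and clamp to the
bitdepth range.

#include <stdint.h>

static uint16_t comp_avg_pixel(uint16_t src, uint16_t ref, int32_t offset,
                               int round_shift, int bd) {
  int32_t avg = ((int32_t)src + ref) >> 1;
  int32_t res = (avg - offset + (1 << (round_shift - 1))) >> round_shift;
  if (res < 0) res = 0;  // vqshrun-style clamp at zero.
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(res > max ? max : res);
}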
diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c
index e5909219c3..668dfbf5f7 100644
--- a/av1/common/arm/highbd_compound_convolve_sve2.c
+++ b/av1/common/arm/highbd_compound_convolve_sve2.c
@@ -32,7 +32,7 @@ DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = {
   4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2,
 };
 
-static INLINE uint16x8_t highbd_12_convolve8_8_x(int16x8_t s0[8],
+static inline uint16x8_t highbd_12_convolve8_8_x(int16x8_t s0[8],
                                                  int16x8_t filter,
                                                  int64x2_t offset) {
   int64x2_t sum[8];
@@ -57,7 +57,7 @@ static INLINE uint16x8_t highbd_12_convolve8_8_x(int16x8_t s0[8],
                       vqrshrun_n_s32(sum4567, ROUND0_BITS + 2));
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_x_8tap_sve2(
+static inline void highbd_12_dist_wtd_convolve_x_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr) {
   const int64x1_t offset_vec =
@@ -99,7 +99,7 @@ static INLINE void highbd_12_dist_wtd_convolve_x_8tap_sve2(
   } while (height != 0);
 }
 
-static INLINE uint16x8_t highbd_convolve8_8_x(int16x8_t s0[8], int16x8_t filter,
+static inline uint16x8_t highbd_convolve8_8_x(int16x8_t s0[8], int16x8_t filter,
                                               int64x2_t offset) {
   int64x2_t sum[8];
   sum[0] = aom_sdotq_s16(offset, s0[0], filter);
@@ -123,7 +123,7 @@ static INLINE uint16x8_t highbd_convolve8_8_x(int16x8_t s0[8], int16x8_t filter,
                       vqrshrun_n_s32(sum4567, ROUND0_BITS));
 }
 
-static INLINE void highbd_dist_wtd_convolve_x_8tap_sve2(
+static inline void highbd_dist_wtd_convolve_x_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr, const int bd) {
   const int64x1_t offset_vec =
@@ -171,7 +171,7 @@ DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = {
 };
 // clang-format on
 
-static INLINE uint16x4_t highbd_12_convolve4_4_x(int16x8_t s0, int16x8_t filter,
+static inline uint16x4_t highbd_12_convolve4_4_x(int16x8_t s0, int16x8_t filter,
                                                  int64x2_t offset,
                                                  uint16x8x2_t permute_tbl) {
   int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
@@ -185,7 +185,7 @@ static INLINE uint16x4_t highbd_12_convolve4_4_x(int16x8_t s0, int16x8_t filter,
   return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2);
 }
 
-static INLINE uint16x8_t highbd_12_convolve4_8_x(int16x8_t s0[4],
+static inline uint16x8_t highbd_12_convolve4_8_x(int16x8_t s0[4],
                                                  int16x8_t filter,
                                                  int64x2_t offset,
                                                  uint16x8_t tbl) {
@@ -202,7 +202,7 @@ static INLINE uint16x8_t highbd_12_convolve4_8_x(int16x8_t s0[4],
   return aom_tbl_u16(res, tbl);
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_x_4tap_sve2(
+static inline void highbd_12_dist_wtd_convolve_x_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr) {
   const int64x2_t offset =
@@ -264,7 +264,7 @@ static INLINE void highbd_12_dist_wtd_convolve_x_4tap_sve2(
   }
 }
 
-static INLINE uint16x4_t highbd_convolve4_4_x(int16x8_t s0, int16x8_t filter,
+static inline uint16x4_t highbd_convolve4_4_x(int16x8_t s0, int16x8_t filter,
                                               int64x2_t offset,
                                               uint16x8x2_t permute_tbl) {
   int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
@@ -278,7 +278,7 @@ static INLINE uint16x4_t highbd_convolve4_4_x(int16x8_t s0, int16x8_t filter,
   return vqrshrun_n_s32(sum0123, ROUND0_BITS);
 }
 
-static INLINE uint16x8_t highbd_convolve4_8_x(int16x8_t s0[4], int16x8_t filter,
+static inline uint16x8_t highbd_convolve4_8_x(int16x8_t s0[4], int16x8_t filter,
                                               int64x2_t offset,
                                               uint16x8_t tbl) {
   int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0);
@@ -294,7 +294,7 @@ static INLINE uint16x8_t highbd_convolve4_8_x(int16x8_t s0[4], int16x8_t filter,
   return aom_tbl_u16(res, tbl);
 }
 
-static INLINE void highbd_dist_wtd_convolve_x_4tap_sve2(
+static inline void highbd_dist_wtd_convolve_x_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr, const int bd) {
   const int64x2_t offset =
@@ -438,7 +438,7 @@ void av1_highbd_dist_wtd_convolve_x_sve2(
   }
 }
 
-static INLINE uint16x4_t highbd_12_convolve8_4_y(int16x8_t samples_lo[2],
+static inline uint16x4_t highbd_12_convolve8_4_y(int16x8_t samples_lo[2],
                                                  int16x8_t samples_hi[2],
                                                  int16x8_t filter,
                                                  int64x2_t offset) {
@@ -453,7 +453,7 @@ static INLINE uint16x4_t highbd_12_convolve8_4_y(int16x8_t samples_lo[2],
   return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2);
 }
 
-static INLINE uint16x8_t highbd_12_convolve8_8_y(int16x8_t samples_lo[4],
+static inline uint16x8_t highbd_12_convolve8_8_y(int16x8_t samples_lo[4],
                                                  int16x8_t samples_hi[4],
                                                  int16x8_t filter,
                                                  int64x2_t offset) {
@@ -476,7 +476,7 @@ static INLINE uint16x8_t highbd_12_convolve8_8_y(int16x8_t samples_lo[4],
                       vqrshrun_n_s32(sum4567, ROUND0_BITS + 2));
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_y_8tap_sve2(
+static inline void highbd_12_dist_wtd_convolve_y_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr) {
   const int64x2_t offset =
@@ -614,7 +614,7 @@ static INLINE void highbd_12_dist_wtd_convolve_y_8tap_sve2(
   }
 }
 
-static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2],
+static inline uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2],
                                               int16x8_t samples_hi[2],
                                               int16x8_t filter,
                                               int64x2_t offset) {
@@ -629,7 +629,7 @@ static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2],
   return vqrshrun_n_s32(sum0123, ROUND0_BITS);
 }
 
-static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
+static inline uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
                                               int16x8_t samples_hi[4],
                                               int16x8_t filter,
                                               int64x2_t offset) {
@@ -652,7 +652,7 @@ static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
                       vqrshrun_n_s32(sum4567, ROUND0_BITS));
 }
 
-static INLINE void highbd_dist_wtd_convolve_y_8tap_sve2(
+static inline void highbd_dist_wtd_convolve_y_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr, const int bd) {
   const int64x2_t offset =
@@ -849,7 +849,7 @@ void av1_highbd_dist_wtd_convolve_y_sve2(
   }
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2(
+static inline void highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr) {
   const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 2));
@@ -913,7 +913,7 @@ static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2(
   } while (width != 0);
 }
 
-static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
+static inline void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr, const int bd) {
   const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 2));
@@ -977,7 +977,7 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
   } while (width != 0);
 }
 
-static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2(
+static inline void highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr) {
   const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 1));
@@ -1072,7 +1072,7 @@ static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2(
   }
 }
 
-static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(
+static inline void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr, const int bd) {
   const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 1));
@@ -1166,7 +1166,7 @@ static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(
   }
 }
 
-static INLINE uint16x4_t highbd_convolve8_4_2d_v(int16x8_t samples_lo[2],
+static inline uint16x4_t highbd_convolve8_4_2d_v(int16x8_t samples_lo[2],
                                                  int16x8_t samples_hi[2],
                                                  int16x8_t filter,
                                                  int64x2_t offset) {
@@ -1181,7 +1181,7 @@ static INLINE uint16x4_t highbd_convolve8_4_2d_v(int16x8_t samples_lo[2],
   return vqrshrun_n_s32(sum0123, COMPOUND_ROUND1_BITS);
 }
 
-static INLINE uint16x8_t highbd_convolve8_8_2d_v(int16x8_t samples_lo[4],
+static inline uint16x8_t highbd_convolve8_8_2d_v(int16x8_t samples_lo[4],
                                                  int16x8_t samples_hi[4],
                                                  int16x8_t filter,
                                                  int64x2_t offset) {
@@ -1204,7 +1204,7 @@ static INLINE uint16x8_t highbd_convolve8_8_2d_v(int16x8_t samples_lo[4],
                       vqrshrun_n_s32(sum4567, COMPOUND_ROUND1_BITS));
 }
 
-static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_sve2(
+static inline void highbd_dist_wtd_convolve_2d_vert_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr, int offset) {
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
@@ -1350,7 +1350,7 @@ static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_sve2(
   }
 }
 
-static INLINE uint16x4_t highbd_convolve4_4_2d_v(
+static inline uint16x4_t highbd_convolve4_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
   int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0);
@@ -1361,7 +1361,7 @@ static INLINE uint16x4_t highbd_convolve4_4_2d_v(
   return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
 }
 
-static INLINE uint16x8_t highbd_convolve4_8_2d_v(
+static inline uint16x8_t highbd_convolve4_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
   int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0);
@@ -1378,7 +1378,7 @@ static INLINE uint16x8_t highbd_convolve4_8_2d_v(
                       vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
 }
 
-static INLINE void highbd_dist_wtd_convolve_2d_vert_4tap_neon(
+static inline void highbd_dist_wtd_convolve_2d_vert_4tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
   const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
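The dist_wtd variants referenced throughout the compound paths blend the two
predictions with per-direction weights rather than a plain average. A scalar
sketch (illustration only, assuming libaom's DIST_PRECISION_BITS of 4, so
fwd_offset + bck_offset == 16):

#include <stdint.h>

static int32_t dist_wtd_blend(uint16_t src, uint16_t ref, int fwd_offset,
                              int bck_offset) {
  // Weighted average in place of (src + ref) >> 1.
  return ((int32_t)src * fwd_offset + (int32_t)ref * bck_offset) >> 4;
}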
diff --git a/av1/common/arm/highbd_convolve_neon.c b/av1/common/arm/highbd_convolve_neon.c
index f4e770ae87..17c711bbd5 100644
--- a/av1/common/arm/highbd_convolve_neon.c
+++ b/av1/common/arm/highbd_convolve_neon.c
@@ -21,10 +21,10 @@
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
 
-static INLINE uint16x4_t
-highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-                     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-                     const int16x8_t y_filter, const uint16x4_t max) {
+static inline uint16x4_t highbd_convolve6_4_y(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x8_t y_filter, const uint16x4_t max) {
   // Values at indices 0 and 7 of y_filter are zero.
   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
@@ -40,10 +40,10 @@ highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
   return vmin_u16(res, max);
 }
 
-static INLINE uint16x8_t
-highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                     const int16x8_t y_filter, const uint16x8_t max) {
+static inline uint16x8_t highbd_convolve6_8_y(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t y_filter, const uint16x8_t max) {
   // Values at indices 0 and 7 of y_filter are zero.
   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
@@ -67,7 +67,7 @@ highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
   return vminq_u16(res, max);
 }
 
-static INLINE void highbd_convolve_y_sr_6tap_neon(
+static inline void highbd_convolve_y_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int bd) {
   const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
@@ -149,11 +149,11 @@ static INLINE void highbd_convolve_y_sr_6tap_neon(
   }
 }
 
-static INLINE uint16x4_t
-highbd_convolve8_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-                     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-                     const int16x4_t s6, const int16x4_t s7,
-                     const int16x8_t y_filter, const uint16x4_t max) {
+static inline uint16x4_t highbd_convolve8_4_y(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+    const uint16x4_t max) {
   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
 
@@ -170,11 +170,11 @@ highbd_convolve8_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
   return vmin_u16(res, max);
 }
 
-static INLINE uint16x8_t
-highbd_convolve8_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                     const int16x8_t s6, const int16x8_t s7,
-                     const int16x8_t y_filter, const uint16x8_t max) {
+static inline uint16x8_t highbd_convolve8_8_y(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+    const uint16x8_t max) {
   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
 
@@ -201,7 +201,7 @@ highbd_convolve8_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
   return vminq_u16(res, max);
 }
 
-static INLINE void highbd_convolve_y_sr_8tap_neon(
+static inline void highbd_convolve_y_sr_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
@@ -286,7 +286,7 @@ static INLINE void highbd_convolve_y_sr_8tap_neon(
   }
 }
 
-static INLINE uint16x4_t highbd_convolve12_4_y(
+static inline uint16x4_t highbd_convolve12_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
@@ -313,7 +313,7 @@ static INLINE uint16x4_t highbd_convolve12_4_y(
   return vmin_u16(res, max);
 }
 
-static INLINE uint16x8_t highbd_convolve12_8_y(
+static inline uint16x8_t highbd_convolve12_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
@@ -354,7 +354,7 @@ static INLINE uint16x8_t highbd_convolve12_8_y(
   return vminq_u16(res, max);
 }
 
-static INLINE void highbd_convolve_y_sr_12tap_neon(
+static inline void highbd_convolve_y_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
   const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
@@ -490,7 +490,7 @@ void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride,
                                    y_filter_ptr, bd);
 }
 
-static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
+static inline uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
                                               const int16x8_t x_filter,
                                               const int32x4_t offset,
                                               const uint16x8_t max) {
@@ -519,7 +519,7 @@ static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
   return vminq_u16(res, max);
 }
 
-static INLINE void highbd_convolve_x_sr_6tap_neon(
+static inline void highbd_convolve_x_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
@@ -564,7 +564,7 @@ static INLINE void highbd_convolve_x_sr_6tap_neon(
   } while (height != 0);
 }
 
-static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
+static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
                                               const int16x4_t x_filter,
                                               const int32x4_t offset,
                                               const uint16x4_t max) {
@@ -578,7 +578,7 @@ static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
   return vmin_u16(res, max);
 }
 
-static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
+static inline uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
                                               const int16x8_t x_filter,
                                               const int32x4_t offset,
                                               const uint16x8_t max) {
@@ -610,7 +610,7 @@ static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
   return vminq_u16(res, max);
 }
 
-static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
+static inline void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
                                              int src_stride, uint16_t *dst_ptr,
                                              int dst_stride, int w, int h,
                                             const int16_t *x_filter_ptr,
@@ -683,7 +683,7 @@ static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
   }
 }
 
-static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
+static inline uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
                                                const int16x8_t x_filter_0_7,
                                                const int16x4_t x_filter_8_11,
                                                const int32x4_t offset,
@@ -709,7 +709,7 @@ static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
   return vmin_u16(res, max);
 }
 
-static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
+static inline uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
                                                const int16x8_t x_filter_0_7,
                                                const int16x4_t x_filter_8_11,
                                                const int32x4_t offset,
@@ -750,7 +750,7 @@ static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
   return vminq_u16(res, max);
 }
 
-static INLINE void highbd_convolve_x_sr_12tap_neon(
+static inline void highbd_convolve_x_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
@@ -872,7 +872,7 @@ void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride,
                                    x_filter_ptr, conv_params, bd);
 }
 
-static INLINE uint16x4_t highbd_convolve6_4_2d_v(
+static inline uint16x4_t highbd_convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
@@ -893,7 +893,7 @@ static INLINE uint16x4_t highbd_convolve6_4_2d_v(
   return vmin_u16(res, max);
 }
 
-static INLINE uint16x8_t highbd_convolve6_8_2d_v(
+static inline uint16x8_t highbd_convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
@@
-923,7 +923,7 @@ static INLINE uint16x8_t highbd_convolve6_8_2d_v( return vminq_u16(res, max); } -static INLINE void highbd_convolve_2d_sr_vert_6tap_neon( +static inline void highbd_convolve_2d_sr_vert_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd, const int offset) { @@ -1010,7 +1010,7 @@ static INLINE void highbd_convolve_2d_sr_vert_6tap_neon( } } -static INLINE uint16x4_t highbd_convolve8_4_2d_v( +static inline uint16x4_t highbd_convolve8_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, @@ -1032,7 +1032,7 @@ static INLINE uint16x4_t highbd_convolve8_4_2d_v( return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve8_8_2d_v( +static inline uint16x8_t highbd_convolve8_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, @@ -1065,7 +1065,7 @@ static INLINE uint16x8_t highbd_convolve8_8_2d_v( return vminq_u16(res, max); } -static INLINE void highbd_convolve_2d_sr_vert_8tap_neon( +static inline void highbd_convolve_2d_sr_vert_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd, const int offset) { @@ -1162,7 +1162,7 @@ static INLINE void highbd_convolve_2d_sr_vert_8tap_neon( } } -static INLINE uint16x4_t highbd_convolve12_4_2d_v( +static inline uint16x4_t highbd_convolve12_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, @@ -1190,7 +1190,7 @@ static INLINE uint16x4_t highbd_convolve12_4_2d_v( return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve12_8_2d_v( +static inline uint16x8_t highbd_convolve12_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, @@ -1233,7 +1233,7 @@ static INLINE uint16x8_t highbd_convolve12_8_2d_v( return vminq_u16(res, max); } -static INLINE void highbd_convolve_2d_sr_vert_12tap_neon( +static inline void highbd_convolve_2d_sr_vert_12tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, const int bd, const int offset) { @@ -1342,7 +1342,7 @@ static INLINE void highbd_convolve_2d_sr_vert_12tap_neon( } } -static INLINE uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6], +static inline uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6], const int16x8_t x_filter, const int32x4_t shift_s32, const int32x4_t offset) { @@ -1370,7 +1370,7 @@ static INLINE uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6], return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); } -static INLINE void highbd_convolve_2d_sr_horiz_6tap_neon( +static inline void highbd_convolve_2d_sr_horiz_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, const int offset) { @@ -1441,7 +1441,7 @@ static INLINE void highbd_convolve_2d_sr_horiz_6tap_neon( } while (--height != 0); } 
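// [Editor's note] A minimal scalar model of the separable 2D path that the
// *_2d_sr_horiz_* and *_2d_sr_vert_* helpers here vectorize: a horizontal pass
// writes a rounded intermediate buffer, then a vertical pass applies a second
// rounding shift and clamps to the bit depth. round0/round1 stand in for
// conv_params->round_0/round_1 (both assumed >= 1), and the offset/correction
// terms that the Neon kernels fold into vector constants are omitted. A sketch
// under those assumptions, not the patch's implementation; src must point at
// the first tap of the first output pixel.
static void convolve_2d_sr_ref(const uint16_t *src, int src_stride,
                               uint16_t *dst, int dst_stride, int w, int h,
                               const int16_t *fx, const int16_t *fy, int taps,
                               int round0, int round1, int bd) {
  // Horizontally filtered rows needed by the vertical filter: h + taps - 1.
  static int32_t im[(128 + 12 - 1) * 128];  // assumes w, h <= 128, taps <= 12
  for (int r = 0; r < h + taps - 1; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t sum = 0;
      for (int k = 0; k < taps; ++k) {
        sum += (int32_t)src[r * src_stride + c + k] * fx[k];
      }
      im[r * w + c] = (sum + (1 << (round0 - 1))) >> round0;
    }
  }
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t sum = 0;
      for (int k = 0; k < taps; ++k) {
        sum += im[(r + k) * w + c] * fy[k];
      }
      int32_t res = (sum + (1 << (round1 - 1))) >> round1;
      const int32_t max = (1 << bd) - 1;
      res = res < 0 ? 0 : (res > max ? max : res);
      dst[r * dst_stride + c] = (uint16_t)res;
    }
  }
}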
-static INLINE uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4], +static inline uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4], const int16x4_t x_filter, const int32x4_t shift_s32, const int32x4_t offset) { @@ -1454,7 +1454,7 @@ static INLINE uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4], return vqmovun_s32(sum); } -static INLINE uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8], +static inline uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8], const int16x8_t x_filter, const int32x4_t shift_s32, const int32x4_t offset) { @@ -1485,7 +1485,7 @@ static INLINE uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8], return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); } -static INLINE void highbd_convolve_2d_sr_horiz_neon( +static inline void highbd_convolve_2d_sr_horiz_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, const int offset) { @@ -1601,7 +1601,7 @@ static INLINE void highbd_convolve_2d_sr_horiz_neon( } } -static INLINE uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12], +static inline uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12], const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, const int32x4_t shift_s32, @@ -1626,7 +1626,7 @@ static INLINE uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12], return vqmovun_s32(sum); } -static INLINE uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12], +static inline uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12], const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, const int32x4_t shift_s32, @@ -1666,7 +1666,7 @@ static INLINE uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12], return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); } -static INLINE void highbd_convolve_2d_sr_horiz_12tap_neon( +static inline void highbd_convolve_2d_sr_horiz_12tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, const int offset) { diff --git a/av1/common/arm/highbd_convolve_neon.h b/av1/common/arm/highbd_convolve_neon.h index a32d63e022..72acd0acf3 100644 --- a/av1/common/arm/highbd_convolve_neon.h +++ b/av1/common/arm/highbd_convolve_neon.h @@ -18,7 +18,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/convolve.h" -static INLINE int32x4_t highbd_convolve8_4_s32( +static inline int32x4_t highbd_convolve8_4_s32( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, @@ -38,7 +38,7 @@ static INLINE int32x4_t highbd_convolve8_4_s32( return sum; } -static INLINE uint16x4_t highbd_convolve8_4_sr_s32_s16( +static inline uint16x4_t highbd_convolve8_4_sr_s32_s16( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, @@ -51,7 +51,7 @@ static INLINE uint16x4_t highbd_convolve8_4_sr_s32_s16( } // Like above but also perform round shifting and subtract correction term -static INLINE uint16x4_t highbd_convolve8_4_srsub_s32_s16( +static inline uint16x4_t highbd_convolve8_4_srsub_s32_s16( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, @@ -64,7 +64,7 
+64,7 @@ static INLINE uint16x4_t highbd_convolve8_4_srsub_s32_s16( return vqmovun_s32(sum); } -static INLINE void highbd_convolve8_8_s32( +static inline void highbd_convolve8_8_s32( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, @@ -92,7 +92,7 @@ static INLINE void highbd_convolve8_8_s32( } // Like above but also perform round shifting and subtract correction term -static INLINE uint16x8_t highbd_convolve8_8_srsub_s32_s16( +static inline uint16x8_t highbd_convolve8_8_srsub_s32_s16( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, @@ -109,7 +109,7 @@ static INLINE uint16x8_t highbd_convolve8_8_srsub_s32_s16( return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); } -static INLINE int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32( +static inline int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t *filters_lo, const int16x4_t *filters_hi, const int32x4_t offset) { @@ -133,7 +133,7 @@ static INLINE int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32( return sum; } -static INLINE uint16x4_t highbd_convolve8_2d_scale_horiz4x8_s32_s16( +static inline uint16x4_t highbd_convolve8_2d_scale_horiz4x8_s32_s16( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t *filters_lo, const int16x4_t *filters_hi, const int32x4_t shift_s32, diff --git a/av1/common/arm/highbd_convolve_scale_neon.c b/av1/common/arm/highbd_convolve_scale_neon.c index a51848118a..07e60f2180 100644 --- a/av1/common/arm/highbd_convolve_scale_neon.c +++ b/av1/common/arm/highbd_convolve_scale_neon.c @@ -23,7 +23,7 @@ #include "av1/common/filter.h" #include "av1/common/arm/highbd_convolve_neon.h" -static INLINE void highbd_dist_wtd_comp_avg_neon( +static inline void highbd_dist_wtd_comp_avg_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, ConvolveParams *conv_params, const int round_bits, const int offset, const int bd) { @@ -98,7 +98,7 @@ static INLINE void highbd_dist_wtd_comp_avg_neon( } } -static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, +static inline void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, ConvolveParams *conv_params, @@ -167,7 +167,7 @@ static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, } } -static INLINE void highbd_convolve_2d_x_scale_8tap_neon( +static inline void highbd_convolve_2d_x_scale_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int subpel_x_qn, const int x_step_qn, const InterpFilterParams *filter_params, ConvolveParams *conv_params, @@ -368,7 +368,7 @@ static INLINE void highbd_convolve_2d_x_scale_8tap_neon( } } -static INLINE void highbd_convolve_2d_y_scale_8tap_neon( +static inline void highbd_convolve_2d_y_scale_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int subpel_y_qn, const int y_step_qn, const InterpFilterParams *filter_params, const int round1_bits, @@ -443,7 +443,7 @@ static INLINE void highbd_convolve_2d_y_scale_8tap_neon( } } -static INLINE void highbd_convolve_correct_offset_neon( +static inline void
highbd_convolve_correct_offset_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int round_bits, const int offset, const int bd) { const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits); diff --git a/av1/common/arm/highbd_convolve_sve2.c b/av1/common/arm/highbd_convolve_sve2.c index 5de87d8291..fcf9d7b0a0 100644 --- a/av1/common/arm/highbd_convolve_sve2.c +++ b/av1/common/arm/highbd_convolve_sve2.c @@ -29,7 +29,7 @@ DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2, }; -static INLINE uint16x4_t convolve12_4_x( +static inline uint16x4_t convolve12_4_x( int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11, const int64x2_t offset, uint16x8x4_t permute_tbl, uint16x4_t max) { int16x8_t permuted_samples[6]; @@ -56,7 +56,7 @@ static INLINE uint16x4_t convolve12_4_x( return vmin_u16(res, max); } -static INLINE uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1, +static inline uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1, int16x8_t s2, int16x8_t filter_0_7, int16x8_t filter_4_11, int64x2_t offset, uint16x8x4_t permute_tbl, @@ -100,7 +100,7 @@ static INLINE uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1, return vminq_u16(res, max); } -static INLINE void highbd_convolve_x_sr_12tap_sve2( +static inline void highbd_convolve_x_sr_12tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd) { @@ -182,7 +182,7 @@ static INLINE void highbd_convolve_x_sr_12tap_sve2( } } -static INLINE uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter, +static inline uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter, int64x2_t offset, uint16x8_t max) { int64x2_t sum[8]; sum[0] = aom_sdotq_s16(offset, s0[0], filter); @@ -208,7 +208,7 @@ static INLINE uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter, return vminq_u16(res, max); } -static INLINE void highbd_convolve_x_sr_8tap_sve2( +static inline void highbd_convolve_x_sr_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd) { @@ -258,7 +258,7 @@ DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { }; // clang-format on -static INLINE uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter, +static inline uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter, int64x2_t offset, uint16x8x2_t permute_tbl, uint16x4_t max) { @@ -274,7 +274,7 @@ static INLINE uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter, return vmin_u16(res, max); } -static INLINE uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter, +static inline uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter, int64x2_t offset, uint16x8_t tbl, uint16x8_t max) { int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); @@ -292,7 +292,7 @@ static INLINE uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter, return vminq_u16(res, max); } -static INLINE void highbd_convolve_x_sr_4tap_sve2( +static inline void highbd_convolve_x_sr_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, ConvolveParams *conv_params, int bd) { @@ -399,7 +399,7 @@ void av1_highbd_convolve_x_sr_sve2(const uint16_t *src, int src_stride, x_filter_ptr, conv_params, bd); } -static INLINE uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], 
int16x8_t s1[2], +static inline uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7, int16x8_t filter_4_11, @@ -421,7 +421,7 @@ static INLINE uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], int16x8_t s1[2], return vmin_u16(res, max); } -static INLINE void highbd_convolve_y_sr_12tap_sve2( +static inline void highbd_convolve_y_sr_12tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, int bd) { const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); @@ -518,7 +518,7 @@ static INLINE void highbd_convolve_y_sr_12tap_sve2( } while (width != 0); } -static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], +static inline uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, uint16x4_t max) { @@ -535,7 +535,7 @@ static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], +static inline uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, uint16x8_t max) { @@ -706,7 +706,7 @@ static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, } } -static INLINE uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2], +static inline uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2], int16x8_t filter, uint16x4_t max) { int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0); @@ -717,7 +717,7 @@ static INLINE uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2], return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4], +static inline uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4], int16x8_t filter, uint16x8_t max) { int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0); @@ -864,7 +864,7 @@ void av1_highbd_convolve_y_sr_sve2(const uint16_t *src, int src_stride, y_filter_ptr, bd); } -static INLINE uint16x4_t convolve12_4_2d_h( +static inline uint16x4_t convolve12_4_2d_h( int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11, const int64x2_t offset, int32x4_t shift, uint16x8x4_t permute_tbl) { int16x8_t permuted_samples[6]; @@ -890,7 +890,7 @@ static INLINE uint16x4_t convolve12_4_2d_h( return vqmovun_s32(sum0123); } -static INLINE uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1, +static inline uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1, int16x8_t s2, int16x8_t filter_0_7, int16x8_t filter_4_11, int64x2_t offset, int32x4_t shift, @@ -934,7 +934,7 @@ static INLINE uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1, return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); } -static INLINE void highbd_convolve_2d_sr_horiz_12tap_sve2( +static inline void highbd_convolve_2d_sr_horiz_12tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, const int x_offset) { @@ -1017,7 +1017,7 @@ static INLINE void highbd_convolve_2d_sr_horiz_12tap_sve2( } } -static INLINE uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter, +static inline uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter, int64x2_t offset, int32x4_t shift) { int64x2_t sum[8]; sum[0] = aom_sdotq_s16(offset, s0[0], filter); @@ -1043,7 +1043,7 @@ static INLINE uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter, return 
vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); } -static INLINE void highbd_convolve_2d_sr_horiz_8tap_sve2( +static inline void highbd_convolve_2d_sr_horiz_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, const int x_offset) { @@ -1086,7 +1086,7 @@ static INLINE void highbd_convolve_2d_sr_horiz_8tap_sve2( } while (height > 0); } -static INLINE uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter, +static inline uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter, int64x2_t offset, int32x4_t shift, uint16x8x2_t permute_tbl) { int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); @@ -1100,7 +1100,7 @@ static INLINE uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter, return vqmovun_s32(sum0123); } -static INLINE uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter, +static inline uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter, int64x2_t offset, int32x4_t shift, uint16x8_t tbl) { int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); @@ -1118,7 +1118,7 @@ static INLINE uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter, return aom_tbl_u16(res, tbl); } -static INLINE void highbd_convolve_2d_sr_horiz_4tap_sve2( +static inline void highbd_convolve_2d_sr_horiz_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, ConvolveParams *conv_params, const int x_offset) { @@ -1185,7 +1185,7 @@ static INLINE void highbd_convolve_2d_sr_horiz_4tap_sve2( } } -static INLINE uint16x4_t highbd_convolve12_4_2d_v( +static inline uint16x4_t highbd_convolve12_4_2d_v( int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7, int16x8_t filter_4_11, int32x4_t shift, int64x2_t offset, uint16x4_t max) { int64x2_t sum01 = aom_svdot_lane_s16(offset, s0[0], filter_0_7, 0); @@ -1204,7 +1204,7 @@ static INLINE uint16x4_t highbd_convolve12_4_2d_v( return vmin_u16(res, max); } -static INLINE void highbd_convolve_2d_sr_vert_12tap_sve2( +static inline void highbd_convolve_2d_sr_vert_12tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd, const int y_offset) { @@ -1307,7 +1307,7 @@ static INLINE void highbd_convolve_2d_sr_vert_12tap_sve2( } while (width != 0); } -static INLINE uint16x4_t highbd_convolve8_4_2d_v( +static inline uint16x4_t highbd_convolve8_4_2d_v( int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, int32x4_t shift, int64x2_t offset, uint16x4_t max) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); @@ -1323,7 +1323,7 @@ static INLINE uint16x4_t highbd_convolve8_4_2d_v( return vmin_u16(res, max); } -static INLINE uint16x8_t highbd_convolve8_8_2d_v( +static inline uint16x8_t highbd_convolve8_8_2d_v( int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, int32x4_t shift, int64x2_t offset, uint16x8_t max) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); @@ -1501,7 +1501,7 @@ static void highbd_convolve_2d_sr_vert_8tap_sve2( } } -static INLINE uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2], +static inline uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2], int16x8_t filter, int32x4_t shift, int64x2_t offset, @@ -1516,7 +1516,7 @@ static INLINE uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2], return vmin_u16(res, max); } -static 
INLINE uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4], +static inline uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4], int16x8_t filter, int32x4_t shift, int64x2_t offset, diff --git a/av1/common/arm/highbd_convolve_sve2.h b/av1/common/arm/highbd_convolve_sve2.h index 380607716f..abbad14cbb 100644 --- a/av1/common/arm/highbd_convolve_sve2.h +++ b/av1/common/arm/highbd_convolve_sve2.h @@ -27,7 +27,7 @@ DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { }; // clang-format on -static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, +static inline void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, int16x4_t s2, int16x4_t s3, int16x8_t res[2]) { // Transpose 16-bit elements and concatenate result rows as follows: @@ -53,7 +53,7 @@ static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, res[1] = vreinterpretq_s16_s32(s0123.val[1]); } -static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, +static inline void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, int16x8_t s2, int16x8_t s3, int16x8_t res[4]) { // Transpose 16-bit elements and concatenate result rows as follows: @@ -80,7 +80,7 @@ static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); } -static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], +static inline void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], uint16x8_t tbl, int16x8_t res[4]) { res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); @@ -88,7 +88,7 @@ static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], res[3] = aom_tbl2_s16(t0[3], t1[3], tbl); } -static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], +static inline void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], uint16x8_t tbl, int16x8_t res[2]) { res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); diff --git a/av1/common/arm/highbd_inv_txfm_neon.c b/av1/common/arm/highbd_inv_txfm_neon.c index cfaa3e5ca7..70f65101c5 100644 --- a/av1/common/arm/highbd_inv_txfm_neon.c +++ b/av1/common/arm/highbd_inv_txfm_neon.c @@ -51,11 +51,11 @@ } while (0) #endif // AOM_ARCH_AARCH64 -static INLINE void transpose_4x4(const int32x4_t *in, int32x4_t *out) { +static inline void transpose_4x4(const int32x4_t *in, int32x4_t *out) { TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]); } -static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) { +static inline void transpose_8x8(const int32x4_t *in, int32x4_t *out) { TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]); TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]); TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]); @@ -63,7 +63,7 @@ static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) { out[15]); } -static INLINE void round_shift_array_32_neon(int32x4_t *input, +static inline void round_shift_array_32_neon(int32x4_t *input, int32x4_t *output, const int size, const int bit) { const int32x4_t v_bit = vdupq_n_s32(-bit); @@ -72,7 +72,7 @@ static INLINE void round_shift_array_32_neon(int32x4_t *input, } } -static INLINE void round_shift_rect_array_32_neon(int32x4_t *input, +static inline void round_shift_rect_array_32_neon(int32x4_t *input, int32x4_t *output, const int size) { for (int i = 0; i < size; i++) { @@ -81,7 +81,7 @@ static INLINE void round_shift_rect_array_32_neon(int32x4_t *input, } } -static INLINE int32x4_t 
half_btf_neon_r(const int32_t *n0, const int32x4_t *w0, +static inline int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0, const int32_t *n1, const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { @@ -92,7 +92,7 @@ static INLINE int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0, return x; } -static INLINE int32x4_t half_btf_neon_mode11_r( +static inline int32x4_t half_btf_neon_mode11_r( const int32_t *n0, const int32x4_t *w0, const int32_t *n1, const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t x; @@ -102,7 +102,7 @@ static INLINE int32x4_t half_btf_neon_mode11_r( return x; } -static INLINE int32x4_t half_btf_neon_mode01_r( +static inline int32x4_t half_btf_neon_mode01_r( const int32_t *n0, const int32x4_t *w0, const int32_t *n1, const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t x; @@ -112,7 +112,7 @@ static INLINE int32x4_t half_btf_neon_mode01_r( return x; } -static INLINE int32x4_t half_btf_neon_mode10_r( +static inline int32x4_t half_btf_neon_mode10_r( const int32_t *n0, const int32x4_t *w0, const int32_t *n1, const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t x; @@ -122,7 +122,7 @@ static INLINE int32x4_t half_btf_neon_mode10_r( return x; } -static INLINE int32x4_t half_btf_0_neon_r(const int32_t *n0, +static inline int32x4_t half_btf_0_neon_r(const int32_t *n0, const int32x4_t *w0, const int32x4_t *v_bit, const int32x4_t *rnding) { @@ -132,7 +132,7 @@ static INLINE int32x4_t half_btf_0_neon_r(const int32_t *n0, return x; } -static INLINE int32x4_t half_btf_0_m_neon_r(const int32_t *n0, +static inline int32x4_t half_btf_0_m_neon_r(const int32_t *n0, const int32x4_t *w0, const int32x4_t *v_bit, const int32x4_t *rnding) { @@ -142,7 +142,7 @@ static INLINE int32x4_t half_btf_0_m_neon_r(const int32_t *n0, return x; } -static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) { +static inline void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) { for (int i = 0; i < size; ++i) { out[size - i - 1] = in[i]; } @@ -155,7 +155,7 @@ typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit, int32_t do_cols, int32_t bd, int32_t out_shift); -static INLINE uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min, +static inline uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min, const uint16x8_t *max) { int16x8_t clamped; clamped = vminq_s16(vreinterpretq_s16_u16(*u), vreinterpretq_s16_u16(*max)); @@ -163,7 +163,7 @@ static INLINE uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min, return vreinterpretq_u16_s16(clamped); } -static INLINE void round_shift_4x4(int32x4_t *in, int shift) { +static inline void round_shift_4x4(int32x4_t *in, int shift) { if (shift != 0) { const int32x4_t v_shift = vdupq_n_s32(-shift); in[0] = vrshlq_s32(in[0], v_shift); @@ -213,7 +213,7 @@ static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out, } } -static INLINE uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred, +static inline uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred, int32x4_t res0, int32x4_t res1, const int bd) { @@ -238,7 +238,7 @@ static INLINE uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred, return res; } -static INLINE uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred, +static inline uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred, int32x4_t res0, const int bd) { uint16x4_t x0_ = vreinterpret_u16_s16( @@ -250,7 +250,7 @@ static INLINE uint16x4_t 
highbd_get_recon_4xn_neon(uint16x4_t pred, return vget_low_u16(x0); } -static INLINE void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output, +static inline void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; @@ -263,7 +263,7 @@ static INLINE void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output, } } -static INLINE void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output, +static inline void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; @@ -276,14 +276,14 @@ static INLINE void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output, } } -static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, +static inline void load_buffer_32bit_input(const int32_t *in, int stride, int32x4_t *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = vld1q_s32(in + i * stride); } } -static INLINE void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) { +static inline void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) { in[0] = vld1q_s32(coeff + 0); in[1] = vld1q_s32(coeff + 4); in[2] = vld1q_s32(coeff + 8); @@ -321,7 +321,7 @@ static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1, *in1 = in1_w_offset; } -static INLINE void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi, +static inline void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t temp1, temp2; @@ -350,7 +350,7 @@ static INLINE void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi, bf1[22] = temp2; } -static INLINE void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi, +static inline void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, @@ -378,7 +378,7 @@ static INLINE void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi, addsub_neon(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); } -static INLINE void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi, +static inline void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, @@ -417,7 +417,7 @@ static INLINE void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi, bf1[21] = temp2; } -static INLINE void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi, +static inline void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, @@ -448,7 +448,7 @@ static INLINE void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi, addsub_neon(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); } -static INLINE void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi, +static inline void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, @@ -484,7 +484,7 @@ static INLINE void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi, bf1[23] = temp2; } -static INLINE void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out, +static inline void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out, const int do_cols, const int bd, const int out_shift, const int32x4_t *clamp_lo, @@ -2841,7 +2841,7 @@ static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit, } } 
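// [Editor's note] What the highbd_get_recon_*_neon and
// highbd_write_buffer_*xn_neon helpers above compute per pixel: add the
// inverse-transform residual to the prediction and clamp to the valid range
// for the bit depth. A hedged scalar illustration, not part of the patch; the
// Neon versions operate on 4- or 8-wide vectors and handle the optional
// vertical flip via the flipud row index.
static inline uint16_t highbd_recon_pixel_ref(uint16_t pred, int32_t residual,
                                              int bd) {
  int32_t res = (int32_t)pred + residual;
  const int32_t max = (1 << bd) - 1;
  if (res < 0) res = 0;
  if (res > max) res = max;
  return (uint16_t)res;
}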
-static INLINE void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi, +static inline void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, @@ -2907,7 +2907,7 @@ static INLINE void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi, u[43] = temp4; } -static INLINE void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi, +static inline void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, @@ -2946,7 +2946,7 @@ static INLINE void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi, } } -static INLINE void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi, +static inline void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, @@ -2998,7 +2998,7 @@ static INLINE void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi, u[47] = temp4; } -static INLINE void idct64_stage11_neon(int32x4_t *u, int32x4_t *out, +static inline void idct64_stage11_neon(int32x4_t *u, int32x4_t *out, int do_cols, int bd, int out_shift, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) { @@ -5050,7 +5050,7 @@ DECLARE_ALIGNED(16, static const int16_t *, av1_eob_to_eobxy_32x16_default, }; -static INLINE void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby, +static inline void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { if (eob == 1) { *eobx = 0; @@ -5065,7 +5065,7 @@ static INLINE void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby, *eoby = eobxy >> 8; } -static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, +static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby, TX_SIZE tx_size) { if (tx_size == 2) { *eoby = 15, *eobx = 15; @@ -5098,14 +5098,14 @@ static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, } } -static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, +static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, TX_SIZE tx_size) { const int txfm_size_row = tx_size_high[tx_size]; *eoby = AOMMIN(32, txfm_size_row) - 1; *eobx = 0; } -static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, +static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, TX_SIZE tx_size) { const int txfm_size_col = tx_size_wide[tx_size]; *eobx = AOMMIN(32, txfm_size_col) - 1; diff --git a/av1/common/arm/highbd_reconinter_neon.c b/av1/common/arm/highbd_reconinter_neon.c index 10f592f257..ec938c5b25 100644 --- a/av1/common/arm/highbd_reconinter_neon.c +++ b/av1/common/arm/highbd_reconinter_neon.c @@ -19,7 +19,7 @@ #include "aom_ports/mem.h" #include "config/av1_rtcd.h" -static INLINE void diffwtd_mask_highbd_neon(uint8_t *mask, bool inverse, +static inline void diffwtd_mask_highbd_neon(uint8_t *mask, bool inverse, const uint16_t *src0, int src0_stride, const uint16_t *src1, diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c index b5b85fe693..044a5f80ba 100644 --- a/av1/common/arm/highbd_wiener_convolve_neon.c +++ b/av1/common/arm/highbd_wiener_convolve_neon.c @@ -18,7 +18,7 @@ #include "config/av1_rtcd.h" #define HBD_WIENER_5TAP_HORIZ(name, shift) \ - static INLINE uint16x8_t name##_wiener_convolve5_8_2d_h( \ + static inline uint16x8_t name##_wiener_convolve5_8_2d_h( \ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ const int16x8_t s3, const int16x8_t s4, const 
int16x4_t x_filter, \ const int32x4_t round_vec, const uint16x8_t im_max_val) { \ @@ -43,7 +43,7 @@ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \ } \ \ - static INLINE void name##_convolve_add_src_5tap_horiz( \ + static inline void name##_convolve_add_src_5tap_horiz( \ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \ const int32x4_t round_vec, const uint16x8_t im_max_val) { \ @@ -76,7 +76,7 @@ HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) #undef HBD_WIENER_5TAP_HORIZ #define HBD_WIENER_7TAP_HORIZ(name, shift) \ - static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h( \ + static inline uint16x8_t name##_wiener_convolve7_8_2d_h( \ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \ @@ -104,7 +104,7 @@ HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \ } \ \ - static INLINE void name##_convolve_add_src_7tap_horiz( \ + static inline void name##_convolve_add_src_7tap_horiz( \ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \ const int32x4_t round_vec, const uint16x8_t im_max_val) { \ @@ -137,7 +137,7 @@ HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) #undef HBD_WIENER_7TAP_HORIZ #define HBD_WIENER_5TAP_VERT(name, shift) \ - static INLINE uint16x8_t name##_wiener_convolve5_8_2d_v( \ + static inline uint16x8_t name##_wiener_convolve5_8_2d_v( \ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, \ const int32x4_t round_vec, const uint16x8_t res_max_val) { \ @@ -167,7 +167,7 @@ HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \ } \ \ - static INLINE void name##_convolve_add_src_5tap_vert( \ + static inline void name##_convolve_add_src_5tap_vert( \ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \ const int32x4_t round_vec, const uint16x8_t res_max_val) { \ @@ -221,7 +221,7 @@ HBD_WIENER_5TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) #undef HBD_WIENER_5TAP_VERT #define HBD_WIENER_7TAP_VERT(name, shift) \ - static INLINE uint16x8_t name##_wiener_convolve7_8_2d_v( \ + static inline uint16x8_t name##_wiener_convolve7_8_2d_v( \ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, \ @@ -255,7 +255,7 @@ HBD_WIENER_5TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \ } \ \ - static INLINE void name##_convolve_add_src_7tap_vert( \ + static inline void name##_convolve_add_src_7tap_vert( \ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \ const int32x4_t round_vec, const uint16x8_t res_max_val) { \ diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c index 898bd5a54e..fea7d1db08 100644 --- a/av1/common/arm/resize_neon.c +++ b/av1/common/arm/resize_neon.c @@ -18,7 +18,7 @@ #include "config/av1_rtcd.h" #include "config/aom_scale_rtcd.h" -static 
INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, +static inline uint8x8_t scale_filter_8(const uint8x8_t *const s, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); @@ -44,7 +44,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, +static inline void scale_plane_2_to_1_phase_0(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, @@ -68,7 +68,7 @@ static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, } while (--y); } -static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, +static inline void scale_plane_4_to_1_phase_0(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, @@ -92,7 +92,7 @@ static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, } while (--y); } -static INLINE void scale_plane_bilinear_kernel( +static inline void scale_plane_bilinear_kernel( const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2, const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1, uint8_t *const dst) { @@ -118,7 +118,7 @@ static INLINE void scale_plane_bilinear_kernel( vst1q_u8(dst, d); } -static INLINE void scale_plane_2_to_1_bilinear( +static inline void scale_plane_2_to_1_bilinear( const uint8_t *const src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int16_t c0, const int16_t c1) { @@ -153,7 +153,7 @@ static INLINE void scale_plane_2_to_1_bilinear( } while (--y); } -static INLINE void scale_plane_4_to_1_bilinear( +static inline void scale_plane_4_to_1_bilinear( const uint8_t *const src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int16_t c0, const int16_t c1) { @@ -415,7 +415,7 @@ static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, } while (x); } -static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, +static inline uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, const uint8x8_t *const coef) { const uint16x8_t h0 = vmull_u8(s[0], coef[0]); const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); @@ -727,7 +727,7 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, } // There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling in NEON. 
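// [Editor's note] A hedged sketch of the ratio test implied by the comment
// above: the Neon fast paths cover only 1/4, 1/2 and 3/4 downscaling, so a
// dispatcher can check for those exact width/height ratios before falling
// back to the generic scaler. The helper name is illustrative, not the
// patch's code; the real has_normative_scaler_neon() may apply further
// constraints. bool is available since the surrounding file already uses it.
static bool is_supported_neon_scale_ratio(int src_w, int src_h, int dst_w,
                                          int dst_h) {
  const bool is_4_to_1 = 4 * dst_w == src_w && 4 * dst_h == src_h;  // 1/4
  const bool is_2_to_1 = 2 * dst_w == src_w && 2 * dst_h == src_h;  // 1/2
  const bool is_4_to_3 =
      4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h;  // 3/4
  return is_4_to_1 || is_2_to_1 || is_4_to_3;
}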
-static INLINE bool has_normative_scaler_neon(const int src_width, +static inline bool has_normative_scaler_neon(const int src_width, const int src_height, const int dst_width, const int dst_height) { diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c index 8597d2426c..213244806e 100644 --- a/av1/common/arm/selfguided_neon.c +++ b/av1/common/arm/selfguided_neon.c @@ -30,7 +30,7 @@ #define NB_EVEN 5 #define NB_ODD 4 -static INLINE void calc_ab_fast_internal_common( +static inline void calc_ab_fast_internal_common( uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5, int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec, @@ -115,7 +115,7 @@ static INLINE void calc_ab_fast_internal_common( vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), vreinterpretq_s32_u32(p3)); } -static INLINE void calc_ab_internal_common( +static inline void calc_ab_internal_common( uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0, uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4, @@ -260,7 +260,7 @@ static INLINE void calc_ab_internal_common( vreinterpretq_s32_u32(p7)); } -static INLINE void boxsum2_square_sum_calc( +static inline void boxsum2_square_sum_calc( int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5, int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10, int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) { @@ -294,7 +294,7 @@ static INLINE void boxsum2_square_sum_calc( *r3 = vaddq_s32(r789, r1011); } -static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, +static inline void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, int32_t *dst32, int32_t *dst2, const int dst_stride, const int width, const int height) { assert(width > 2 * SGRPROJ_BORDER_HORZ); @@ -472,7 +472,7 @@ static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, } } -static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, +static inline void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, uint16_t *B16, int32_t *B, const int buf_stride, const int width, const int height, const int r, @@ -525,7 +525,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, +static inline void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, uint16_t *B16, int32_t *B, const int buf_stride, const int width, const int height, const int bit_depth, @@ -591,7 +591,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, } #endif // CONFIG_AV1_HIGHBITDEPTH -static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, +static inline void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, const int width, const int height, const int r, const int s, @@ -644,7 +644,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, +static inline void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, const int width, const int height, const int bit_depth, const int r, @@ -699,7 +699,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, } #endif // 
CONFIG_AV1_HIGHBITDEPTH -static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, +static inline void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, int32_t *dst2, const int dst_stride, const int width, const int height) { assert(width > 2 * SGRPROJ_BORDER_HORZ); @@ -902,7 +902,7 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, } } -static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) { +static inline int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) { int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; int32x4_t fours, threes, res; @@ -922,7 +922,7 @@ static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) { return res; } -static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride, +static inline void cross_sum_inp_u16(uint16_t *buf, int buf_stride, int32x4_t *a0, int32x4_t *a1) { uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; uint16x8_t r0, r1; @@ -957,7 +957,7 @@ static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride, vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1)))); } -static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) { +static inline int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) { int32x4_t xtr, xt, xtl, xbr, xb, xbl; int32x4_t fives, sixes, fives_plus_sixes; @@ -976,7 +976,7 @@ static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) { vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); } -static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride, +static inline void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride, int32x4_t *a0, int32x4_t *a1) { uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0; @@ -1004,7 +1004,7 @@ static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride, vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb)))); } -static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) { +static inline int32x4_t cross_sum_fast_odd_row(int32_t *buf) { int32x4_t xl, x, xr; int32x4_t fives, sixes, fives_plus_sixes; @@ -1019,7 +1019,7 @@ static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) { vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); } -static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, +static inline void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, int32x4_t *a1) { uint16x8_t xl, x, xr; uint16x8_t x0; @@ -1174,7 +1174,7 @@ static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, } while (h > 0); } -static INLINE int restoration_fast_internal(uint16_t *dgd16, int width, +static inline int restoration_fast_internal(uint16_t *dgd16, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, @@ -1244,7 +1244,7 @@ static INLINE int restoration_fast_internal(uint16_t *dgd16, int width, return 0; } -static INLINE int restoration_internal(uint16_t *dgd16, int width, int height, +static inline int restoration_internal(uint16_t *dgd16, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { @@ -1314,7 +1314,7 @@ static INLINE int restoration_internal(uint16_t *dgd16, int width, int height, return 0; } -static INLINE void src_convert_u8_to_u16(const uint8_t *src, +static inline void src_convert_u8_to_u16(const uint8_t *src, const int src_stride, uint16_t *dst, const int dst_stride, 
const int width, const int height) { @@ -1369,7 +1369,7 @@ static INLINE void src_convert_u8_to_u16(const uint8_t *src, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, +static inline void src_convert_hbd_copy(const uint16_t *src, int src_stride, uint16_t *dst, const int dst_stride, int width, int height) { const uint16_t *src_ptr; diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c index 0457f66c26..e2887b95d8 100644 --- a/av1/common/arm/wiener_convolve_neon.c +++ b/av1/common/arm/wiener_convolve_neon.c @@ -22,7 +22,7 @@ #include "av1/common/common.h" #include "av1/common/restoration.h" -static INLINE uint16x8_t wiener_convolve5_8_2d_h( +static inline uint16x8_t wiener_convolve5_8_2d_h( const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2, const uint8x8_t t3, const uint8x8_t t4, const int16x4_t x_filter, const int32x4_t round_vec, const uint16x8_t im_max_val) { @@ -47,7 +47,7 @@ static INLINE uint16x8_t wiener_convolve5_8_2d_h( return vminq_u16(res, im_max_val); } -static INLINE void convolve_add_src_horiz_5tap_neon( +static inline void convolve_add_src_horiz_5tap_neon( const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, const int32x4_t round_vec, const uint16x8_t im_max_val) { @@ -74,7 +74,7 @@ static INLINE void convolve_add_src_horiz_5tap_neon( } while (--h != 0); } -static INLINE uint16x8_t wiener_convolve7_8_2d_h( +static inline uint16x8_t wiener_convolve7_8_2d_h( const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2, const uint8x8_t t3, const uint8x8_t t4, const uint8x8_t t5, const uint8x8_t t6, const int16x4_t x_filter, const int32x4_t round_vec, @@ -102,7 +102,7 @@ static INLINE uint16x8_t wiener_convolve7_8_2d_h( return vminq_u16(res, im_max_val); } -static INLINE void convolve_add_src_horiz_7tap_neon( +static inline void convolve_add_src_horiz_7tap_neon( const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, const int32x4_t round_vec, const uint16x8_t im_max_val) { @@ -129,7 +129,7 @@ static INLINE void convolve_add_src_horiz_7tap_neon( } while (--h != 0); } -static INLINE uint8x8_t wiener_convolve5_8_2d_v( +static inline uint8x8_t wiener_convolve5_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, const int32x4_t round_vec) { @@ -152,7 +152,7 @@ static INLINE uint8x8_t wiener_convolve5_8_2d_v( return vqmovun_s16(vcombine_s16(res_lo, res_hi)); } -static INLINE void convolve_add_src_vert_5tap_neon( +static inline void convolve_add_src_vert_5tap_neon( const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, const int32x4_t round_vec) { @@ -200,7 +200,7 @@ static INLINE void convolve_add_src_vert_5tap_neon( } while (w != 0); } -static INLINE uint8x8_t wiener_convolve7_8_2d_v( +static inline uint8x8_t wiener_convolve7_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec) { @@ -226,7 +226,7 @@ static INLINE uint8x8_t wiener_convolve7_8_2d_v( return vqmovun_s16(vcombine_s16(res_lo, res_hi)); } -static INLINE void convolve_add_src_vert_7tap_neon( +static inline void convolve_add_src_vert_7tap_neon( const uint16_t *src, ptrdiff_t src_stride, uint8_t 
*dst, ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, const int32x4_t round_vec) { diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h index 2d4ff64042..113b3f1866 100644 --- a/av1/common/av1_common_int.h +++ b/av1/common/av1_common_int.h @@ -1088,13 +1088,13 @@ static void unlock_buffer_pool(BufferPool *const pool) { #endif } -static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { +static inline YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] == NULL) return NULL; return &cm->ref_frame_map[index]->buf; } -static INLINE int get_free_fb(AV1_COMMON *cm) { +static inline int get_free_fb(AV1_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; @@ -1128,7 +1128,7 @@ static INLINE int get_free_fb(AV1_COMMON *cm) { return i; } -static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) { +static inline RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) { // Release the previously-used frame-buffer if (cm->cur_frame != NULL) { --cm->cur_frame->ref_count; @@ -1150,7 +1150,7 @@ static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) { // Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref // counts accordingly. -static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr, +static inline void assign_frame_buffer_p(RefCntBuffer **lhs_ptr, RefCntBuffer *rhs_ptr) { RefCntBuffer *const old_ptr = *lhs_ptr; if (old_ptr != NULL) { @@ -1164,26 +1164,26 @@ static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr, ++rhs_ptr->ref_count; } -static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { +static inline int frame_is_intra_only(const AV1_COMMON *const cm) { return cm->current_frame.frame_type == KEY_FRAME || cm->current_frame.frame_type == INTRA_ONLY_FRAME; } -static INLINE int frame_is_sframe(const AV1_COMMON *cm) { +static inline int frame_is_sframe(const AV1_COMMON *cm) { return cm->current_frame.frame_type == S_FRAME; } // These functions take a reference frame label between LAST_FRAME and // EXTREF_FRAME inclusive. Note that this is different to the indexing // previously used by the frame_refs[] array. -static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm, +static inline int get_ref_frame_map_idx(const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME) ? cm->remapped_ref_idx[ref_frame - LAST_FRAME] : INVALID_IDX; } -static INLINE RefCntBuffer *get_ref_frame_buf( +static inline RefCntBuffer *get_ref_frame_buf( const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { const int map_idx = get_ref_frame_map_idx(cm, ref_frame); return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; @@ -1191,19 +1191,19 @@ static INLINE RefCntBuffer *get_ref_frame_buf( // Both const and non-const versions of this function are provided so that it // can be used with a const AV1_COMMON if needed. -static INLINE const struct scale_factors *get_ref_scale_factors_const( +static inline const struct scale_factors *get_ref_scale_factors_const( const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { const int map_idx = get_ref_frame_map_idx(cm, ref_frame); return (map_idx != INVALID_IDX) ? 
&cm->ref_scale_factors[map_idx] : NULL; } -static INLINE struct scale_factors *get_ref_scale_factors( +static inline struct scale_factors *get_ref_scale_factors( AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { const int map_idx = get_ref_frame_map_idx(cm, ref_frame); return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; } -static INLINE RefCntBuffer *get_primary_ref_frame_buf( +static inline RefCntBuffer *get_primary_ref_frame_buf( const AV1_COMMON *const cm) { const int primary_ref_frame = cm->features.primary_ref_frame; if (primary_ref_frame == PRIMARY_REF_NONE) return NULL; @@ -1212,7 +1212,7 @@ static INLINE RefCntBuffer *get_primary_ref_frame_buf( } // Returns 1 if this frame might allow mvs from some reference frame. -static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { +static inline int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { return !cm->features.error_resilient_mode && cm->seq_params->order_hint_info.enable_ref_frame_mvs && cm->seq_params->order_hint_info.enable_order_hint && @@ -1220,12 +1220,12 @@ static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { } // Returns 1 if this frame might use warped_motion -static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { +static inline int frame_might_allow_warped_motion(const AV1_COMMON *cm) { return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && cm->seq_params->enable_warped_motion; } -static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { +static inline void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { const int buf_rows = buf->mi_rows; const int buf_cols = buf->mi_cols; const CommonModeInfoParams *const mi_params = &cm->mi_params; @@ -1259,11 +1259,11 @@ static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); -static INLINE int av1_num_planes(const AV1_COMMON *cm) { +static inline int av1_num_planes(const AV1_COMMON *cm) { return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE; } -static INLINE void av1_init_above_context(CommonContexts *above_contexts, +static inline void av1_init_above_context(CommonContexts *above_contexts, int num_planes, int tile_row, MACROBLOCKD *xd) { for (int i = 0; i < num_planes; ++i) { @@ -1273,7 +1273,7 @@ static INLINE void av1_init_above_context(CommonContexts *above_contexts, xd->above_txfm_context = above_contexts->txfm[tile_row]; } -static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) { +static inline void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) { const int num_planes = av1_num_planes(cm); const CommonQuantParams *const quant_params = &cm->quant_params; @@ -1303,7 +1303,7 @@ static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) { cfl_init(&xd->cfl, cm->seq_params); } -static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, +static inline void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, const int num_planes) { int i; int row_offset = mi_row; @@ -1325,12 +1325,12 @@ static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, } } -static INLINE int calc_mi_size(int len) { +static inline int calc_mi_size(int len) { // len is in mi units. Align to a multiple of SBs. 
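// For example, assuming MAX_MIB_SIZE_LOG2 == 5 (a 128x128 superblock spans
// 32 4x4 MI units), calc_mi_size(35) is padded up to 64.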
return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); } -static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, +static inline void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, const int num_planes) { int i; for (i = 0; i < num_planes; i++) { @@ -1342,7 +1342,7 @@ static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, } } -static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, +static inline void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, int mi_row, int bh, int mi_col, int bw, int mi_rows, int mi_cols) { xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); @@ -1417,7 +1417,7 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, if (!(mi_row & (xd->width - 1))) xd->is_first_horizontal_rect = 1; } -static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, +static inline aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, const MB_MODE_INFO *above_mi, const MB_MODE_INFO *left_mi) { const PREDICTION_MODE above = av1_above_block_mode(above_mi); @@ -1427,7 +1427,7 @@ static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, return tile_ctx->kf_y_cdf[above_ctx][left_ctx]; } -static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, +static inline void update_partition_context(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE subsize, BLOCK_SIZE bsize) { PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col; @@ -1440,7 +1440,7 @@ static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, memset(left_ctx, partition_context_lookup[subsize].left, bh); } -static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, +static inline int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { assert(bsize < BLOCK_SIZES_ALL); const int bw = mi_size_wide[bsize]; @@ -1450,13 +1450,13 @@ static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, return ref_pos; } -static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, +static inline aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, size_t element) { assert(cdf != NULL); return (element > 0 ? 
cdf[element - 1] : CDF_PROB_TOP) - cdf[element]; } -static INLINE void partition_gather_horz_alike(aom_cdf_prob *out, +static inline void partition_gather_horz_alike(aom_cdf_prob *out, const aom_cdf_prob *const in, BLOCK_SIZE bsize) { (void)bsize; @@ -1471,7 +1471,7 @@ static INLINE void partition_gather_horz_alike(aom_cdf_prob *out, out[1] = AOM_ICDF(CDF_PROB_TOP); } -static INLINE void partition_gather_vert_alike(aom_cdf_prob *out, +static inline void partition_gather_vert_alike(aom_cdf_prob *out, const aom_cdf_prob *const in, BLOCK_SIZE bsize) { (void)bsize; @@ -1486,7 +1486,7 @@ static INLINE void partition_gather_vert_alike(aom_cdf_prob *out, out[1] = AOM_ICDF(CDF_PROB_TOP); } -static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, +static inline void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE subsize, BLOCK_SIZE bsize, PARTITION_TYPE partition) { @@ -1525,7 +1525,7 @@ static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, } } -static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, +static inline int partition_plane_context(const MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize) { const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col; const PARTITION_CONTEXT *left_ctx = @@ -1542,7 +1542,7 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, // Return the number of elements in the partition CDF when // partitioning the (square) block with luma block size of bsize. -static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { +static inline int partition_cdf_length(BLOCK_SIZE bsize) { if (bsize <= BLOCK_8X8) return PARTITION_TYPES; else if (bsize == BLOCK_128X128) @@ -1551,7 +1551,7 @@ static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { return EXT_PARTITION_TYPES; } -static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, +static inline int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane) { assert(bsize < BLOCK_SIZES_ALL); int max_blocks_wide = block_size_wide[bsize]; @@ -1565,7 +1565,7 @@ static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, return max_blocks_wide >> MI_SIZE_LOG2; } -static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, +static inline int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane) { int max_blocks_high = block_size_high[bsize]; @@ -1578,7 +1578,7 @@ static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, return max_blocks_high >> MI_SIZE_LOG2; } -static INLINE void av1_zero_above_context(AV1_COMMON *const cm, +static inline void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd, int mi_col_start, int mi_col_end, const int tile_row) { @@ -1614,7 +1614,7 @@ static INLINE void av1_zero_above_context(AV1_COMMON *const cm, tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT)); } -static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { +static inline void av1_zero_left_context(MACROBLOCKD *const xd) { av1_zero(xd->left_entropy_context); av1_zero(xd->left_partition_context); @@ -1622,12 +1622,12 @@ static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { sizeof(xd->left_txfm_context_buffer)); } -static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { +static inline void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { int i; for (i = 0; i < len; ++i) txfm_ctx[i] = txs; } -static INLINE void set_txfm_ctxs(TX_SIZE 
tx_size, int n4_w, int n4_h, int skip, +static inline void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, const MACROBLOCKD *xd) { uint8_t bw = tx_size_wide[tx_size]; uint8_t bh = tx_size_high[tx_size]; @@ -1641,12 +1641,12 @@ static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, set_txfm_ctx(xd->left_txfm_context, bh, n4_h); } -static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params, +static inline int get_mi_grid_idx(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col) { return mi_row * mi_params->mi_stride + mi_col; } -static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params, +static inline int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col) { const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; const int mi_alloc_row = mi_row / mi_alloc_size_1d; @@ -1656,7 +1656,7 @@ static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params, } // For this partition block, set pointers in mi_params->mi_grid_base and xd->mi. -static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params, +static inline void set_mi_offsets(const CommonModeInfoParams *const mi_params, MACROBLOCKD *const xd, int mi_row, int mi_col) { // 'mi_grid_base' should point to appropriate memory in 'mi'. @@ -1670,7 +1670,7 @@ static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params, xd->tx_type_map_stride = mi_params->mi_stride; } -static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, +static inline void txfm_partition_update(TXFM_CONTEXT *above_ctx, TXFM_CONTEXT *left_ctx, TX_SIZE tx_size, TX_SIZE txb_size) { BLOCK_SIZE bsize = txsize_to_bsize[txb_size]; @@ -1683,7 +1683,7 @@ static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, for (i = 0; i < bw; ++i) above_ctx[i] = txw; } -static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { +static inline TX_SIZE get_sqr_tx_size(int tx_dim) { switch (tx_dim) { case 128: case 64: return TX_64X64; break; @@ -1694,7 +1694,7 @@ static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { } } -static INLINE TX_SIZE get_tx_size(int width, int height) { +static inline TX_SIZE get_tx_size(int width, int height) { if (width == height) { return get_sqr_tx_size(width); } @@ -1733,7 +1733,7 @@ static INLINE TX_SIZE get_tx_size(int width, int height) { return TX_4X4; } -static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx, +static inline int txfm_partition_context(const TXFM_CONTEXT *const above_ctx, const TXFM_CONTEXT *const left_ctx, BLOCK_SIZE bsize, TX_SIZE tx_size) { const uint8_t txw = tx_size_wide[tx_size]; @@ -1759,7 +1759,7 @@ static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx, // Compute the next partition in the direction of the sb_type stored in the mi // array, starting with bsize. 
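// For example, when the mi unit at (mi_row, mi_col) still holds sb_type ==
// bsize, the block was never split and the result is PARTITION_NONE.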
-static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, +static inline PARTITION_TYPE get_partition(const AV1_COMMON *const cm, int mi_row, int mi_col, BLOCK_SIZE bsize) { const CommonModeInfoParams *const mi_params = &cm->mi_params; @@ -1836,7 +1836,7 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, return base_partitions[split_idx]; } -static INLINE void set_sb_size(SequenceHeader *const seq_params, +static inline void set_sb_size(SequenceHeader *const seq_params, BLOCK_SIZE sb_size) { seq_params->sb_size = sb_size; seq_params->mib_size = mi_size_wide[seq_params->sb_size]; @@ -1846,7 +1846,7 @@ static INLINE void set_sb_size(SequenceHeader *const seq_params, // Returns true if the frame is fully lossless at the coded resolution. // Note: If super-resolution is used, such a frame will still NOT be lossless at // the upscaled resolution. -static INLINE int is_coded_lossless(const AV1_COMMON *cm, +static inline int is_coded_lossless(const AV1_COMMON *cm, const MACROBLOCKD *xd) { int coded_lossless = 1; if (cm->seg.enabled) { @@ -1862,7 +1862,7 @@ static INLINE int is_coded_lossless(const AV1_COMMON *cm, return coded_lossless; } -static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) { +static inline int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) { return seq_level_idx == SEQ_LEVEL_MAX || (seq_level_idx < SEQ_LEVELS && // The following levels are currently undefined. diff --git a/av1/common/av1_inv_txfm1d.h b/av1/common/av1_inv_txfm1d.h index e1044d31d1..fd1d32114c 100644 --- a/av1/common/av1_inv_txfm1d.h +++ b/av1/common/av1_inv_txfm1d.h @@ -18,14 +18,14 @@ extern "C" { #endif -static INLINE int32_t clamp_value(int32_t value, int8_t bit) { +static inline int32_t clamp_value(int32_t value, int8_t bit) { if (bit <= 0) return value; // Do nothing for invalid clamp bit. 
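// For example, clamp_value(300, 8) returns 127, since bit == 8 gives the
// range [-128, 127] computed below.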
const int64_t max_value = (1LL << (bit - 1)) - 1; const int64_t min_value = -(1LL << (bit - 1)); return (int32_t)clamp64(value, min_value, max_value); } -static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { +static inline void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit); } diff --git a/av1/common/av1_inv_txfm2d.c b/av1/common/av1_inv_txfm2d.c index ef9521b2db..fe0afe40fc 100644 --- a/av1/common/av1_inv_txfm2d.c +++ b/av1/common/av1_inv_txfm2d.c @@ -111,7 +111,7 @@ void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, } } -static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { +static inline TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT4: return av1_idct4; case TXFM_TYPE_DCT8: return av1_idct8; @@ -231,7 +231,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, } } -static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, +static inline void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, int stride, TXFM_2D_FLIP_CFG *cfg, int32_t *txfm_buf, TX_SIZE tx_size, int bd) { @@ -315,7 +315,7 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, } } -static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output, +static inline void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output, int stride, int32_t *txfm_buf, TX_TYPE tx_type, TX_SIZE tx_size, int bd) { diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h index f406109afd..c3c3123497 100644 --- a/av1/common/av1_txfm.h +++ b/av1/common/av1_txfm.h @@ -44,11 +44,11 @@ static const int32_t NewSqrt2 = 5793; // 2^12 / sqrt(2) static const int32_t NewInvSqrt2 = 2896; -static INLINE const int32_t *cospi_arr(int n) { +static inline const int32_t *cospi_arr(int n) { return av1_cospi_arr_data[n - cos_bit_min]; } -static INLINE const int32_t *sinpi_arr(int n) { +static inline const int32_t *sinpi_arr(int n) { return av1_sinpi_arr_data[n - cos_bit_min]; } @@ -62,20 +62,20 @@ extern const int16_t av1_sinpi_arr_q13_data[4][4]; extern const int32_t av1_cospi_arr_s32_data[4][66]; -static INLINE const int16_t *cospi_arr_q13(int n) { +static inline const int16_t *cospi_arr_q13(int n) { return av1_cospi_arr_q13_data[n - cos_bit_min]; } -static INLINE const int16_t *sinpi_arr_q13(int n) { +static inline const int16_t *sinpi_arr_q13(int n) { return av1_sinpi_arr_q13_data[n - cos_bit_min]; } -static INLINE const int32_t *cospi_arr_s32(int n) { +static inline const int32_t *cospi_arr_s32(int n) { return av1_cospi_arr_s32_data[n - cos_bit_min]; } #endif // HAVE_NEON -static INLINE int32_t range_check_value(int32_t value, int8_t bit) { +static inline int32_t range_check_value(int32_t value, int8_t bit) { #if CONFIG_COEFFICIENT_RANGE_CHECKING const int64_t max_value = (1LL << (bit - 1)) - 1; const int64_t min_value = -(1LL << (bit - 1)); @@ -94,12 +94,12 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) { return value; } -static INLINE int32_t round_shift(int64_t value, int bit) { +static inline int32_t round_shift(int64_t value, int bit) { assert(bit >= 1); return (int32_t)((value + (1ll << (bit - 1))) >> bit); } -static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, +static inline int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, int bit) { int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1); int64_t intermediate = 
result_64 + (1LL << (bit - 1)); @@ -123,7 +123,7 @@ static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, return (int32_t)(intermediate >> bit); } -static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, +static inline uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, int bd) { return clip_pixel_highbd(dest + (int)trans, bd); } @@ -166,7 +166,7 @@ typedef struct TXFM_2D_FLIP_CFG { int stage_num_row; } TXFM_2D_FLIP_CFG; -static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { +static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { switch (tx_type) { case DCT_DCT: case ADST_DCT: @@ -206,13 +206,13 @@ static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { } } -static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { +static inline void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip); } // Utility function that returns the log of the ratio of the col and row // sizes. -static INLINE int get_rect_tx_log_ratio(int col, int row) { +static inline int get_rect_tx_log_ratio(int col, int row) { if (col == row) return 0; if (col > row) { if (col == row * 2) return 1; @@ -239,10 +239,10 @@ void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, TXFM_2D_FLIP_CFG *cfg); extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D]; extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES]; -static INLINE int get_txw_idx(TX_SIZE tx_size) { +static inline int get_txw_idx(TX_SIZE tx_size) { return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]; } -static INLINE int get_txh_idx(TX_SIZE tx_size) { +static inline int get_txh_idx(TX_SIZE tx_size) { return tx_size_high_log2[tx_size] - tx_size_high_log2[0]; } diff --git a/av1/common/blockd.h b/av1/common/blockd.h index 3b7f21e44f..b95235fb7e 100644 --- a/av1/common/blockd.h +++ b/av1/common/blockd.h @@ -62,11 +62,11 @@ enum { FRAME_TYPES, } UENUM1BYTE(FRAME_TYPE); -static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { +static inline int is_comp_ref_allowed(BLOCK_SIZE bsize) { return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; } -static INLINE int is_inter_mode(PREDICTION_MODE mode) { +static inline int is_inter_mode(PREDICTION_MODE mode) { return mode >= INTER_MODE_START && mode < INTER_MODE_END; } @@ -75,14 +75,14 @@ typedef struct { int stride[MAX_MB_PLANE]; } BUFFER_SET; -static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) { +static inline int is_inter_singleref_mode(PREDICTION_MODE mode) { return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END; } -static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { +static inline int is_inter_compound_mode(PREDICTION_MODE mode) { return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END; } -static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { +static inline PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { static const PREDICTION_MODE lut[] = { DC_PRED, // DC_PRED V_PRED, // V_PRED @@ -115,7 +115,7 @@ static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { return lut[mode]; } -static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { +static inline PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { static const PREDICTION_MODE lut[] = { MB_MODE_COUNT, // DC_PRED MB_MODE_COUNT, // V_PRED @@ -148,17 +148,17 @@ static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { 
return lut[mode]; } -static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) { +static inline int have_nearmv_in_inter_mode(PREDICTION_MODE mode) { return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); } -static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) { +static inline int have_newmv_in_inter_mode(PREDICTION_MODE mode) { return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV || mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); } -static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { +static inline int is_masked_compound_type(COMPOUND_TYPE type) { return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD); } @@ -342,11 +342,11 @@ typedef struct MB_MODE_INFO { /*!\cond */ -static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) { +static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) { return mbmi->use_intrabc; } -static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) { +static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) { assert(mode < UV_INTRA_MODES); static const PREDICTION_MODE uv2y[] = { DC_PRED, // UV_DC_PRED @@ -369,20 +369,20 @@ static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) { return uv2y[mode]; } -static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { +static inline int is_inter_block(const MB_MODE_INFO *mbmi) { return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME; } -static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { +static inline int has_second_ref(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[1] > INTRA_FRAME; } -static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) { +static inline int has_uni_comp_refs(const MB_MODE_INFO *mbmi) { return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^ (mbmi->ref_frame[1] >= BWDREF_FRAME))); } -static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) { +static inline MV_REFERENCE_FRAME comp_ref0(int ref_idx) { static const MV_REFERENCE_FRAME lut[] = { LAST_FRAME, // LAST_LAST2_FRAMES, LAST_FRAME, // LAST_LAST3_FRAMES, @@ -398,7 +398,7 @@ static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) { return lut[ref_idx]; } -static INLINE MV_REFERENCE_FRAME comp_ref1(int ref_idx) { +static inline MV_REFERENCE_FRAME comp_ref1(int ref_idx) { static const MV_REFERENCE_FRAME lut[] = { LAST2_FRAME, // LAST_LAST2_FRAMES, LAST3_FRAME, // LAST_LAST3_FRAMES, @@ -418,7 +418,7 @@ PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi); PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi); -static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi, +static inline int is_global_mv_block(const MB_MODE_INFO *const mbmi, TransformationType type) { const PREDICTION_MODE mode = mbmi->mode; const BLOCK_SIZE bsize = mbmi->bsize; @@ -429,7 +429,7 @@ static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi, } #if CONFIG_MISMATCH_DEBUG -static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, +static inline void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, int mi_row, int tx_blk_col, int tx_blk_row, int subsampling_x, int subsampling_y) { *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) + @@ -929,7 +929,7 @@ typedef struct macroblockd { /*!\cond */ -static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) { +static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) { #if CONFIG_AV1_HIGHBITDEPTH return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 
1 : 0; #else @@ -938,7 +938,7 @@ static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) { #endif } -static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) { +static inline uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) { #if CONFIG_AV1_HIGHBITDEPTH return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? CONVERT_TO_BYTEPTR(buf16) @@ -959,7 +959,7 @@ typedef struct BitDepthInfo { int use_highbitdepth_buf; } BitDepthInfo; -static INLINE BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) { +static inline BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) { BitDepthInfo bit_depth_info; bit_depth_info.bit_depth = xd->bd; bit_depth_info.use_highbitdepth_buf = is_cur_buf_hbd(xd); @@ -968,7 +968,7 @@ static INLINE BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) { return bit_depth_info; } -static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { +static inline int get_sqr_bsize_idx(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_4X4: return 0; case BLOCK_8X8: return 1; @@ -987,7 +987,7 @@ static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { // Conversion tables). // Note: the input block size should be square. // Otherwise it's considered invalid. -static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, +static inline BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) { if (partition == PARTITION_INVALID) { return BLOCK_INVALID; @@ -1022,9 +1022,9 @@ static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi, return _intra_mode_to_tx_type[mode]; } -static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; } +static inline int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; } -static INLINE int block_signals_txsize(BLOCK_SIZE bsize) { +static inline int block_signals_txsize(BLOCK_SIZE bsize) { return bsize > BLOCK_4X4; } @@ -1094,7 +1094,7 @@ static const TxSetType av1_ext_tx_set_lookup[2][2] = { { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT }, }; -static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, +static inline TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY; @@ -1114,14 +1114,14 @@ static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = { 0, 3, -1, -1, 2, 1 }, }; -static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter, +static inline int get_ext_tx_set(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const TxSetType set_type = av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); return ext_tx_set_index[is_inter][set_type]; } -static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter, +static inline int get_ext_tx_types(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const int set_type = av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); @@ -1131,7 +1131,7 @@ static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter, #define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2)) #define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? 
(t1) : (t2)) -static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { +static inline TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize]; if (bsize == BLOCK_4X4) @@ -1148,7 +1148,7 @@ static const uint8_t mode_to_angle_map[INTRA_MODES] = { // Converts block_index for given transform size to index of the block in raster // order. -static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size, +static inline int av1_block_index_to_raster_order(TX_SIZE tx_size, int block_idx) { // For transform size 4x8, the possible block_idx values are 0 & 2, because // block_idx values are incremented in steps of size 'tx_width_unit x @@ -1160,14 +1160,14 @@ static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size, // Inverse of above function. // Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now. -static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size, +static inline int av1_raster_order_to_block_index(TX_SIZE tx_size, int raster_order) { assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4); // We ensure that block indices are 0 & 2 if tx size is 4x8 or 8x4. return (tx_size == TX_4X4) ? raster_order : (raster_order > 0) ? 2 : 0; } -static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, +static inline TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd, TX_SIZE tx_size, int use_screen_content_tools) { @@ -1183,7 +1183,7 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, // Implements the get_plane_residual_size() function in the spec (Section // 5.11.38. Get plane residual size function). -static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, +static inline BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { assert(bsize < BLOCK_SIZES_ALL); @@ -1203,7 +1203,7 @@ static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, * const int bw_uint_log2 = mi_size_wide_log2[bsize]; * const int stride_log2 = bw_uint_log2 - tx_w_log2; */ -static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, +static inline int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, int blk_col) { static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3, @@ -1233,7 +1233,7 @@ static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, * const int bw_uint_log2 = mi_size_wide_log2[bsize]; * const int stride_log2 = bw_uint_log2 - tx_w_log2; */ -static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, +static inline int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, int blk_col) { static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, @@ -1252,7 +1252,7 @@ static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, } #endif // CONFIG_INSPECTION -static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row, +static inline void update_txk_array(MACROBLOCKD *const xd, int blk_row, int blk_col, TX_SIZE tx_size, TX_TYPE tx_type) { const int stride = xd->tx_type_map_stride; @@ -1275,7 +1275,7 @@ static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row, } } -static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd, +static inline TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd, PLANE_TYPE plane_type, int 
blk_row, int blk_col, TX_SIZE tx_size, int reduced_tx_set) { @@ -1322,7 +1322,7 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, * tx_size = sub_tx_size_map[tx_size]; * } */ -static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) { +static inline int bsize_to_max_depth(BLOCK_SIZE bsize) { static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = { 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, }; @@ -1341,7 +1341,7 @@ static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) { * } * assert(depth < 10); */ -static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { +static inline int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = { 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4, @@ -1351,14 +1351,14 @@ static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { return depth - 1; } -static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) { +static inline TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) { TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; TX_SIZE tx_size = max_tx_size; for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size]; return tx_size; } -static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) { +static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) { switch (tx_size) { case TX_64X64: case TX_64X32: @@ -1369,7 +1369,7 @@ static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) { } } -static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x, +static inline TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, subsampling_x, subsampling_y); @@ -1378,7 +1378,7 @@ static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x, return av1_get_adjusted_tx_size(uv_tx); } -static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) { +static inline TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; if (xd->lossless[mbmi->segment_id]) return TX_4X4; if (plane == 0) return mbmi->tx_size; @@ -1405,30 +1405,30 @@ void av1_set_entropy_contexts(const MACROBLOCKD *xd, int has_eob, int aoff, int loff); #define MAX_INTERINTRA_SB_SQUARE 32 * 32 -static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) { +static inline int is_interintra_mode(const MB_MODE_INFO *mbmi) { return (mbmi->ref_frame[0] > INTRA_FRAME && mbmi->ref_frame[1] == INTRA_FRAME); } -static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) { +static inline int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) { return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32); } -static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) { +static inline int is_interintra_allowed_mode(const PREDICTION_MODE mode) { return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END); } -static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) { +static inline int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) { return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME); } -static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) { +static inline int is_interintra_allowed(const MB_MODE_INFO *mbmi) { return is_interintra_allowed_bsize(mbmi->bsize) && is_interintra_allowed_mode(mbmi->mode) && is_interintra_allowed_ref(mbmi->ref_frame); } -static INLINE int 
is_interintra_allowed_bsize_group(int group) { +static inline int is_interintra_allowed_bsize_group(int group) { int i; for (i = 0; i < BLOCK_SIZES_ALL; i++) { if (size_group_lookup[i] == group && @@ -1439,12 +1439,12 @@ static INLINE int is_interintra_allowed_bsize_group(int group) { return 0; } -static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) { +static inline int is_interintra_pred(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[0] > INTRA_FRAME && mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi); } -static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize, +static inline int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane) { if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4; const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize]; @@ -1452,12 +1452,12 @@ static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize, return av1_get_adjusted_tx_size(max_txsize); // chroma } -static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) { +static inline int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; } -static INLINE int is_motion_variation_allowed_compound( +static inline int is_motion_variation_allowed_compound( const MB_MODE_INFO *mbmi) { return !has_second_ref(mbmi); } @@ -1465,13 +1465,13 @@ static INLINE int is_motion_variation_allowed_compound( // input: log2 of length, 0(4), 1(8), ... static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 }; -static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) { +static inline int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) { return mbmi->overlappable_neighbors != 0; } -static INLINE MOTION_MODE -motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, - const MB_MODE_INFO *mbmi, int allow_warped_motion) { +static inline MOTION_MODE motion_mode_allowed( + const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, int allow_warped_motion) { if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; if (xd->cur_frame_force_integer_mv == 0) { const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype; @@ -1491,11 +1491,11 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, return SIMPLE_TRANSLATION; } -static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { +static inline int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { return (is_inter_block(mbmi)); } -static INLINE int av1_allow_palette(int allow_screen_content_tools, +static inline int av1_allow_palette(int allow_screen_content_tools, BLOCK_SIZE sb_type) { assert(sb_type < BLOCK_SIZES_ALL); return allow_screen_content_tools && @@ -1509,7 +1509,7 @@ static INLINE int av1_allow_palette(int allow_screen_content_tools, // differ from 'height' and 'width' when part of the block is outside the // right // and/or bottom image boundary. 
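// For example, a 16x16 luma block whose last 4 pixel columns fall beyond the
// right frame edge reports a width of 16 but only 12 columns within bounds.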
-static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane, +static inline void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane, const MACROBLOCKD *xd, int *width, int *height, int *rows_within_bounds, @@ -1572,7 +1572,7 @@ typedef struct { ColorCost color_cost; } Av1ColorMapParam; -static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd, +static inline int is_nontrans_global_motion(const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi) { int ref; @@ -1589,11 +1589,11 @@ static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd, return 1; } -static INLINE PLANE_TYPE get_plane_type(int plane) { +static inline PLANE_TYPE get_plane_type(int plane) { return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV; } -static INLINE int av1_get_max_eob(TX_SIZE tx_size) { +static inline int av1_get_max_eob(TX_SIZE tx_size) { if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) { return 1024; } diff --git a/av1/common/cdef.c b/av1/common/cdef.c index 1f50fc91e0..02de17a46a 100644 --- a/av1/common/cdef.c +++ b/av1/common/cdef.c @@ -121,7 +121,7 @@ void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, } } -static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, +static inline void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h) { for (int i = 0; i < v; i++) { for (int j = 0; j < h; j++) { @@ -249,7 +249,7 @@ static void cdef_prepare_fb(const AV1_COMMON *const cm, CdefBlockInfo *fb_info, } } -static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, +static inline void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, uint8_t use_highbitdepth) { ptrdiff_t offset = (ptrdiff_t)fb_info->dst_stride * fb_info->roffset + fb_info->coffset; @@ -271,7 +271,7 @@ static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, } // Initializes block-level parameters for CDEF. -static INLINE void cdef_init_fb_col(const MACROBLOCKD *const xd, +static inline void cdef_init_fb_col(const MACROBLOCKD *const xd, CdefBlockInfo *const fb_info, int *level, int *sec_strength, int fbc, int fbr, int plane) { diff --git a/av1/common/cdef.h b/av1/common/cdef.h index b84f861b9d..c7a8a331cf 100644 --- a/av1/common/cdef.h +++ b/av1/common/cdef.h @@ -56,9 +56,9 @@ typedef struct { int roffset; /*!< current row offset */ } CdefBlockInfo; -static INLINE int sign(int i) { return i < 0 ? -1 : 1; } +static inline int sign(int i) { return i < 0 ? -1 : 1; } -static INLINE int constrain(int diff, int threshold, int damping) { +static inline int constrain(int diff, int threshold, int damping) { if (!threshold) return 0; const int shift = AOMMAX(0, damping - get_msb(threshold)); diff --git a/av1/common/cdef_block.c b/av1/common/cdef_block.c index 063d8d3941..ad269c7850 100644 --- a/av1/common/cdef_block.c +++ b/av1/common/cdef_block.c @@ -286,7 +286,7 @@ void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, edge), so we can apply more deringing. A low variance means that we either have a low contrast edge, or a non-directional texture, so we want to be careful not to blur. */ -static INLINE int adjust_strength(int strength, int32_t var) { +static inline int adjust_strength(int strength, int32_t var) { const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0; /* We use the variance of 8x8 blocks to adjust the strength. */ return var ? 
(strength * (4 + i) + 8) >> 4 : 0; diff --git a/av1/common/cdef_block.h b/av1/common/cdef_block.h index 0ecff38608..6016a67f2b 100644 --- a/av1/common/cdef_block.h +++ b/av1/common/cdef_block.h @@ -54,7 +54,7 @@ void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, cdef_list *dlist, int cdef_count, int level, int sec_strength, int damping, int coeff_shift); -static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h, +static inline void fill_rect(uint16_t *dst, int dstride, int v, int h, uint16_t x) { for (int i = 0; i < v; i++) { for (int j = 0; j < h; j++) { diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h index 5b04909a35..56c5baa610 100644 --- a/av1/common/cdef_block_simd.h +++ b/av1/common/cdef_block_simd.h @@ -22,7 +22,7 @@ This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 and const2. */ -static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1, +static inline v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1, v128 const2) { v128 tmp; /* Reverse partial B. */ @@ -43,7 +43,7 @@ static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1, return partiala; } -static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { +static inline v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { v128 t0, t1, t2, t3; t0 = v128_ziplo_32(x1, x0); t1 = v128_ziplo_32(x3, x2); @@ -58,7 +58,7 @@ static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { /* Computes cost for directions 0, 5, 6 and 7. We can call this function again to compute the remaining directions. */ -static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) { +static inline v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) { v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; v128 partial6; v128 tmp; @@ -129,7 +129,7 @@ static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) { /* transpose and reverse the order of the lines -- equivalent to a 90-degree counter-clockwise rotation of the pixels. */ -static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) { +static inline void array_reverse_transpose_8x8(v128 *in, v128 *res) { const v128 tr0_0 = v128_ziplo_16(in[1], in[0]); const v128 tr0_1 = v128_ziplo_16(in[3], in[2]); const v128 tr0_2 = v128_ziphi_16(in[1], in[0]); @@ -200,7 +200,7 @@ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, // Work around compiler out of memory issues with Win32 builds. This issue has // been observed with Visual Studio 2017, 2019, and 2022 (version 17.10.3). #if defined(_MSC_VER) && defined(_M_IX86) -#define CDEF_INLINE static INLINE +#define CDEF_INLINE static inline #else #define CDEF_INLINE SIMD_INLINE #endif diff --git a/av1/common/cfl.c b/av1/common/cfl.c index 652cb4d0f5..7d4f07f863 100644 --- a/av1/common/cfl.c +++ b/av1/common/cfl.c @@ -80,7 +80,7 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, // Due to frame boundary issues, it is possible that the total area covered by // chroma exceeds that of luma. When this happens, we fill the missing pixels by // repeating the last columns and/or rows. 
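// For example, if the stored luma area is two rows shorter than the chroma
// block, the bottom-most valid row is repeated twice to fill the gap.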
-static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) { +static inline void cfl_pad(CFL_CTX *cfl, int width, int height) { const int diff_width = width - cfl->buf_width; const int diff_height = height - cfl->buf_height; @@ -134,7 +134,7 @@ static void subtract_average_c(const uint16_t *src, int16_t *dst, int width, CFL_SUB_AVG_FN(c) -static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, +static inline int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, CFL_PRED_TYPE pred_type) { const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign) : CFL_SIGN_V(joint_sign); @@ -144,7 +144,7 @@ static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1; } -static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, +static inline void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3, int width, int height) { for (int j = 0; j < height; j++) { @@ -159,7 +159,7 @@ static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, CFL_PREDICT_FN(c, lbd) #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, +static inline void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bit_depth, int width, int height) { for (int j = 0; j < height; j++) { @@ -299,7 +299,7 @@ static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input, CFL_GET_SUBSAMPLE_FUNCTION(c) #if CONFIG_AV1_HIGHBITDEPTH -static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, +static inline cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, int sub_x, int sub_y) { if (sub_x == 1) { if (sub_y == 1) { @@ -311,7 +311,7 @@ static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, } #endif -static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size, +static inline cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size, int sub_x, int sub_y) { if (sub_x == 1) { if (sub_y == 1) { @@ -371,7 +371,7 @@ static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, // Adjust the row and column of blocks smaller than 8X8, as chroma-referenced // and non-chroma-referenced blocks are stored together in the CfL buffer. -static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row, +static inline void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row, int mi_col, int *row_out, int *col_out) { // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s. 
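// For example, with 4:2:0 subsampling the bottom 4x4s of an 8x8 area have an
// odd mi_row, so their row index is bumped by one here.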
@@ -402,7 +402,7 @@ void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd)); } -static INLINE int max_intra_block_width(const MACROBLOCKD *xd, +static inline int max_intra_block_width(const MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, int plane, TX_SIZE tx_size) { const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane) @@ -410,7 +410,7 @@ static INLINE int max_intra_block_width(const MACROBLOCKD *xd, return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]); } -static INLINE int max_intra_block_height(const MACROBLOCKD *xd, +static inline int max_intra_block_height(const MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, int plane, TX_SIZE tx_size) { const int max_blocks_high = max_block_high(xd, plane_bsize, plane) diff --git a/av1/common/cfl.h b/av1/common/cfl.h index 8f093b0d96..73c060b101 100644 --- a/av1/common/cfl.h +++ b/av1/common/cfl.h @@ -16,7 +16,7 @@ #include "av1/common/blockd.h" // Can we use CfL for the current block? -static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) { +static inline CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; assert(bsize < BLOCK_SIZES_ALL); @@ -35,7 +35,7 @@ static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) { // Do we need to save the luma pixels from the current block, // for a possible future CfL prediction? -static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, +static inline CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; @@ -56,17 +56,17 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, mbmi->uv_mode == UV_CFL_PRED); } -static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) { +static inline int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) { int scaled_luma_q6 = alpha_q3 * pred_buf_q3; return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6); } -static INLINE CFL_PRED_TYPE get_cfl_pred_type(int plane) { +static inline CFL_PRED_TYPE get_cfl_pred_type(int plane) { assert(plane > 0); return (CFL_PRED_TYPE)(plane - 1); } -static INLINE void clear_cfl_dc_pred_cache_flags(CFL_CTX *cfl) { +static inline void clear_cfl_dc_pred_cache_flags(CFL_CTX *cfl) { cfl->use_dc_pred_cache = false; cfl->dc_pred_is_cached[CFL_PRED_U] = false; cfl->dc_pred_is_cached[CFL_PRED_V] = false; diff --git a/av1/common/common.h b/av1/common/common.h index 770cd76920..8bcfed933f 100644 --- a/av1/common/common.h +++ b/av1/common/common.h @@ -43,7 +43,7 @@ extern "C" { #define av1_zero(dest) memset(&(dest), 0, sizeof(dest)) #define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest))) -static INLINE int get_unsigned_bits(unsigned int num_values) { +static inline int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? get_msb(num_values) + 1 : 0; } diff --git a/av1/common/convolve.c b/av1/common/convolve.c index 92c2e2a0df..46270cd1f9 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c @@ -1314,14 +1314,14 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, // --((128 - 1) * 32 + 15) >> 4 + 8 = 263. 
#define WIENER_MAX_EXT_SIZE 263 -static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { +static inline int horz_scalar_product(const uint8_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; return sum; } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE int highbd_horz_scalar_product(const uint16_t *a, +static inline int highbd_horz_scalar_product(const uint16_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; @@ -1329,7 +1329,7 @@ static INLINE int highbd_horz_scalar_product(const uint16_t *a, } #endif -static INLINE int highbd_vert_scalar_product(const uint16_t *a, +static inline int highbd_vert_scalar_product(const uint16_t *a, ptrdiff_t a_stride, const int16_t *b) { int sum = 0; diff --git a/av1/common/convolve.h b/av1/common/convolve.h index 3d679a718a..d2e4d15fc3 100644 --- a/av1/common/convolve.h +++ b/av1/common/convolve.h @@ -65,7 +65,7 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, const int subpel_y_qn, int y_step_q4, int scaled, ConvolveParams *conv_params); -static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane, +static inline ConvolveParams get_conv_params_no_round(int cmp_index, int plane, CONV_BUF_TYPE *dst, int dst_stride, int is_compound, int bd) { @@ -99,12 +99,12 @@ static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane, return conv_params; } -static INLINE ConvolveParams get_conv_params(int do_average, int plane, +static inline ConvolveParams get_conv_params(int do_average, int plane, int bd) { return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd); } -static INLINE WienerConvolveParams get_conv_params_wiener(int bd) { +static inline WienerConvolveParams get_conv_params_wiener(int bd) { WienerConvolveParams conv_params; conv_params.round_0 = WIENER_ROUND0_BITS; conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0; diff --git a/av1/common/entropy.h b/av1/common/entropy.h index f8332d4ce4..868c12b452 100644 --- a/av1/common/entropy.h +++ b/av1/common/entropy.h @@ -79,12 +79,12 @@ struct frame_contexts; typedef char ENTROPY_CONTEXT; -static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, +static inline int combine_entropy_contexts(ENTROPY_CONTEXT a, ENTROPY_CONTEXT b) { return (a != 0) + (b != 0); } -static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, +static inline int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; @@ -170,7 +170,7 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, return combine_entropy_contexts(above_ec, left_ec); } -static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { +static inline TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >> 1); } diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h index c688b21746..0783af0c9d 100644 --- a/av1/common/entropymode.h +++ b/av1/common/entropymode.h @@ -191,7 +191,7 @@ void av1_setup_frame_contexts(struct AV1Common *cm); void av1_setup_past_independence(struct AV1Common *cm); // Returns (int)ceil(log2(n)). 
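// For example, av1_ceil_log2(1) == 0, av1_ceil_log2(4) == 2 and
// av1_ceil_log2(5) == 3.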
-static INLINE int av1_ceil_log2(int n) { +static inline int av1_ceil_log2(int n) { if (n < 2) return 0; return get_msb(n - 1) + 1; } diff --git a/av1/common/entropymv.h b/av1/common/entropymv.h index bd4b7e9ece..0880133523 100644 --- a/av1/common/entropymv.h +++ b/av1/common/entropymv.h @@ -37,11 +37,11 @@ enum { MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ } UENUM1BYTE(MV_JOINT_TYPE); -static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { +static inline int mv_joint_vertical(MV_JOINT_TYPE type) { return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; } -static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { +static inline int mv_joint_horizontal(MV_JOINT_TYPE type) { return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; } diff --git a/av1/common/filter.h b/av1/common/filter.h index 9814828f4d..752b18a59b 100644 --- a/av1/common/filter.h +++ b/av1/common/filter.h @@ -76,21 +76,21 @@ typedef union int_interpfilters { InterpFilters as_filters; } int_interpfilters; -static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters, +static inline InterpFilter av1_extract_interp_filter(int_interpfilters filters, int dir) { return (InterpFilter)((dir) ? filters.as_filters.x_filter : filters.as_filters.y_filter); } -static INLINE int_interpfilters -av1_broadcast_interp_filter(InterpFilter filter) { +static inline int_interpfilters av1_broadcast_interp_filter( + InterpFilter filter) { int_interpfilters filters; filters.as_filters.x_filter = filter; filters.as_filters.y_filter = filter; return filters; } -static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) { +static inline InterpFilter av1_unswitchable_filter(InterpFilter filter) { return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter; } @@ -244,7 +244,7 @@ static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = { { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR }, }; -static INLINE const InterpFilterParams * +static inline const InterpFilterParams * av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, const int w) { if (w <= 4 && interp_filter != MULTITAP_SHARP2) @@ -252,7 +252,7 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, return &av1_interp_filter_params_list[interp_filter]; } -static INLINE const int16_t *av1_get_interp_filter_kernel( +static inline const int16_t *av1_get_interp_filter_kernel( const InterpFilter interp_filter, int subpel_search) { assert(subpel_search >= USE_2_TAPS); return (subpel_search == USE_2_TAPS) @@ -262,12 +262,12 @@ static INLINE const int16_t *av1_get_interp_filter_kernel( : av1_interp_filter_params_list[interp_filter].filter_ptr); } -static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( +static inline const int16_t *av1_get_interp_filter_subpel_kernel( const InterpFilterParams *const filter_params, const int subpel) { return filter_params->filter_ptr + filter_params->taps * subpel; } -static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) { +static inline const InterpFilterParams *av1_get_filter(int subpel_search) { assert(subpel_search >= USE_2_TAPS); switch (subpel_search) { @@ -278,18 +278,18 @@ static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) { } } -static INLINE void reset_interp_filter_allowed_mask( +static inline void reset_interp_filter_allowed_mask( uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) { uint16_t tmp = (~(1 << filt_type)) & 0xffff; *allow_interp_mask &= (tmp & 
ALLOW_ALL_INTERP_FILT_MASK); } -static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask, +static inline void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) { *allow_interp_mask |= (1 << filt_type); } -static INLINE uint8_t get_interp_filter_allowed_mask( +static inline uint8_t get_interp_filter_allowed_mask( uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) { return (allow_interp_mask >> filt_type) & 1; } diff --git a/av1/common/idct.h b/av1/common/idct.h index 799e38a383..c02b70ef15 100644 --- a/av1/common/idct.h +++ b/av1/common/idct.h @@ -39,7 +39,7 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd, void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd); -static INLINE const int32_t *cast_to_int32(const tran_low_t *input) { +static inline const int32_t *cast_to_int32(const tran_low_t *input) { assert(sizeof(int32_t) == sizeof(tran_low_t)); return (const int32_t *)input; } diff --git a/av1/common/mv.h b/av1/common/mv.h index d83d4fa60e..b731bc875d 100644 --- a/av1/common/mv.h +++ b/av1/common/mv.h @@ -180,23 +180,23 @@ static const WarpedMotionParams default_warp_params = { #define GM_TRANS_MIN -GM_TRANS_MAX #define GM_ALPHA_MIN -GM_ALPHA_MAX -static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) { +static inline int block_center_x(int mi_col, BLOCK_SIZE bs) { const int bw = block_size_wide[bs]; return mi_col * MI_SIZE + bw / 2 - 1; } -static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) { +static inline int block_center_y(int mi_row, BLOCK_SIZE bs) { const int bh = block_size_high[bs]; return mi_row * MI_SIZE + bh / 2 - 1; } -static INLINE int convert_to_trans_prec(int allow_hp, int coor) { +static inline int convert_to_trans_prec(int allow_hp, int coor) { if (allow_hp) return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3); else return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2; } -static INLINE void integer_mv_precision(MV *mv) { +static inline void integer_mv_precision(MV *mv) { int mod = (mv->row % 8); if (mod != 0) { mv->row -= mod; @@ -228,7 +228,7 @@ static INLINE void integer_mv_precision(MV *mv) { // allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and // is_integer is true, the bottom three bits will be zero (so the motion vector // represents an integer) -static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, +static inline int_mv gm_get_motion_vector(const WarpedMotionParams *gm, int allow_hp, BLOCK_SIZE bsize, int mi_col, int mi_row, int is_integer) { @@ -296,7 +296,7 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, return res; } -static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) { +static inline TransformationType get_wmtype(const WarpedMotionParams *gm) { if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { return ((!gm->wmmat[1] && !gm->wmmat[0]) ? 
IDENTITY : TRANSLATION); @@ -312,20 +312,20 @@ typedef struct candidate_mv { int_mv comp_mv; } CANDIDATE_MV; -static INLINE int is_zero_mv(const MV *mv) { +static inline int is_zero_mv(const MV *mv) { return *((const uint32_t *)mv) == 0; } -static INLINE int is_equal_mv(const MV *a, const MV *b) { +static inline int is_equal_mv(const MV *a, const MV *b) { return *((const uint32_t *)a) == *((const uint32_t *)b); } -static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) { +static inline void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) { mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); } -static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) { +static inline void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) { mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); } diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c index d78b60124a..b07a9b1a2b 100644 --- a/av1/common/mvref_common.c +++ b/av1/common/mvref_common.c @@ -1065,7 +1065,7 @@ void av1_setup_motion_field(AV1_COMMON *cm) { if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2); } -static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts, +static inline void record_samples(const MB_MODE_INFO *mbmi, int *pts, int *pts_inref, int row_offset, int sign_r, int col_offset, int sign_c) { const int bw = block_size_wide[mbmi->bsize]; diff --git a/av1/common/mvref_common.h b/av1/common/mvref_common.h index beaf55c8fd..c24b7aaecd 100644 --- a/av1/common/mvref_common.h +++ b/av1/common/mvref_common.h @@ -34,7 +34,7 @@ typedef struct position { // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) { +static inline int get_relative_dist(const OrderHintInfo *oh, int a, int b) { if (!oh->enable_order_hint) return 0; const int bits = oh->order_hint_bits_minus_1 + 1; @@ -49,7 +49,7 @@ static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) { return diff; } -static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { +static inline void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER, xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER, @@ -59,13 +59,13 @@ static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { clamp_mv(mv, &mv_limits); } -static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) { +static inline int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) { return candidate->mv[which_mv]; } // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. 
-static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, +static inline int is_inside(const TileInfo *const tile, int mi_col, int mi_row, const POSITION *mi_pos) { return !(mi_row + mi_pos->row < tile->mi_row_start || mi_col + mi_pos->col < tile->mi_col_start || @@ -73,19 +73,19 @@ static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, mi_col + mi_pos->col >= tile->mi_col_end); } -static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row, +static inline int find_valid_row_offset(const TileInfo *const tile, int mi_row, int row_offset) { return clamp(row_offset, tile->mi_row_start - mi_row, tile->mi_row_end - mi_row - 1); } -static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col, +static inline int find_valid_col_offset(const TileInfo *const tile, int mi_col, int col_offset) { return clamp(col_offset, tile->mi_col_start - mi_col, tile->mi_col_end - mi_col - 1); } -static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) { +static inline void lower_mv_precision(MV *mv, int allow_hp, int is_integer) { if (is_integer) { integer_mv_precision(mv); } else { @@ -96,7 +96,7 @@ static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) { } } -static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) { +static inline int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) { // Single ref pred if (rf[1] <= INTRA_FRAME) return -1; @@ -110,7 +110,7 @@ static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) { return -1; } -static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) { +static inline int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) { if (rf[1] > INTRA_FRAME) { const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf); if (uni_comp_ref_idx >= 0) { @@ -149,7 +149,7 @@ static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = { }; // clang-format on -static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf, +static inline void av1_set_ref_frame(MV_REFERENCE_FRAME *rf, MV_REFERENCE_FRAME ref_frame_type) { if (ref_frame_type >= REF_FRAMES) { rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0]; @@ -167,7 +167,7 @@ static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = { { 4, 4, 5, 6, 7 }, }; -static INLINE int16_t av1_mode_context_analyzer( +static inline int16_t av1_mode_context_analyzer( const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) { const int8_t ref_frame = av1_ref_frame_type(rf); @@ -182,7 +182,7 @@ static INLINE int16_t av1_mode_context_analyzer( return comp_ctx; } -static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) { +static inline uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) { if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL) return 0; @@ -206,7 +206,7 @@ void av1_setup_motion_field(AV1_COMMON *cm); void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, int lst_map_idx, int gld_map_idx); -static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) { +static inline void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) { av1_zero(xd->neighbors_ref_counts); uint8_t *const ref_counts = xd->neighbors_ref_counts; @@ -262,7 +262,7 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, #define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64) -static INLINE 
void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile, +static inline void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile, int mib_size, int mi_row) { if (mi_row - mib_size < tile->mi_row_start) { ref_dv->as_fullmv.row = 0; @@ -274,7 +274,7 @@ static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile, convert_fullmv_to_mv(ref_dv); } -static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, +static inline int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize, int mib_size_log2) { const int bw = block_size_wide[bsize]; diff --git a/av1/common/obmc.h b/av1/common/obmc.h index 6a4595c852..db408bd5e7 100644 --- a/av1/common/obmc.h +++ b/av1/common/obmc.h @@ -17,7 +17,7 @@ typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row, int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes); -static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, +static inline void foreach_overlappable_nb_above(const AV1_COMMON *cm, MACROBLOCKD *xd, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { @@ -54,7 +54,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, } } -static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, +static inline void foreach_overlappable_nb_left(const AV1_COMMON *cm, MACROBLOCKD *xd, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { diff --git a/av1/common/ppc/cfl_ppc.c b/av1/common/ppc/cfl_ppc.c index 675d7f3859..36defe04ec 100644 --- a/av1/common/ppc/cfl_ppc.c +++ b/av1/common/ppc/cfl_ppc.c @@ -32,7 +32,7 @@ typedef vector signed int int32x4_t; // NOLINT(runtime/int) typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int) typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int) -static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst, +static inline void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst, int width, int height, int round_offset, int num_pel_log2) { // int16_t *dst = dst_ptr; diff --git a/av1/common/pred_common.h b/av1/common/pred_common.h index 7bc7cc0417..b5d144853a 100644 --- a/av1/common/pred_common.h +++ b/av1/common/pred_common.h @@ -23,7 +23,7 @@ extern "C" { #endif -static INLINE uint8_t get_segment_id( +static inline uint8_t get_segment_id( const CommonModeInfoParams *const mi_params, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * mi_params->mi_cols + mi_col; @@ -45,7 +45,7 @@ static INLINE uint8_t get_segment_id( return segment_id; } -static INLINE uint8_t av1_get_spatial_seg_pred(const AV1_COMMON *const cm, +static inline uint8_t av1_get_spatial_seg_pred(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, int *cdf_index, int skip_over4x4) { @@ -90,7 +90,7 @@ static INLINE uint8_t av1_get_spatial_seg_pred(const AV1_COMMON *const cm, return (prev_ul == prev_u) ? prev_u : prev_l; } -static INLINE uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { +static inline uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; const int above_sip = (above_mi != NULL) ? 
above_mi->seg_id_predicted : 0; @@ -99,7 +99,7 @@ static INLINE uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { return above_sip + left_sip; } -static INLINE int get_comp_index_context(const AV1_COMMON *cm, +static inline int get_comp_index_context(const AV1_COMMON *cm, const MACROBLOCKD *xd) { MB_MODE_INFO *mbmi = xd->mi[0]; const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); @@ -138,7 +138,7 @@ static INLINE int get_comp_index_context(const AV1_COMMON *cm, return above_ctx + left_ctx + 3 * offset; } -static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) { +static inline int get_comp_group_idx_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; int above_ctx = 0, left_ctx = 0; @@ -159,12 +159,12 @@ static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) { return AOMMIN(5, above_ctx + left_ctx); } -static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id( +static inline aom_cdf_prob *av1_get_pred_cdf_seg_id( struct segmentation_probs *segp, const MACROBLOCKD *xd) { return segp->pred_cdf[av1_get_pred_context_seg_id(xd)]; } -static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) { +static inline int av1_get_skip_mode_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; const int above_skip_mode = above_mi ? above_mi->skip_mode : 0; @@ -172,7 +172,7 @@ static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) { return above_skip_mode + left_skip_mode; } -static INLINE int av1_get_skip_txfm_context(const MACROBLOCKD *xd) { +static inline int av1_get_skip_txfm_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; const int above_skip_txfm = above_mi ? 
above_mi->skip_txfm : 0; @@ -189,12 +189,12 @@ int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir); int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, uint16_t *cache); -static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { +static inline int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8]; } -static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) { +static inline int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; int ctx = 0; @@ -207,11 +207,11 @@ int av1_get_intra_inter_context(const MACROBLOCKD *xd); int av1_get_reference_mode_context(const MACROBLOCKD *xd); -static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { +static inline aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)]; } -static INLINE aom_cdf_prob *av1_get_skip_txfm_cdf(const MACROBLOCKD *xd) { +static inline aom_cdf_prob *av1_get_skip_txfm_cdf(const MACROBLOCKD *xd) { return xd->tile_ctx->skip_txfm_cdfs[av1_get_skip_txfm_context(xd)]; } @@ -225,25 +225,25 @@ int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd); int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd); -static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf( +static inline aom_cdf_prob *av1_get_comp_reference_type_cdf( const MACROBLOCKD *xd) { const int pred_context = av1_get_comp_reference_type_context(xd); return xd->tile_ctx->comp_ref_type_cdf[pred_context]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p( +static inline aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd); return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1( +static inline aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd); return xd->tile_ctx->uni_comp_ref_cdf[pred_context][1]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2( +static inline aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd); return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2]; @@ -261,30 +261,30 @@ int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd); int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd); -static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) { +static inline aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_ref_p(xd); return xd->tile_ctx->comp_ref_cdf[pred_context][0]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1( +static inline aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_ref_p1(xd); return xd->tile_ctx->comp_ref_cdf[pred_context][1]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2( +static inline aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_ref_p2(xd); return xd->tile_ctx->comp_ref_cdf[pred_context][2]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p( 
+static inline aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_bwdref_p(xd); return xd->tile_ctx->comp_bwdref_cdf[pred_context][0]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1( +static inline aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_bwdref_p1(xd); return xd->tile_ctx->comp_bwdref_cdf[pred_context][1]; @@ -304,32 +304,32 @@ int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd); int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd); -static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1( +static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p1( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2( +static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p2( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3( +static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p3( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4( +static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p4( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5( +static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p5( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4]; } -static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6( +static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p6( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5]; @@ -339,7 +339,7 @@ static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6( // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. // The prediction flags in these dummy entries are initialized to 0. -static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { +static inline int get_tx_size_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c index dc033b139c..6a295844b0 100644 --- a/av1/common/quant_common.c +++ b/av1/common/quant_common.c @@ -251,7 +251,7 @@ const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel, // Returns true if the tx_type corresponds to non-identity transform in both // horizontal and vertical directions. 
-static INLINE bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); } +static inline bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); } const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params, const MACROBLOCKD *xd, int plane, diff --git a/av1/common/quant_common.h b/av1/common/quant_common.h index 347b02bcf5..1cd48c6c59 100644 --- a/av1/common/quant_common.h +++ b/av1/common/quant_common.h @@ -54,7 +54,7 @@ bool av1_use_qmatrix(const struct CommonQuantParams *quant_params, // Reduce the large number of quantizers to a smaller number of levels for which // different matrices may be defined -static INLINE int aom_get_qmlevel(int qindex, int first, int last) { +static inline int aom_get_qmlevel(int qindex, int first, int last) { return first + (qindex * (last + 1 - first)) / QINDEX_RANGE; } diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c index 418ed75ba5..262fec72e9 100644 --- a/av1/common/reconinter.c +++ b/av1/common/reconinter.c @@ -778,7 +778,7 @@ const uint8_t *av1_get_obmc_mask(int length) { } } -static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row, +static inline void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *mi, void *fun_ctxt, const int num_planes) { @@ -841,7 +841,7 @@ struct obmc_inter_pred_ctxt { int *adjacent_stride; }; -static INLINE void build_obmc_inter_pred_above( +static inline void build_obmc_inter_pred_above( MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) { (void)above_mi; @@ -880,7 +880,7 @@ static INLINE void build_obmc_inter_pred_above( } } -static INLINE void build_obmc_inter_pred_left( +static inline void build_obmc_inter_pred_left( MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) { (void)left_mi; diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h index d7a4d12c07..b19f1f0635 100644 --- a/av1/common/reconinter.h +++ b/av1/common/reconinter.h @@ -234,11 +234,11 @@ void av1_init_warp_params(InterPredParams *inter_pred_params, const WarpTypesAllowed *warp_types, int ref, const MACROBLOCKD *xd, const MB_MODE_INFO *mi); -static INLINE int has_scale(int xs, int ys) { +static inline int has_scale(int xs, int ys) { return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; } -static INLINE void revert_scale_extra_bits(SubpelParams *sp) { +static inline void revert_scale_extra_bits(SubpelParams *sp) { sp->subpel_x >>= SCALE_EXTRA_BITS; sp->subpel_y >>= SCALE_EXTRA_BITS; sp->xs >>= SCALE_EXTRA_BITS; @@ -249,7 +249,7 @@ static INLINE void revert_scale_extra_bits(SubpelParams *sp) { assert(sp->ys <= SUBPEL_SHIFTS); } -static INLINE void inter_predictor( +static inline void inter_predictor( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const SubpelParams *subpel_params, int w, int h, ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) { @@ -269,7 +269,7 @@ static INLINE void inter_predictor( } } -static INLINE void highbd_inter_predictor( +static inline void highbd_inter_predictor( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const SubpelParams *subpel_params, int w, int h, ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2], @@ -294,7 +294,7 @@ void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi); int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const 
struct macroblockd_plane *pd, int dir); -static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, +static inline int is_interinter_compound_used(COMPOUND_TYPE type, BLOCK_SIZE sb_type) { const int comp_allowed = is_comp_ref_allowed(sb_type); switch (type) { @@ -307,7 +307,7 @@ static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, } } -static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) { +static inline int is_any_masked_compound_used(BLOCK_SIZE sb_type) { COMPOUND_TYPE comp_type; int i; if (!is_comp_ref_allowed(sb_type)) return 0; @@ -320,11 +320,11 @@ static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) { return 0; } -static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) { +static inline int get_wedge_types_lookup(BLOCK_SIZE sb_type) { return av1_wedge_params_lookup[sb_type].wedge_types; } -static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) { +static inline int av1_is_wedge_used(BLOCK_SIZE sb_type) { return av1_wedge_params_lookup[sb_type].wedge_types > 0; } @@ -338,7 +338,7 @@ void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, const SubpelParams *subpel_params); // TODO(jkoleszar): yet another mv clamping function :-( -static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, +static inline MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, int bw, int bh, int ss_x, int ss_y) { // If the MV points so far into the UMV border that no visible pixels @@ -364,7 +364,7 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, return clamped_mv; } -static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, +static inline int64_t scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *sf) { int x, y; @@ -381,7 +381,7 @@ static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, return (int64_t)y * stride + x; } -static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, +static inline void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, uint8_t *src, int width, int height, int stride, int mi_row, int mi_col, const struct scale_factors *scale, @@ -409,13 +409,13 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const struct scale_factors *sf, const int num_planes); -static INLINE void set_default_interp_filters( +static inline void set_default_interp_filters( MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) { mbmi->interp_filters = av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter)); } -static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { +static inline int av1_is_interp_needed(const MACROBLOCKD *const xd) { const MB_MODE_INFO *const mbmi = xd->mi[0]; if (mbmi->skip_mode) return 0; if (mbmi->motion_mode == WARPED_CAUSAL) return 0; @@ -451,7 +451,7 @@ void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd); void av1_init_wedge_masks(void); -static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, +static inline const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, int8_t wedge_sign, BLOCK_SIZE sb_type) { return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c index 3fd1860011..58a7ca8047 100644 --- a/av1/common/reconintra.c +++ b/av1/common/reconintra.c @@ -1622,7 +1622,7 @@ static void highbd_build_non_directional_intra_predictors( } #endif // 
CONFIG_AV1_HIGHBITDEPTH -static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, +static inline BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { assert(subsampling_x >= 0 && subsampling_x < 2); assert(subsampling_y >= 0 && subsampling_y < 2); diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h index 80c15d03e0..0910a68bd6 100644 --- a/av1/common/reconintra.h +++ b/av1/common/reconintra.h @@ -48,31 +48,31 @@ static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = { #define FILTER_INTRA_SCALE_BITS 4 -static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) { +static inline int av1_is_directional_mode(PREDICTION_MODE mode) { return mode >= V_PRED && mode <= D67_PRED; } -static INLINE int av1_is_diagonal_mode(PREDICTION_MODE mode) { +static inline int av1_is_diagonal_mode(PREDICTION_MODE mode) { return mode >= D45_PRED && mode <= D67_PRED; } -static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) { +static inline int av1_use_angle_delta(BLOCK_SIZE bsize) { return bsize >= BLOCK_8X8; } -static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) { +static inline int av1_allow_intrabc(const AV1_COMMON *const cm) { return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools && cm->features.allow_intrabc; } -static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, +static inline int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, BLOCK_SIZE bs) { if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0; return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32; } -static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm, +static inline int av1_filter_intra_allowed(const AV1_COMMON *const cm, const MB_MODE_INFO *mbmi) { return mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0 && @@ -119,7 +119,7 @@ static const int16_t dr_intra_derivative[90] = { // If angle > 0 && angle < 90, dx = -((int)(256 / t)); // If angle > 90 && angle < 180, dx = (int)(256 / t); // If angle > 180 && angle < 270, dx = 1; -static INLINE int av1_get_dx(int angle) { +static inline int av1_get_dx(int angle) { if (angle > 0 && angle < 90) { return dr_intra_derivative[angle]; } else if (angle > 90 && angle < 180) { @@ -134,7 +134,7 @@ static INLINE int av1_get_dx(int angle) { // If angle > 0 && angle < 90, dy = 1; // If angle > 90 && angle < 180, dy = (int)(256 * t); // If angle > 180 && angle < 270, dy = -((int)(256 * t)); -static INLINE int av1_get_dy(int angle) { +static inline int av1_get_dy(int angle) { if (angle > 90 && angle < 180) { return dr_intra_derivative[angle - 90]; } else if (angle > 180 && angle < 270) { @@ -145,7 +145,7 @@ static INLINE int av1_get_dy(int angle) { } } -static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, +static inline int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, int type) { const int d = abs(delta); const int blk_wh = bs0 + bs1; diff --git a/av1/common/resize.h b/av1/common/resize.h index 25551a5fc3..c8939bf786 100644 --- a/av1/common/resize.h +++ b/av1/common/resize.h @@ -107,7 +107,7 @@ void down2_symeven(const uint8_t *const input, int length, uint8_t *output, bool should_resize_by_half(int height, int width, int height2, int width2); // Returns 1 if a superres upscaled frame is scaled and 0 otherwise. 
-static INLINE int av1_superres_scaled(const AV1_COMMON *cm) { +static inline int av1_superres_scaled(const AV1_COMMON *cm) { // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling // required even though cm->superres_scale_denominator != SCALE_NUMERATOR. // So, the following check is more accurate. @@ -124,7 +124,7 @@ static INLINE int av1_superres_scaled(const AV1_COMMON *cm) { // // Use the non-normative scaler av1_resize_and_extend_frame_nonnormative() // for other scaling ratios. -static INLINE bool av1_has_optimized_scaler(const int src_width, +static inline bool av1_has_optimized_scaler(const int src_width, const int src_height, const int dst_width, const int dst_height) { diff --git a/av1/common/restoration.h b/av1/common/restoration.h index 1603984779..a73190e1dc 100644 --- a/av1/common/restoration.h +++ b/av1/common/restoration.h @@ -294,12 +294,12 @@ typedef struct { /*!\cond */ -static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) { +static inline void set_default_sgrproj(SgrprojInfo *sgrproj_info) { sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2; sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2; } -static INLINE void set_default_wiener(WienerInfo *wiener_info) { +static inline void set_default_wiener(WienerInfo *wiener_info) { wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV; wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV; wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV; diff --git a/av1/common/scale.h b/av1/common/scale.h index bc8326c844..e35d54d0ee 100644 --- a/av1/common/scale.h +++ b/av1/common/scale.h @@ -33,7 +33,7 @@ struct scale_factors { }; // Note: Expect val to be in q4 precision -static INLINE int av1_scaled_x(int val, const struct scale_factors *sf) { +static inline int av1_scaled_x(int val, const struct scale_factors *sf) { const int off = (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); const int64_t tval = (int64_t)val * sf->x_scale_fp + off; @@ -42,7 +42,7 @@ static INLINE int av1_scaled_x(int val, const struct scale_factors *sf) { } // Note: Expect val to be in q4 precision -static INLINE int av1_scaled_y(int val, const struct scale_factors *sf) { +static inline int av1_scaled_y(int val, const struct scale_factors *sf) { const int off = (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); const int64_t tval = (int64_t)val * sf->y_scale_fp + off; @@ -51,7 +51,7 @@ static INLINE int av1_scaled_y(int val, const struct scale_factors *sf) { } // Note: Expect val to be in q4 precision -static INLINE int av1_unscaled_value(int val, const struct scale_factors *sf) { +static inline int av1_unscaled_value(int val, const struct scale_factors *sf) { (void)sf; return val * (1 << SCALE_EXTRA_BITS); } @@ -61,20 +61,20 @@ MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); -static INLINE int av1_is_valid_scale(const struct scale_factors *sf) { +static inline int av1_is_valid_scale(const struct scale_factors *sf) { assert(sf != NULL); return sf->x_scale_fp != REF_INVALID_SCALE && sf->y_scale_fp != REF_INVALID_SCALE; } -static INLINE int av1_is_scaled(const struct scale_factors *sf) { +static inline int av1_is_scaled(const struct scale_factors *sf) { assert(sf != NULL); return av1_is_valid_scale(sf) && (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp 
!= REF_NO_SCALE); } // See AV1 spec, Section 6.8.6. Frame size with refs semantics. -static INLINE int valid_ref_frame_size(int ref_width, int ref_height, +static inline int valid_ref_frame_size(int ref_width, int ref_height, int this_width, int this_height) { return 2 * this_width >= ref_width && 2 * this_height >= ref_height && this_width <= 16 * ref_width && this_height <= 16 * ref_height; diff --git a/av1/common/scan.h b/av1/common/scan.h index ee6375a2d4..fd428d3083 100644 --- a/av1/common/scan.h +++ b/av1/common/scan.h @@ -38,12 +38,12 @@ extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES]; void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd); -static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size, +static inline const SCAN_ORDER *get_default_scan(TX_SIZE tx_size, TX_TYPE tx_type) { return &av1_scan_orders[tx_size][tx_type]; } -static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) { +static inline const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) { return get_default_scan(tx_size, tx_type); } diff --git a/av1/common/seg_common.h b/av1/common/seg_common.h index 1ad1ae2213..aa7cd68a9a 100644 --- a/av1/common/seg_common.h +++ b/av1/common/seg_common.h @@ -58,13 +58,13 @@ struct segmentation_probs { [CDF_SIZE(MAX_SEGMENTS)]; }; -static INLINE int segfeature_active(const struct segmentation *seg, +static inline int segfeature_active(const struct segmentation *seg, uint8_t segment_id, SEG_LVL_FEATURES feature_id) { return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id)); } -static INLINE void segfeatures_copy(struct segmentation *dst, +static inline void segfeatures_copy(struct segmentation *dst, const struct segmentation *src) { int i, j; for (i = 0; i < MAX_SEGMENTS; i++) { @@ -91,7 +91,7 @@ int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id); void av1_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data); -static INLINE int get_segdata(const struct segmentation *seg, int segment_id, +static inline int get_segdata(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { return seg->feature_data[segment_id][feature_id]; } diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c index 6b12bb7708..d25ba08447 100644 --- a/av1/common/thread_common.c +++ b/av1/common/thread_common.c @@ -29,7 +29,7 @@ #include "av1/common/restoration.h" // Set up nsync by width. -static INLINE int get_sync_range(int width) { +static inline int get_sync_range(int width) { // nsync numbers are picked by testing. For example, for 4k // video, using 4 gives best performance. if (width < 640) @@ -42,7 +42,7 @@ static INLINE int get_sync_range(int width) { return 8; } -static INLINE int get_lr_sync_range(int width) { +static inline int get_lr_sync_range(int width) { #if 0 // nsync numbers are picked by testing. For example, for 4k // video, using 4 gives best performance. 
@@ -170,7 +170,7 @@ void av1_free_cdef_sync(AV1CdefSync *cdef_sync) { #endif // CONFIG_MULTITHREAD } -static INLINE void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync, +static inline void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync, int row) { if (!row) return; #if CONFIG_MULTITHREAD @@ -186,7 +186,7 @@ static INLINE void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync, #endif // CONFIG_MULTITHREAD } -static INLINE void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync, +static inline void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync, int row) { #if CONFIG_MULTITHREAD AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; @@ -200,7 +200,7 @@ static INLINE void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync, #endif // CONFIG_MULTITHREAD } -static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c, +static inline void sync_read(AV1LfSync *const lf_sync, int r, int c, int plane) { #if CONFIG_MULTITHREAD const int nsync = lf_sync->sync_range; @@ -222,7 +222,7 @@ static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c, #endif // CONFIG_MULTITHREAD } -static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, +static inline void sync_write(AV1LfSync *const lf_sync, int r, int c, const int sb_cols, int plane) { #if CONFIG_MULTITHREAD const int nsync = lf_sync->sync_range; @@ -519,7 +519,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, } } -static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) { +static inline void lr_sync_read(void *const lr_sync, int r, int c, int plane) { #if CONFIG_MULTITHREAD AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; const int nsync = loop_res_sync->sync_range; @@ -541,7 +541,7 @@ static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) { #endif // CONFIG_MULTITHREAD } -static INLINE void lr_sync_write(void *const lr_sync, int r, int c, +static inline void lr_sync_write(void *const lr_sync, int r, int c, const int sb_cols, int plane) { #if CONFIG_MULTITHREAD AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; diff --git a/av1/common/txb_common.h b/av1/common/txb_common.h index e1f6104eb2..2a1008c6bb 100644 --- a/av1/common/txb_common.h +++ b/av1/common/txb_common.h @@ -47,30 +47,30 @@ static const TX_CLASS tx_type_to_class[TX_TYPES] = { TX_CLASS_HORIZ, // H_FLIPADST }; -static INLINE int get_txb_bhl(TX_SIZE tx_size) { +static inline int get_txb_bhl(TX_SIZE tx_size) { tx_size = av1_get_adjusted_tx_size(tx_size); return tx_size_high_log2[tx_size]; } -static INLINE int get_txb_wide(TX_SIZE tx_size) { +static inline int get_txb_wide(TX_SIZE tx_size) { tx_size = av1_get_adjusted_tx_size(tx_size); return tx_size_wide[tx_size]; } -static INLINE int get_txb_high(TX_SIZE tx_size) { +static inline int get_txb_high(TX_SIZE tx_size) { tx_size = av1_get_adjusted_tx_size(tx_size); return tx_size_high[tx_size]; } -static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int height) { +static inline uint8_t *set_levels(uint8_t *const levels_buf, const int height) { return levels_buf + TX_PAD_TOP * (height + TX_PAD_HOR); } -static INLINE int get_padded_idx(const int idx, const int bhl) { +static inline int get_padded_idx(const int idx, const int bhl) { return idx + ((idx >> bhl) << TX_PAD_HOR_LOG2); } -static INLINE int get_br_ctx_2d(const uint8_t *const levels, +static inline int get_br_ctx_2d(const uint8_t *const levels, const int c, // raster order const int bhl) { assert(c > 0); @@ -226,14 +226,14 @@ static AOM_FORCE_INLINE 
int get_nz_map_ctx_from_stats( typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)]; typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)]; -static INLINE int get_lower_levels_ctx_eob(int bhl, int width, int scan_idx) { +static inline int get_lower_levels_ctx_eob(int bhl, int width, int scan_idx) { if (scan_idx == 0) return 0; if (scan_idx <= (width << bhl) / 8) return 1; if (scan_idx <= (width << bhl) / 4) return 2; return 3; } -static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx, +static inline int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx, int bhl, TX_SIZE tx_size) { assert(coeff_idx > 0); int mag; @@ -257,7 +257,7 @@ static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels, return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class); } -static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx, +static inline int get_lower_levels_ctx_general(int is_last, int scan_idx, int bhl, int width, const uint8_t *levels, int coeff_idx, TX_SIZE tx_size, @@ -271,7 +271,7 @@ static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx, return get_lower_levels_ctx(levels, coeff_idx, bhl, tx_size, tx_class); } -static INLINE void set_dc_sign(int *cul_level, int dc_val) { +static inline void set_dc_sign(int *cul_level, int dc_val) { if (dc_val < 0) *cul_level |= 1 << COEFF_CONTEXT_BITS; else if (dc_val > 0) @@ -443,7 +443,7 @@ SPECIALIZE_GET_TXB_CTX(32, 32) // Wrapper for get_txb_ctx that calls the specialized version of get_txb_ctc_* // so that the compiler can compile away the while loops. -static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, +static inline void get_txb_ctx(const BLOCK_SIZE plane_bsize, const TX_SIZE tx_size, const int plane, const ENTROPY_CONTEXT *const a, const ENTROPY_CONTEXT *const l, diff --git a/av1/common/x86/av1_inv_txfm_avx2.c b/av1/common/x86/av1_inv_txfm_avx2.c index 0639fb481f..ad9a4b76ba 100644 --- a/av1/common/x86/av1_inv_txfm_avx2.c +++ b/av1/common/x86/av1_inv_txfm_avx2.c @@ -24,7 +24,7 @@ static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, 4 * 5793 }; -static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, +static inline void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); @@ -38,7 +38,7 @@ static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, btf_16_adds_subs_avx2(&x1[14], &x1[13]); } -static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, +static inline void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); @@ -50,7 +50,7 @@ static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); } -static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { +static inline void idct16_stage7_avx2(__m256i *output, __m256i *x1) { btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); @@ -221,7 +221,7 @@ static void idct16_low1_avx2(const __m256i *input, __m256i *output) { output[15] = 
x1[0]; } -static INLINE void iadst16_stage3_avx2(__m256i *x) { +static inline void iadst16_stage3_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[0], &x[8]); btf_16_adds_subs_avx2(&x[1], &x[9]); btf_16_adds_subs_avx2(&x[2], &x[10]); @@ -232,7 +232,7 @@ static INLINE void iadst16_stage3_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[7], &x[15]); } -static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, +static inline void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); @@ -246,7 +246,7 @@ static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); } -static INLINE void iadst16_stage5_avx2(__m256i *x) { +static inline void iadst16_stage5_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[0], &x[4]); btf_16_adds_subs_avx2(&x[1], &x[5]); btf_16_adds_subs_avx2(&x[2], &x[6]); @@ -257,7 +257,7 @@ static INLINE void iadst16_stage5_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[11], &x[15]); } -static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, +static inline void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); @@ -268,7 +268,7 @@ static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); } -static INLINE void iadst16_stage7_avx2(__m256i *x) { +static inline void iadst16_stage7_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[0], &x[2]); btf_16_adds_subs_avx2(&x[1], &x[3]); btf_16_adds_subs_avx2(&x[4], &x[6]); @@ -279,7 +279,7 @@ static INLINE void iadst16_stage7_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[13], &x[15]); } -static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, +static inline void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); @@ -289,7 +289,7 @@ static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); } -static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { +static inline void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { const __m256i __zero = _mm256_setzero_si256(); output[0] = x1[0]; output[1] = _mm256_subs_epi16(__zero, x1[8]); @@ -462,7 +462,7 @@ static void iadst16_low1_avx2(const __m256i *input, __m256i *output) { iadst16_stage9_avx2(output, x1); } -static INLINE void idct32_high16_stage3_avx2(__m256i *x) { +static inline void idct32_high16_stage3_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[16], &x[17]); btf_16_adds_subs_avx2(&x[19], &x[18]); btf_16_adds_subs_avx2(&x[20], &x[21]); @@ -473,7 +473,7 @@ static INLINE void idct32_high16_stage3_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[31], &x[30]); } -static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, +static inline void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = 
pair_set_w16_epi16(cospi[56], cospi[8]); @@ -487,7 +487,7 @@ static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); } -static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, +static inline void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); @@ -504,7 +504,7 @@ static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, btf_16_adds_subs_avx2(&x[30], &x[29]); } -static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, +static inline void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); @@ -522,7 +522,7 @@ static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); } -static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, +static inline void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); @@ -542,7 +542,7 @@ static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, btf_16_adds_subs_avx2(&x[28], &x[27]); } -static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, +static inline void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); @@ -560,7 +560,7 @@ static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); } -static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { +static inline void idct32_stage9_avx2(__m256i *output, __m256i *x) { btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]); btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); @@ -882,7 +882,7 @@ static void idct32_avx2(const __m256i *input, __m256i *output) { idct32_stage9_avx2(output, x1); } -static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, +static inline void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); @@ -907,7 +907,7 @@ static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); } -static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, +static inline void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); @@ -938,7 +938,7 @@ static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, btf_16_adds_subs_avx2(&x[62], &x[61]); } -static INLINE void 
idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, +static inline void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); @@ -957,7 +957,7 @@ static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); } -static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, +static inline void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { btf_16_adds_subs_avx2(&x[16], &x[19]); btf_16_adds_subs_avx2(&x[17], &x[18]); @@ -970,7 +970,7 @@ static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); } -static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, +static inline void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); @@ -998,7 +998,7 @@ static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, btf_16_adds_subs_avx2(&x[60], &x[59]); } -static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, +static inline void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); @@ -1022,7 +1022,7 @@ static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); } -static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, +static inline void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); @@ -1057,7 +1057,7 @@ static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, btf_16_adds_subs_avx2(&x[56], &x[55]); } -static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, +static inline void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); @@ -1088,7 +1088,7 @@ static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); } -static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { +static inline void idct64_stage11_avx2(__m256i *output, __m256i *x) { btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); @@ -1629,7 +1629,7 @@ static const transform_1d_avx2 }; // only process w >= 16 h >= 16 -static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( +static inline void lowbd_inv_txfm2d_add_no_identity_avx2( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { __m256i buf1[64 * 16]; @@ -1698,7 +1698,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( } } -static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, +static inline void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, int stride, int shift, int height, int 
txw_idx, int rect_type) { const int32_t *input_row = input; @@ -1737,7 +1737,7 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, } } -static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, +static inline void iidentity_col_16xn_avx2(uint8_t *output, int stride, __m256i *buf, int shift, int height, int txh_idx) { const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); @@ -1762,7 +1762,7 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, } } -static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, +static inline void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, uint8_t *output, int stride, TX_SIZE tx_size, int32_t eob) { @@ -1789,7 +1789,7 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, } } -static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( +static inline void lowbd_inv_txfm2d_add_h_identity_avx2( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { int eobx, eoby; @@ -1833,7 +1833,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( } } -static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( +static inline void lowbd_inv_txfm2d_add_v_identity_avx2( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { __m256i buf1[64]; @@ -1892,7 +1892,7 @@ static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = { { av1_iadst8_low1_ssse3, av1_iadst8_sse2 } }; -static INLINE void load_buffer_avx2(const int32_t *in, int stride, +static inline void load_buffer_avx2(const int32_t *in, int stride, __m128i *out) { const __m256i a = _mm256_load_si256((const __m256i *)in); const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1)); @@ -1931,7 +1931,7 @@ static INLINE void load_buffer_avx2(const int32_t *in, int stride, out[7] = _mm256_extractf128_si256(gh, 1); } -static INLINE void round_and_transpose_avx2(const __m128i *const in, +static inline void round_and_transpose_avx2(const __m128i *const in, __m128i *const out, int bit, int *lr_flip) { __m256i buf_temp[4]; @@ -2009,7 +2009,7 @@ static INLINE void round_and_transpose_avx2(const __m128i *const in, out[7] = _mm256_extracti128_si256(reg_11, 1); } -static INLINE void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit, +static inline void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit, uint8_t *output, int stride, int flipud) { __m256i in_256[4], v_256[4]; @@ -2107,7 +2107,7 @@ static INLINE void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit, // AVX2 implementation has the advantage when combined multiple operations // together. 
-static INLINE void lowbd_inv_txfm2d_8x8_no_identity_avx2( +static inline void lowbd_inv_txfm2d_8x8_no_identity_avx2( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { __m128i buf1[8]; @@ -2167,7 +2167,7 @@ static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output, } // for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 -static INLINE void lowbd_inv_txfm2d_add_universe_avx2( +static inline void lowbd_inv_txfm2d_add_universe_avx2( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { (void)eob; diff --git a/av1/common/x86/av1_inv_txfm_avx2.h b/av1/common/x86/av1_inv_txfm_avx2.h index 6f02149a4f..0be45e6c5f 100644 --- a/av1/common/x86/av1_inv_txfm_avx2.h +++ b/av1/common/x86/av1_inv_txfm_avx2.h @@ -35,7 +35,7 @@ extern "C" { out1 = _mm256_mulhrs_epi16(_in, _w1); \ } while (0) -static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, +static inline void round_shift_avx2(const __m256i *input, __m256i *output, int size) { const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8); for (int i = 0; i < size; ++i) { @@ -43,7 +43,7 @@ static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, } } -static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) { +static inline void write_recon_w16_avx2(__m256i res, uint8_t *output) { __m128i pred = _mm_loadu_si128((__m128i const *)(output)); __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res); __m128i y = _mm256_castsi256_si128( @@ -51,7 +51,7 @@ static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) { _mm_storeu_si128((__m128i *)(output), y); } -static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output, +static inline void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output, int stride, int flipud, int height) { int j = flipud ? 
(height - 1) : 0; diff --git a/av1/common/x86/av1_inv_txfm_ssse3.c b/av1/common/x86/av1_inv_txfm_ssse3.c index ee40e3586c..45aa403e9f 100644 --- a/av1/common/x86/av1_inv_txfm_ssse3.c +++ b/av1/common/x86/av1_inv_txfm_ssse3.c @@ -195,7 +195,7 @@ static void idct8_w4_sse2(const __m128i *input, __m128i *output) { btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); } -static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, +static inline void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); @@ -209,7 +209,7 @@ static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, btf_16_subs_adds_sse2(x[14], x[13]); } -static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi, +static inline void idct16_stage6_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); @@ -222,7 +222,7 @@ static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); } -static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) { +static inline void idct16_stage7_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]); btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]); btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]); @@ -472,7 +472,7 @@ static void idct16_w4_sse2(const __m128i *input, __m128i *output) { idct16_stage7_sse2(output, x); } -static INLINE void idct32_high16_stage3_sse2(__m128i *x) { +static inline void idct32_high16_stage3_sse2(__m128i *x) { btf_16_adds_subs_sse2(x[16], x[17]); btf_16_subs_adds_sse2(x[19], x[18]); btf_16_adds_subs_sse2(x[20], x[21]); @@ -483,7 +483,7 @@ static INLINE void idct32_high16_stage3_sse2(__m128i *x) { btf_16_subs_adds_sse2(x[31], x[30]); } -static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi, +static inline void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); @@ -498,7 +498,7 @@ static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); } -static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi, +static inline void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); @@ -516,7 +516,7 @@ static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi, btf_16_subs_adds_sse2(x[30], x[29]); } -static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi, +static inline void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); @@ -535,7 +535,7 @@ static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); } -static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, +static inline void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); @@ 
-556,7 +556,7 @@ static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, btf_16_subs_adds_sse2(x[28], x[27]); } -static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, +static inline void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); @@ -575,7 +575,7 @@ static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); } -static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) { +static inline void idct32_stage9_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]); btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]); btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]); @@ -885,7 +885,7 @@ static void idct32_sse2(const __m128i *input, __m128i *output) { idct32_stage9_sse2(output, x); } -static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi, +static inline void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); @@ -910,7 +910,7 @@ static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); } -static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi, +static inline void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); @@ -941,7 +941,7 @@ static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi, btf_16_subs_adds_sse2(x[62], x[61]); } -static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi, +static inline void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); @@ -960,7 +960,7 @@ static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); } -static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi, +static inline void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { btf_16_adds_subs_sse2(x[16], x[19]); @@ -974,7 +974,7 @@ static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi, idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); } -static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi, +static inline void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); @@ -1002,7 +1002,7 @@ static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi, btf_16_subs_adds_sse2(x[60], x[59]); } -static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi, +static inline void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); @@ -1026,7 +1026,7 @@ static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); 
} -static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, +static inline void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); @@ -1061,7 +1061,7 @@ static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, btf_16_subs_adds_sse2(x[56], x[55]); } -static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, +static inline void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); @@ -1092,7 +1092,7 @@ static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); } -static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) { +static inline void idct64_stage11_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]); btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]); btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]); @@ -1880,7 +1880,7 @@ static void iadst8_w4_sse2(const __m128i *input, __m128i *output) { output[7] = _mm_subs_epi16(__zero, x[1]); } -static INLINE void iadst16_stage3_ssse3(__m128i *x) { +static inline void iadst16_stage3_ssse3(__m128i *x) { btf_16_adds_subs_sse2(x[0], x[8]); btf_16_adds_subs_sse2(x[1], x[9]); btf_16_adds_subs_sse2(x[2], x[10]); @@ -1891,7 +1891,7 @@ static INLINE void iadst16_stage3_ssse3(__m128i *x) { btf_16_adds_subs_sse2(x[7], x[15]); } -static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, +static inline void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); @@ -1906,7 +1906,7 @@ static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); } -static INLINE void iadst16_stage5_ssse3(__m128i *x) { +static inline void iadst16_stage5_ssse3(__m128i *x) { btf_16_adds_subs_sse2(x[0], x[4]); btf_16_adds_subs_sse2(x[1], x[5]); btf_16_adds_subs_sse2(x[2], x[6]); @@ -1917,7 +1917,7 @@ static INLINE void iadst16_stage5_ssse3(__m128i *x) { btf_16_adds_subs_sse2(x[11], x[15]); } -static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, +static inline void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); @@ -1929,7 +1929,7 @@ static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); } -static INLINE void iadst16_stage7_ssse3(__m128i *x) { +static inline void iadst16_stage7_ssse3(__m128i *x) { btf_16_adds_subs_sse2(x[0], x[2]); btf_16_adds_subs_sse2(x[1], x[3]); btf_16_adds_subs_sse2(x[4], x[6]); @@ -1940,7 +1940,7 @@ static INLINE void iadst16_stage7_ssse3(__m128i *x) { btf_16_adds_subs_sse2(x[13], x[15]); } -static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, +static inline void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); @@ -1951,7 +1951,7 @@ static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], 
x[14], x[15]); } -static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { +static inline void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { const __m128i __zero = _mm_setzero_si128(); output[0] = x[0]; output[1] = _mm_subs_epi16(__zero, x[8]); @@ -2232,14 +2232,14 @@ static void iidentity16_ssse3(const __m128i *input, __m128i *output) { } } -static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred, +static inline __m128i lowbd_get_recon_8x8_sse2(const __m128i pred, __m128i res) { const __m128i zero = _mm_setzero_si128(); __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero)); return _mm_packus_epi16(x0, x0); } -static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output, +static inline void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output, int stride, int flipud, const int height) { int j = flipud ? (height - 1) : 0; @@ -2253,7 +2253,7 @@ static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output, } } -static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, +static inline void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, int stride, int flipud, const int height) { int j = flipud ? (height - 1) : 0; @@ -2313,7 +2313,7 @@ static const transform_1d_ssse3 { NULL, NULL, NULL }, }; -static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, +static inline void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, int stride, int shift, int height, int txw_idx, int rect_type) { const int32_t *input_row = input; @@ -2352,7 +2352,7 @@ static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, } } -static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride, +static inline void iidentity_col_8xn_ssse3(uint8_t *output, int stride, __m128i *buf, int shift, int height, int txh_idx) { const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]); @@ -2441,7 +2441,7 @@ static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } -static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, +static inline __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, __m128i res0, __m128i res1) { const __m128i zero = _mm_setzero_si128(); __m128i x0 = _mm_unpacklo_epi8(pred, zero); @@ -2451,7 +2451,7 @@ static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, return _mm_packus_epi16(x0, x1); } -static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, +static inline void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, int stride, int flipud, int height) { int j = flipud ? 
(height - 1) : 0; @@ -2463,7 +2463,7 @@ static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, } } -static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, +static inline void round_shift_ssse3(const __m128i *input, __m128i *output, int size) { const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8); for (int i = 0; i < size; ++i) { @@ -2471,7 +2471,7 @@ static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, } } -static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( +static inline void lowbd_inv_txfm2d_add_no_identity_ssse3( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { __m128i buf1[64 * 8]; @@ -2642,7 +2642,7 @@ void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input, } // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64 -static INLINE void lowbd_inv_txfm2d_add_universe_ssse3( +static inline void lowbd_inv_txfm2d_add_universe_ssse3( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { switch (tx_type) { diff --git a/av1/common/x86/av1_inv_txfm_ssse3.h b/av1/common/x86/av1_inv_txfm_ssse3.h index b6ffc392d5..888fcc5734 100644 --- a/av1/common/x86/av1_inv_txfm_ssse3.h +++ b/av1/common/x86/av1_inv_txfm_ssse3.h @@ -57,7 +57,7 @@ extern "C" { out1 = _mm_subs_epi16(_in0, _in1); \ } while (0) -static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { +static inline void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { if (bit < 0) { const __m128i scale = _mm_set1_epi16(1 << (15 + bit)); for (int i = 0; i < size; ++i) { @@ -178,7 +178,7 @@ static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, }; -static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, +static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { if (eob == 1) { *eobx = 0; @@ -198,7 +198,7 @@ static int eob_fill[32] = { 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, }; -static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, +static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { eob -= 1; const int txfm_size_col = tx_size_wide[tx_size]; @@ -209,7 +209,7 @@ static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, *eoby = eob_fill[temp_eoby]; } -static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, +static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { eob -= 1; const int txfm_size_row = tx_size_high[tx_size]; diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h index 8e4cba502c..dca0d88913 100644 --- a/av1/common/x86/av1_txfm_sse2.h +++ b/av1/common/x86/av1_txfm_sse2.h @@ -25,7 +25,7 @@ extern "C" { #endif -static INLINE void btf_16_w4_sse2( +static inline void btf_16_w4_sse2( const __m128i *const w0, const __m128i *const w1, const __m128i __rounding, const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1, __m128i *const out0, __m128i *const out1) { @@ -80,29 +80,29 @@ static INLINE void btf_16_w4_sse2( out1 = _mm_packs_epi32(d0, d1); \ } while (0) -static INLINE __m128i load_16bit_to_16bit(const int16_t *a) { +static inline __m128i load_16bit_to_16bit(const int16_t *a) { return _mm_load_si128((const __m128i *)a); } -static INLINE __m128i load_32bit_to_16bit(const int32_t *a) { +static inline __m128i load_32bit_to_16bit(const 
int32_t *a) { const __m128i a_low = _mm_load_si128((const __m128i *)a); return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); } -static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) { +static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) { const __m128i a_low = _mm_load_si128((const __m128i *)a); return _mm_packs_epi32(a_low, a_low); } // Store 4 16 bit values. Sign extend the values. -static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { +static inline void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { const __m128i a_lo = _mm_unpacklo_epi16(a, a); const __m128i a_1 = _mm_srai_epi32(a_lo, 16); _mm_store_si128((__m128i *)b, a_1); } // Store 8 16 bit values. Sign extend the values. -static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) { +static inline void store_16bit_to_32bit(__m128i a, int32_t *b) { const __m128i a_lo = _mm_unpacklo_epi16(a, a); const __m128i a_hi = _mm_unpackhi_epi16(a, a); const __m128i a_1 = _mm_srai_epi32(a_lo, 16); @@ -111,13 +111,13 @@ static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) { _mm_store_si128((__m128i *)(b + 4), a_2); } -static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) { +static inline __m128i scale_round_sse2(const __m128i a, const int scale) { const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1)); const __m128i b = _mm_madd_epi16(a, scale_rounding); return _mm_srai_epi32(b, NewSqrt2Bits); } -static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a, +static inline void store_rect_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { const __m128i one = _mm_set1_epi16(1); const __m128i a_lo = _mm_unpacklo_epi16(a, one); @@ -125,7 +125,7 @@ static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a, _mm_store_si128((__m128i *)b, b_lo); } -static INLINE void store_rect_16bit_to_32bit(const __m128i a, +static inline void store_rect_16bit_to_32bit(const __m128i a, int32_t *const b) { const __m128i one = _mm_set1_epi16(1); const __m128i a_lo = _mm_unpacklo_epi16(a, one); @@ -136,7 +136,7 @@ static INLINE void store_rect_16bit_to_32bit(const __m128i a, _mm_store_si128((__m128i *)(b + 4), b_hi); } -static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in, +static inline void load_buffer_16bit_to_16bit_w4(const int16_t *const in, const int stride, __m128i *const out, const int out_size) { @@ -145,7 +145,7 @@ static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in, } } -static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, +static inline void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, const int stride, __m128i *const out, const int out_size) { @@ -154,14 +154,14 @@ static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, } } -static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride, +static inline void load_buffer_16bit_to_16bit(const int16_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = load_16bit_to_16bit(in + i * stride); } } -static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in, +static inline void load_buffer_16bit_to_16bit_flip(const int16_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { @@ -169,21 +169,21 @@ static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in, } } -static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride, +static inline void 
load_buffer_32bit_to_16bit(const int32_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = load_32bit_to_16bit(in + i * stride); } } -static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, +static inline void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = load_32bit_to_16bit_w4(in + i * stride); } } -static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in, +static inline void load_buffer_32bit_to_16bit_flip(const int32_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { @@ -191,7 +191,7 @@ static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in, } } -static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in, +static inline void store_buffer_16bit_to_32bit_w4(const __m128i *const in, int32_t *const out, const int stride, const int out_size) { @@ -200,7 +200,7 @@ static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in, } } -static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in, +static inline void store_buffer_16bit_to_32bit_w8(const __m128i *const in, int32_t *const out, const int stride, const int out_size) { @@ -209,7 +209,7 @@ static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in, } } -static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, +static inline void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, int32_t *const out, const int stride, const int out_size) { @@ -218,7 +218,7 @@ static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, } } -static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, +static inline void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, int32_t *const out, const int stride, const int out_size) { @@ -227,7 +227,7 @@ static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, } } -static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in, +static inline void store_buffer_16bit_to_16bit_8x8(const __m128i *in, uint16_t *out, const int stride) { for (int i = 0; i < 8; ++i) { @@ -235,7 +235,7 @@ static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in, } } -static INLINE void round_shift_16bit(__m128i *in, int size, int bit) { +static inline void round_shift_16bit(__m128i *in, int size, int bit) { if (bit < 0) { bit = -bit; __m128i rounding = _mm_set1_epi16(1 << (bit - 1)); @@ -250,7 +250,7 @@ static INLINE void round_shift_16bit(__m128i *in, int size, int bit) { } } -static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) { +static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) { for (int i = 0; i < size; ++i) { out[size - i - 1] = in[i]; } diff --git a/av1/common/x86/av1_txfm_sse4.h b/av1/common/x86/av1_txfm_sse4.h index bee03b40b5..10f5121174 100644 --- a/av1/common/x86/av1_txfm_sse4.h +++ b/av1/common/x86/av1_txfm_sse4.h @@ -18,14 +18,14 @@ extern "C" { #endif -static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) { +static inline __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) { __m128i tmp, round; round = _mm_set1_epi32(1 << (bit - 1)); tmp = _mm_add_epi32(vec, round); return _mm_srai_epi32(tmp, bit); } -static INLINE void av1_round_shift_array_32_sse4_1(const __m128i *input, +static inline void av1_round_shift_array_32_sse4_1(const __m128i *input, __m128i *output, const int 
size, const int bit) { @@ -42,7 +42,7 @@ static INLINE void av1_round_shift_array_32_sse4_1(const __m128i *input, } } -static INLINE void av1_round_shift_rect_array_32_sse4_1(const __m128i *input, +static inline void av1_round_shift_rect_array_32_sse4_1(const __m128i *input, __m128i *output, const int size, const int bit, diff --git a/av1/common/x86/cdef_block_avx2.c b/av1/common/x86/cdef_block_avx2.c index 7a2aa11e21..a74b39612c 100644 --- a/av1/common/x86/cdef_block_avx2.c +++ b/av1/common/x86/cdef_block_avx2.c @@ -24,7 +24,7 @@ const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504, This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants are in const1 and const2. */ -static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala, +static inline __m256i fold_mul_and_sum_avx2(__m256i *partiala, __m256i *partialb, const __m256i *const1, const __m256i *const2) { @@ -49,7 +49,7 @@ static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala, return *partiala; } -static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2, +static inline __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2, __m256i *x3) { const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1); const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3); @@ -66,7 +66,7 @@ static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2, /* Computes cost for directions 0, 5, 6 and 7. We can call this function again to compute the remaining directions. */ -static INLINE __m256i compute_directions_avx2(__m256i *lines, +static inline __m256i compute_directions_avx2(__m256i *lines, int32_t cost_frist_8x8[4], int32_t cost_second_8x8[4]) { __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; @@ -148,7 +148,7 @@ static INLINE __m256i compute_directions_avx2(__m256i *lines, /* transpose and reverse the order of the lines -- equivalent to a 90-degree counter-clockwise rotation of the pixels.
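   (Editorial note, added for clarity: in index form the combined operation
   is res[7 - c][r] == in[r][c] for an 8x8 block -- the transpose alone
   gives t[c][r] == in[r][c], and reversing the order of the lines then
   sends row c to row 7 - c; e.g. the top-right input pixel in[0][7] ends
   up at the top-left output position res[0][0], as expected for a
   90-degree counter-clockwise rotation.)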
*/ -static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) { +static inline void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) { const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]); const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]); diff --git a/av1/common/x86/cfl_avx2.c b/av1/common/x86/cfl_avx2.c index b2b6cae933..4ee0b220ca 100644 --- a/av1/common/x86/cfl_avx2.c +++ b/av1/common/x86/cfl_avx2.c @@ -241,7 +241,7 @@ static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) #endif // CONFIG_AV1_HIGHBITDEPTH -static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, +static inline __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, __m256i alpha_sign, __m256i dc_q0) { __m256i ac_q3 = _mm256_loadu_si256(input); __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3); @@ -251,7 +251,7 @@ static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, return _mm256_add_epi16(scaled_luma_q0, dc_q0); } -static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3, +static inline void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3, int width, int height) { (void)width; @@ -313,7 +313,7 @@ static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) { return _mm256_max_epi16(_mm256_min_epi16(u, max), zero); } -static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3, +static inline void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd, int width, int height) { @@ -379,7 +379,7 @@ cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { // Returns a vector where all the (32-bit) elements are the sum of all the // lanes in a.
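// [Editorial note] A scalar reference for the broadcast-sum helper that
// follows, assuming the eight 32-bit lanes of the argument are a[0]..a[7];
// the function name sum8_epi32_ref is ours, not part of the patch.
//
//   static inline int32_t sum8_epi32_ref(const int32_t a[8]) {
//     int32_t s = 0;
//     for (int i = 0; i < 8; ++i) s += a[i];  // fill_sum_epi32() broadcasts
//     return s;                                // this value to all 8 lanes.
//   }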
-static INLINE __m256i fill_sum_epi32(__m256i a) { +static inline __m256i fill_sum_epi32(__m256i a) { // Given that a == [A, B, C, D, E, F, G, H] a = _mm256_hadd_epi32(a, a); // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H @@ -394,12 +394,12 @@ static INLINE __m256i fill_sum_epi32(__m256i a) { // a == [A''', A''', A''', A''', A''', A''', A''', A'''] } -static INLINE __m256i _mm256_addl_epi16(__m256i a) { +static inline __m256i _mm256_addl_epi16(__m256i a) { return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()), _mm256_unpackhi_epi16(a, _mm256_setzero_si256())); } -static INLINE void subtract_average_avx2(const uint16_t *src_ptr, +static inline void subtract_average_avx2(const uint16_t *src_ptr, int16_t *dst_ptr, int width, int height, int round_offset, int num_pel_log2) { diff --git a/av1/common/x86/cfl_sse2.c b/av1/common/x86/cfl_sse2.c index d5e90aba01..aff5725cbf 100644 --- a/av1/common/x86/cfl_sse2.c +++ b/av1/common/x86/cfl_sse2.c @@ -14,12 +14,12 @@ #include "av1/common/cfl.h" #include "config/av1_rtcd.h" -static INLINE __m128i fill_sum_epi32(__m128i l0) { +static inline __m128i fill_sum_epi32(__m128i l0) { l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1))); } -static INLINE void subtract_average_sse2(const uint16_t *src_ptr, +static inline void subtract_average_sse2(const uint16_t *src_ptr, int16_t *dst_ptr, int width, int height, int round_offset, int num_pel_log2) { diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c index 1339c24498..5441441beb 100644 --- a/av1/common/x86/cfl_ssse3.c +++ b/av1/common/x86/cfl_ssse3.c @@ -18,12 +18,12 @@ #include "av1/common/x86/cfl_simd.h" // Load 32-bit integer from memory into the first element of dst. -static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) { +static inline __m128i _mm_loadh_epi32(__m128i const *mem_addr) { return _mm_cvtsi32_si128(*((int *)mem_addr)); } // Store 32-bit integer from the first element of a into memory. -static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) { +static inline void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) { *((int *)mem_addr) = _mm_cvtsi128_si32(a); } @@ -37,7 +37,7 @@ static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) { * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. */ -static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, +static inline void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { @@ -92,7 +92,7 @@ static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. */ -static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, +static inline void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { @@ -132,7 +132,7 @@ static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. 
*/ -static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, +static inline void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { @@ -179,7 +179,7 @@ static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. */ -static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, +static inline void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { @@ -234,7 +234,7 @@ static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. */ -static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, +static inline void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { @@ -267,7 +267,7 @@ static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, } while (pred_buf_m128i < end); } -static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, +static inline void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { @@ -301,7 +301,7 @@ static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, CFL_GET_SUBSAMPLE_FUNCTION(ssse3) -static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, +static inline __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, __m128i alpha_sign, __m128i dc_q0) { __m128i ac_q3 = _mm_loadu_si128(input); __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); @@ -310,7 +310,7 @@ static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, return _mm_add_epi16(scaled_luma_q0, dc_q0); } -static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, +static inline void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3, int width, int height) { const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); @@ -344,17 +344,17 @@ static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, CFL_PREDICT_FN(ssse3, lbd) #if CONFIG_AV1_HIGHBITDEPTH -static INLINE __m128i highbd_max_epi16(int bd) { +static inline __m128i highbd_max_epi16(int bd) { const __m128i neg_one = _mm_set1_epi16(-1); // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one); } -static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { +static inline __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { return _mm_max_epi16(_mm_min_epi16(u, max), zero); } -static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, +static inline void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd, int width, int height) { diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c index c64f7259ab..2b5f162fb6 100644 --- a/av1/common/x86/convolve_sse2.c +++ b/av1/common/x86/convolve_sse2.c @@ -19,7 +19,7 @@ #include "aom_dsp/x86/synonyms.h" #include "av1/common/convolve.h" -static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, +static inline void prepare_coeffs(const 
InterpFilterParams *const filter_params, const int subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( @@ -36,7 +36,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 } -static INLINE __m128i convolve(const __m128i *const s, +static inline __m128i convolve(const __m128i *const s, const __m128i *const coeffs) { const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); @@ -46,7 +46,7 @@ static INLINE __m128i convolve(const __m128i *const s, return d; } -static INLINE __m128i convolve_lo_x(const __m128i *const s, +static inline __m128i convolve_lo_x(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); @@ -56,7 +56,7 @@ static INLINE __m128i convolve_lo_x(const __m128i *const s, return convolve(ss, coeffs); } -static INLINE __m128i convolve_lo_y(const __m128i *const s, +static inline __m128i convolve_lo_y(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); @@ -66,7 +66,7 @@ static INLINE __m128i convolve_lo_y(const __m128i *const s, return convolve(ss, coeffs); } -static INLINE __m128i convolve_hi_y(const __m128i *const s, +static inline __m128i convolve_hi_y(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); diff --git a/av1/common/x86/filterintra_sse4.c b/av1/common/x86/filterintra_sse4.c index bb247499d0..b54aeb6f96 100644 --- a/av1/common/x86/filterintra_sse4.c +++ b/av1/common/x86/filterintra_sse4.c @@ -28,7 +28,7 @@ // Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th // at zero to preserve the sum. -static INLINE void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride, +static inline void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride, const __m128i *pixels, const __m128i *taps_0_1, const __m128i *taps_2_3, @@ -56,7 +56,7 @@ static INLINE void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride, // 4xH transform sizes are given special treatment because xx_loadl_64 goes out // of bounds and every block involves the left column. This implementation // loads TL from the top row for the first block, so it is not -static INLINE void filter_4xh(uint8_t *dest, ptrdiff_t stride, +static inline void filter_4xh(uint8_t *dest, ptrdiff_t stride, const uint8_t *const top_ptr, const uint8_t *const left_ptr, int mode, const int height) { @@ -208,7 +208,7 @@ static INLINE void filter_4xh(uint8_t *dest, ptrdiff_t stride, } } -static INLINE void filter_intra_predictor_sse4_1(void *const dest, +static inline void filter_intra_predictor_sse4_1(void *const dest, ptrdiff_t stride, const void *const top_row, const void *const left_column, diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c index 61a0e01524..e8595b73c3 100644 --- a/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/av1/common/x86/highbd_inv_txfm_avx2.c @@ -31,7 +31,7 @@ // ... ... 
// v124, v125, v126, v127 -static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { +static inline __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { const __m256i zero = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi16(1); const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); @@ -47,7 +47,7 @@ static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { return clamped; } -static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) { +static inline void round_shift_4x4_avx2(__m256i *in, int shift) { if (shift != 0) { __m256i rnding = _mm256_set1_epi32(1 << (shift - 1)); in[0] = _mm256_add_epi32(in[0], rnding); @@ -62,7 +62,7 @@ static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) { } } -static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) { +static inline void round_shift_8x8_avx2(__m256i *in, int shift) { round_shift_4x4_avx2(in, shift); round_shift_4x4_avx2(in + 4, shift); round_shift_4x4_avx2(in + 8, shift); @@ -88,7 +88,7 @@ static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out, } } -static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, +static inline __m256i highbd_get_recon_16x8_avx2(const __m256i pred, __m256i res0, __m256i res1, const int bd) { __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred)); @@ -102,7 +102,7 @@ static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, return x0; } -static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, +static inline void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; @@ -114,7 +114,7 @@ static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, _mm256_storeu_si256((__m256i *)(output + i * stride), u); } } -static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, +static inline __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, const int bd) { __m256i x0 = pred; x0 = _mm256_add_epi32(res, x0); @@ -124,7 +124,7 @@ static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, return x0; } -static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output, +static inline void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? 
(height - 1) : 0; @@ -231,14 +231,14 @@ static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) { out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); } -static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, +static inline void load_buffer_32bit_input(const int32_t *in, int stride, __m256i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); } } -static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, +static inline __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, const __m256i *rounding, int bit) { __m256i x; x = _mm256_mullo_epi32(*w0, *n0); @@ -247,7 +247,7 @@ static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, return x; } -static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, +static inline __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, const __m256i *w1, const __m256i *n1, const __m256i *rounding, int bit) { __m256i x, y; @@ -275,7 +275,7 @@ static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, *out1 = a1; } -static INLINE void idct32_stage4_avx2( +static inline void idct32_stage4_avx2( __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24, @@ -298,7 +298,7 @@ static INLINE void idct32_stage4_avx2( bf1[22] = temp2; } -static INLINE void idct32_stage5_avx2( +static inline void idct32_stage5_avx2( __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, const __m256i *rounding, int bit) { @@ -321,7 +321,7 @@ static INLINE void idct32_stage5_avx2( addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); } -static INLINE void idct32_stage6_avx2( +static inline void idct32_stage6_avx2( __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, @@ -350,7 +350,7 @@ static INLINE void idct32_stage6_avx2( bf1[21] = temp2; } -static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, +static inline void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, const __m256i *clamp_lo, const __m256i *clamp_hi, @@ -378,7 +378,7 @@ static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); } -static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, +static inline void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, const __m256i *clamp_lo, const __m256i *clamp_hi, @@ -407,7 +407,7 @@ static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, bf1[23] = temp2; } -static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, +static inline void idct32_stage9_avx2(__m256i *bf1, __m256i *out, const int do_cols, const int bd, const int out_shift, const __m256i *clamp_lo, @@ -2818,7 +2818,7 @@ static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, out_shift); } } -static INLINE void idct64_stage8_avx2( +static inline void idct64_stage8_avx2( __m256i *u, const __m256i *cospim32, const __m256i *cospi32, const __m256i *cospim16, const __m256i *cospi48, const 
__m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, @@ -2864,7 +2864,7 @@ static INLINE void idct64_stage8_avx2( u[43] = temp4; } -static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32, +static inline void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32, const __m256i *cospi32, const __m256i *clamp_lo, const __m256i *clamp_hi, @@ -2896,7 +2896,7 @@ static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32, } } -static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32, +static inline void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32, const __m256i *cospi32, const __m256i *clamp_lo, const __m256i *clamp_hi, @@ -2933,7 +2933,7 @@ static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32, u[47] = temp4; } -static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols, +static inline void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols, int bd, int out_shift, const __m256i *clamp_lo, const __m256i *clamp_hi) { diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c index ac959420c0..dc269c7bcd 100644 --- a/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/av1/common/x86/highbd_inv_txfm_sse4.c @@ -21,7 +21,7 @@ #include "av1/common/x86/av1_txfm_sse4.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" -static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { +static inline __m128i highbd_clamp_epi16(__m128i u, int bd) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); @@ -37,7 +37,7 @@ static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { return clamped; } -static INLINE void round_shift_4x4(__m128i *in, int shift) { +static inline void round_shift_4x4(__m128i *in, int shift) { if (shift != 0) { __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); in[0] = _mm_add_epi32(in[0], rnding); @@ -78,7 +78,7 @@ static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out, } } -static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, +static inline __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, __m128i res0, __m128i res1, const int bd) { __m128i x0 = _mm_cvtepi16_epi32(pred); @@ -95,7 +95,7 @@ static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, return x0; } -static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, +static inline __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, __m128i res0, const int bd) { __m128i x0 = _mm_cvtepi16_epi32(pred); @@ -105,7 +105,7 @@ static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, return x0; } -static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, +static inline void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; @@ -118,7 +118,7 @@ static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, } } -static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, +static inline void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? 
(height - 1) : 0; @@ -131,14 +131,14 @@ static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, } } -static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, +static inline void load_buffer_32bit_input(const int32_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride)); } } -static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { +static inline void load_buffer_4x4(const int32_t *coeff, __m128i *in) { in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); @@ -248,7 +248,7 @@ static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1, *in1 = in1_w_offset; } -static INLINE void idct32_stage4_sse4_1( +static inline void idct32_stage4_sse4_1( __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56, const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40, const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24, @@ -275,7 +275,7 @@ static INLINE void idct32_stage4_sse4_1( bf1[22] = temp2; } -static INLINE void idct32_stage5_sse4_1( +static inline void idct32_stage5_sse4_1( __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, const __m128i *rounding, int bit) { @@ -300,7 +300,7 @@ static INLINE void idct32_stage5_sse4_1( addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); } -static INLINE void idct32_stage6_sse4_1( +static inline void idct32_stage6_sse4_1( __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, @@ -335,7 +335,7 @@ static INLINE void idct32_stage6_sse4_1( bf1[21] = temp2; } -static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, +static inline void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, const __m128i *clamp_lo, const __m128i *clamp_hi, @@ -365,7 +365,7 @@ static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); } -static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, +static inline void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, const __m128i *clamp_lo, const __m128i *clamp_hi, @@ -398,7 +398,7 @@ static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, bf1[23] = temp2; } -static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, +static inline void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, const int do_cols, const int bd, const int out_shift, const __m128i *clamp_lo, @@ -3153,7 +3153,7 @@ static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16); } } -static INLINE void idct64_stage8_sse4_1( +static inline void idct64_stage8_sse4_1( __m128i *u, const __m128i *cospim32, const __m128i *cospi32, const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, @@ -3200,7 +3200,7 @@ static INLINE void idct64_stage8_sse4_1( u[43] = temp4; } -static INLINE void idct64_stage9_sse4_1(__m128i 
*u, const __m128i *cospim32, +static inline void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, const __m128i *cospi32, const __m128i *clamp_lo, const __m128i *clamp_hi, @@ -3232,7 +3232,7 @@ static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, } } -static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, +static inline void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, const __m128i *cospi32, const __m128i *clamp_lo, const __m128i *clamp_hi, @@ -3269,7 +3269,7 @@ static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, u[47] = temp4; } -static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, +static inline void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, int bd, int out_shift, const __m128i *clamp_lo, const __m128i *clamp_hi) { diff --git a/av1/common/x86/highbd_txfm_utility_sse4.h b/av1/common/x86/highbd_txfm_utility_sse4.h index 54a35a9f62..20b947c3a5 100644 --- a/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/av1/common/x86/highbd_txfm_utility_sse4.h @@ -27,7 +27,7 @@ y3 = _mm_unpackhi_epi64(u1, u3); \ } while (0) -static INLINE void transpose_8x8(const __m128i *in, __m128i *out) { +static inline void transpose_8x8(const __m128i *in, __m128i *out) { TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]); TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]); TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]); @@ -35,7 +35,7 @@ static INLINE void transpose_8x8(const __m128i *in, __m128i *out) { out[15]); } -static INLINE void transpose_16x16(const __m128i *in, __m128i *out) { +static inline void transpose_16x16(const __m128i *in, __m128i *out) { // Upper left 8x8 TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]); TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24], @@ -75,7 +75,7 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) { out[63]); } -static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output, +static inline void transpose_8nx8n(const __m128i *input, __m128i *output, const int width, const int height) { const int numcol = height >> 2; const int numrow = width >> 2; @@ -95,7 +95,7 @@ static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output, // Note: // rounding = 1 << (bit - 1) -static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, +static inline __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, const __m128i *w1, const __m128i *n1, const __m128i *rounding, int bit) { __m128i x, y; @@ -108,7 +108,7 @@ static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, return x; } -static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, +static inline __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, const __m128i *rounding, int bit) { __m128i x; diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c index 5f7bf04675..0d6fb6136c 100644 --- a/av1/common/x86/highbd_warp_plane_sse4.c +++ b/av1/common/x86/highbd_warp_plane_sse4.c @@ -32,7 +32,7 @@ static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 }; -static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, +static inline void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, __m128i *coeff) { // Filter even-index pixels const __m128i 
tmp_0 = @@ -91,7 +91,7 @@ static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); } -static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0( +static inline void highbd_prepare_horizontal_filter_coeff_alpha0( int sx, __m128i *coeff) { // Filter coeff const __m128i tmp_0 = _mm_loadu_si128( @@ -112,7 +112,7 @@ static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0( coeff[7] = coeff[6]; } -static INLINE void highbd_filter_src_pixels( +static inline void highbd_filter_src_pixels( const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff, const int offset_bits_horiz, const int reduce_bits_horiz, int k) { const __m128i src_1 = *src; @@ -154,7 +154,7 @@ static INLINE void highbd_filter_src_pixels( tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); } -static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2, +static inline void highbd_horiz_filter(const __m128i *src, const __m128i *src2, __m128i *tmp, int sx, int alpha, int k, const int offset_bits_horiz, const int reduce_bits_horiz) { @@ -164,7 +164,7 @@ static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2, reduce_bits_horiz, k); } -static INLINE void highbd_warp_horizontal_filter_alpha0_beta0( +static inline void highbd_warp_horizontal_filter_alpha0_beta0( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { @@ -192,7 +192,7 @@ static INLINE void highbd_warp_horizontal_filter_alpha0_beta0( } } -static INLINE void highbd_warp_horizontal_filter_alpha0( +static inline void highbd_warp_horizontal_filter_alpha0( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { @@ -219,7 +219,7 @@ static INLINE void highbd_warp_horizontal_filter_alpha0( } } -static INLINE void highbd_warp_horizontal_filter_beta0( +static inline void highbd_warp_horizontal_filter_beta0( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { @@ -245,7 +245,7 @@ static INLINE void highbd_warp_horizontal_filter_beta0( } } -static INLINE void highbd_warp_horizontal_filter( +static inline void highbd_warp_horizontal_filter( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { @@ -269,7 +269,7 @@ static INLINE void highbd_warp_horizontal_filter( } } -static INLINE void highbd_prepare_warp_horizontal_filter( +static inline void highbd_prepare_warp_horizontal_filter( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c index 50df3371fc..cd87992ae7 100644 --- a/av1/common/x86/jnt_convolve_avx2.c +++ b/av1/common/x86/jnt_convolve_avx2.c @@ -24,7 +24,7 @@ #include "av1/common/convolve.h" -static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { +static inline __m256i unpack_weights_avx2(ConvolveParams *conv_params) { const int w0 = 
conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); @@ -33,7 +33,7 @@ static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { return wt; } -static INLINE __m256i load_line2_avx2(const void *a, const void *b) { +static inline __m256i load_line2_avx2(const void *a, const void *b) { return _mm256_permute2x128_si256( _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); diff --git a/av1/common/x86/reconinter_avx2.c b/av1/common/x86/reconinter_avx2.c index 9c2ee80bf2..d93b6dd9fa 100644 --- a/av1/common/x86/reconinter_avx2.c +++ b/av1/common/x86/reconinter_avx2.c @@ -19,7 +19,7 @@ #include "aom_dsp/x86/synonyms_avx2.h" #include "av1/common/blockd.h" -static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, +static inline __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, const __m256i s1) { const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)); return _mm256_abs_epi16( @@ -135,7 +135,7 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, } } -static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, +static inline __m256i calc_mask_d16_avx2(const __m256i *data_src0, const __m256i *data_src1, const __m256i *round_const, const __m256i *mask_base_16, @@ -151,7 +151,7 @@ static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, return diff_clamp; } -static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, +static inline __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, const __m256i *data_src1, const __m256i *round_const, const __m256i *mask_base_16, @@ -169,7 +169,7 @@ static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, return diff_const_16; } -static INLINE void build_compound_diffwtd_mask_d16_avx2( +static inline void build_compound_diffwtd_mask_d16_avx2( uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { const int mask_base = 38; @@ -330,7 +330,7 @@ static INLINE void build_compound_diffwtd_mask_d16_avx2( } } -static INLINE void build_compound_diffwtd_mask_d16_inv_avx2( +static inline void build_compound_diffwtd_mask_d16_inv_avx2( uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { const int mask_base = 38; diff --git a/av1/common/x86/reconinter_sse4.c b/av1/common/x86/reconinter_sse4.c index f343064a7c..b79b77e1fd 100644 --- a/av1/common/x86/reconinter_sse4.c +++ b/av1/common/x86/reconinter_sse4.c @@ -17,7 +17,7 @@ #include "av1/common/blockd.h" #include "config/av1_rtcd.h" -static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0, +static inline __m128i calc_mask(const __m128i mask_base, const __m128i s0, const __m128i s1) { const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1)); return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4))); diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c index c7d4feff01..2eaa2f8384 100644 --- a/av1/common/x86/resize_avx2.c +++ b/av1/common/x86/resize_avx2.c @@ -211,7 +211,7 @@ _mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2], \ _mm_unpackhi_epi32(low_128, high_128)); -static INLINE void resize_convolve(const __m256i *const s, +static inline void resize_convolve(const __m256i *const s, const __m256i *const coeffs, __m256i *res_out) { const __m256i res_0 = _mm256_maddubs_epi16(s[0], 
coeffs[0]); @@ -234,7 +234,7 @@ static INLINE void resize_convolve(const __m256i *const s, res_out[1] = _mm256_add_epi32(dst_01, dst_11); } -static INLINE void prepare_filter_coeffs(const int16_t *filter, +static inline void prepare_filter_coeffs(const int16_t *filter, __m256i *const coeffs /* [4] */) { // f0 f1 f2 f3 x x x x const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c index 81fe0f6ab0..77775896be 100644 --- a/av1/common/x86/resize_sse2.c +++ b/av1/common/x86/resize_sse2.c @@ -69,7 +69,7 @@ l5 = l7; \ data += 2 * stride; -static INLINE void prepare_filter_coeffs(const int16_t *filter, +static inline void prepare_filter_coeffs(const int16_t *filter, __m128i *const coeffs /* [2] */) { // f0 f1 f2 f3 x x x x const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); @@ -168,7 +168,7 @@ bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, } // Blends a and b using mask and returns the result. -static INLINE __m128i blend(__m128i a, __m128i b, __m128i mask) { +static inline __m128i blend(__m128i a, __m128i b, __m128i mask) { const __m128i masked_b = _mm_and_si128(mask, b); const __m128i masked_a = _mm_andnot_si128(mask, a); return (_mm_or_si128(masked_a, masked_b)); diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c index d23d3dc89d..e8dfa4ad90 100644 --- a/av1/common/x86/resize_ssse3.c +++ b/av1/common/x86/resize_ssse3.c @@ -20,7 +20,7 @@ #include "aom_dsp/x86/transpose_sse2.h" #include "av1/common/resize.h" -static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( +static inline __m128i scale_plane_2_to_1_phase_0_kernel( const uint8_t *const src, const __m128i *const mask) { const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); @@ -29,7 +29,7 @@ static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( return _mm_packus_epi16(a_and, b_and); } -static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter, +static inline void shuffle_filter_odd_ssse3(const int16_t *const filter, __m128i *const f) { const __m128i f_values = _mm_load_si128((const __m128i *)filter); // pack and duplicate the filter values @@ -43,7 +43,7 @@ static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter, f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); } -static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, +static inline __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, const __m128i *const f) { // multiply 2 adjacent elements with the filter and add the result const __m128i k_64 = _mm_set1_epi16(1 << 6); @@ -64,7 +64,7 @@ static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, return temp; } -static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, +static inline __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, const __m128i *const f) { // multiply 2 adjacent elements with the filter and add the result const __m128i k_64 = _mm_set1_epi16(1 << 6); @@ -135,7 +135,7 @@ static void scale_plane_4_to_1_phase_0(const uint8_t *src, } while (--y); } -static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s, +static inline __m128i scale_plane_bilinear_kernel(const __m128i *const s, const __m128i c0c1) { const __m128i k_64 = _mm_set1_epi16(1 << 6); const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); @@ -696,7 +696,7 @@ static void scale_plane_4_to_3_general(const uint8_t 
*src, const int src_stride, } while (x); } -static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, +static inline __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, const __m128i *const f) { __m128i ss[4], temp; @@ -811,7 +811,7 @@ static void scale_plane_1_to_2_phase_0(const uint8_t *src, // There are SIMD optimizations for 1/4, 1/2 and 3/4 downscaling and 2x upscaling // in SSSE3. -static INLINE bool has_normative_scaler_ssse3(const int src_width, +static inline bool has_normative_scaler_ssse3(const int src_width, const int src_height, const int dst_width, const int dst_height) { diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c index 4d910c7022..4eacd01c24 100644 --- a/av1/common/x86/selfguided_avx2.c +++ b/av1/common/x86/selfguided_avx2.c @@ -177,7 +177,7 @@ static void integral_images_highbd(const uint16_t *src, int src_stride, // Compute 8 values of boxsum from the given integral image. ii should point // at the middle of the box (for the first value). r is the box radius. -static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { +static inline __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride); const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride); const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride); @@ -299,7 +299,7 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, // cross_sum = 4 * fours + 3 * threes // = 4 * (fours + threes) - threes // = (fours + threes) << 2 - threes -static INLINE __m256i cross_sum(const int32_t *buf, int stride) { +static inline __m256i cross_sum(const int32_t *buf, int stride) { const __m256i xtl = yy_loadu_256(buf - 1 - stride); const __m256i xt = yy_loadu_256(buf - stride); const __m256i xtr = yy_loadu_256(buf + 1 - stride); @@ -437,7 +437,7 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, // cross_sum = 6 * sixes + 5 * fives // = 5 * (fives + sixes) + sixes // = (fives + sixes) << 2 + (fives + sixes) + sixes -static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { +static inline __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { const __m256i xtl = yy_loadu_256(buf - 1 - stride); const __m256i xt = yy_loadu_256(buf - stride); const __m256i xtr = yy_loadu_256(buf + 1 - stride); @@ -471,7 +471,7 @@ static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { // cross_sum = 5 * fives + 6 * sixes // = 4 * (fives + sixes) + (fives + sixes) + sixes // = (fives + sixes) << 2 + (fives + sixes) + sixes -static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) { +static inline __m256i cross_sum_fast_odd_row(const int32_t *buf) { const __m256i xl = yy_loadu_256(buf - 1); const __m256i x = yy_loadu_256(buf); const __m256i xr = yy_loadu_256(buf + 1); diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c index 2f88ca56a9..65ff8b41a9 100644 --- a/av1/common/x86/selfguided_sse4.c +++ b/av1/common/x86/selfguided_sse4.c @@ -130,7 +130,7 @@ static void integral_images_highbd(const uint16_t *src, int src_stride, // Compute 4 values of boxsum from the given integral image. ii should point // at the middle of the box (for the first value). r is the box radius.
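The boxsum_from_ii helpers being renamed here are vectorized forms of the standard integral-image inclusion-exclusion. As a reference for the four corner loads in the hunks above and below, a scalar sketch of the same arithmetic (an illustrative, hypothetical helper, not part of the patch):

#include <stdint.h>

// Scalar model of boxsum_from_ii: given an integral image ii (each entry
// holds the sum of all samples above and to the left), the sum of the
// (2r+1)x(2r+1) box centred on the current sample is recovered from four
// corner reads: (br + tl) - (bl + tr).
static int32_t boxsum_from_ii_scalar(const int32_t *ii, int stride, int r) {
  const int32_t tl = ii[-(r + 1) - (r + 1) * stride];
  const int32_t tr = ii[(r + 0) - (r + 1) * stride];
  const int32_t bl = ii[-(r + 1) + r * stride];
  const int32_t br = ii[(r + 0) + r * stride];
  return (br + tl) - (bl + tr);
}

The AVX2 and SSE4.1 versions perform the same four loads per call, producing 8 and 4 adjacent boxsums respectively.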
-static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) { +static inline __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) { const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride); const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride); const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride); @@ -256,7 +256,7 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, // cross_sum = 4 * fours + 3 * threes // = 4 * (fours + threes) - threes // = (fours + threes) << 2 - threes -static INLINE __m128i cross_sum(const int32_t *buf, int stride) { +static inline __m128i cross_sum(const int32_t *buf, int stride) { const __m128i xtl = xx_loadu_128(buf - 1 - stride); const __m128i xt = xx_loadu_128(buf - stride); const __m128i xtr = xx_loadu_128(buf + 1 - stride); @@ -398,7 +398,7 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, // cross_sum = 6 * sixes + 5 * fives // = 5 * (fives + sixes) + sixes // = (fives + sixes) << 2 + (fives + sixes) + sixes -static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) { +static inline __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) { const __m128i xtl = xx_loadu_128(buf - 1 - stride); const __m128i xt = xx_loadu_128(buf - stride); const __m128i xtr = xx_loadu_128(buf + 1 - stride); @@ -431,7 +431,7 @@ static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) { // cross_sum = 5 * fives + 6 * sixes // = 4 * (fives + sixes) + (fives + sixes) + sixes // = (fives + sixes) << 2 + (fives + sixes) + sixes -static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) { +static inline __m128i cross_sum_fast_odd_row(const int32_t *buf) { const __m128i xl = xx_loadu_128(buf - 1); const __m128i x = xx_loadu_128(buf); const __m128i xr = xx_loadu_128(buf + 1); diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c index d14e175968..a78093971e 100644 --- a/av1/common/x86/warp_plane_avx2.c +++ b/av1/common/x86/warp_plane_avx2.c @@ -74,7 +74,7 @@ DECLARE_ALIGNED(32, static const uint8_t, 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 }; -static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, +static inline void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, __m256i *coeff, const __m256i *shuffle_src, const __m256i *round_const, @@ -96,7 +96,7 @@ static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, horz_out[row] = _mm256_srl_epi16(res, *shift); } -static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta, +static inline void prepare_horizontal_filter_coeff_avx2(int alpha, int beta, int sx, __m256i *coeff) { __m128i tmp_0 = _mm_loadl_epi64( @@ -191,7 +191,7 @@ static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta, coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); } -static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, +static inline void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, __m256i *coeff) { __m128i tmp_0 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); @@ -231,7 +231,7 @@ static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); } -static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, +static inline void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, __m256i *coeff)
{ const __m128i tmp_0 = _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); @@ -251,7 +251,7 @@ static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2)); } -static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, +static inline void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, int sx, int alpha, int beta, int row, const __m256i *shuffle_src, const __m256i *round_const, @@ -261,7 +261,7 @@ static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift, row); } -static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, +static inline void prepare_horizontal_filter_coeff(int alpha, int sx, __m256i *coeff) { const __m128i tmp_0 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); @@ -296,7 +296,7 @@ static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15)); } -static INLINE void warp_horizontal_filter_avx2( +static inline void warp_horizontal_filter_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, @@ -329,7 +329,7 @@ static INLINE void warp_horizontal_filter_avx2( shift, row); } -static INLINE void warp_horizontal_filter_alpha0_avx2( +static inline void warp_horizontal_filter_alpha0_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, @@ -364,7 +364,7 @@ static INLINE void warp_horizontal_filter_alpha0_avx2( shift, row); } -static INLINE void warp_horizontal_filter_beta0_avx2( +static inline void warp_horizontal_filter_beta0_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, @@ -396,7 +396,7 @@ static INLINE void warp_horizontal_filter_beta0_avx2( shift, row); } -static INLINE void warp_horizontal_filter_alpha0_beta0_avx2( +static inline void warp_horizontal_filter_alpha0_beta0_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, @@ -428,7 +428,7 @@ static INLINE void warp_horizontal_filter_alpha0_beta0_avx2( shift, row); } -static INLINE void unpack_weights_and_set_round_const_avx2( +static inline void unpack_weights_and_set_round_const_avx2( ConvolveParams *conv_params, const int round_bits, const int offset_bits, __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) { *res_sub_const = @@ -443,7 +443,7 @@ static INLINE void unpack_weights_and_set_round_const_avx2( *wt = _mm256_unpacklo_epi16(wt0, wt1); } -static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta, +static inline void prepare_vertical_filter_coeffs_avx2(int gamma, int delta, int sy, __m256i *coeffs) { __m128i filt_00 = @@ -537,7 +537,7 @@ static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta, coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); } -static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy, 
+static inline void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy, __m256i *coeffs) { __m128i filt_00 = _mm_loadu_si128((__m128i *)(av1_warped_filter + @@ -596,7 +596,7 @@ static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy, coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); } -static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy, +static inline void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy, __m256i *coeffs) { const __m128i filt_0 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); @@ -621,7 +621,7 @@ static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy, coeffs[7] = coeffs[3]; } -static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out, +static inline void filter_src_pixels_vertical_avx2(__m256i *horz_out, __m256i *src, __m256i *coeffs, __m256i *res_lo, @@ -655,7 +655,7 @@ static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out, *res_hi = _mm256_unpackhi_epi32(res_even, res_odd); } -static INLINE void store_vertical_filter_output_avx2( +static inline void store_vertical_filter_output_avx2( const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const, const __m256i *wt, const __m256i *res_sub_const, const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params, @@ -776,7 +776,7 @@ static INLINE void store_vertical_filter_output_avx2( } } -static INLINE void warp_vertical_filter_avx2( +static inline void warp_vertical_filter_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, @@ -825,7 +825,7 @@ static INLINE void warp_vertical_filter_avx2( } } -static INLINE void warp_vertical_filter_gamma0_avx2( +static inline void warp_vertical_filter_gamma0_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, @@ -874,7 +874,7 @@ static INLINE void warp_vertical_filter_gamma0_avx2( } } -static INLINE void warp_vertical_filter_delta0_avx2( +static inline void warp_vertical_filter_delta0_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, @@ -922,7 +922,7 @@ static INLINE void warp_vertical_filter_delta0_avx2( } } -static INLINE void warp_vertical_filter_gamma0_delta0_avx2( +static inline void warp_vertical_filter_gamma0_delta0_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, @@ -970,7 +970,7 @@ static INLINE void warp_vertical_filter_gamma0_delta0_avx2( } } -static INLINE void prepare_warp_vertical_filter_avx2( +static inline void prepare_warp_vertical_filter_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, @@ -999,7 +999,7 @@ static INLINE void prepare_warp_vertical_filter_avx2( res_sub_const, round_bits_const, wt); } -static INLINE void prepare_warp_horizontal_filter_avx2( +static inline void prepare_warp_horizontal_filter_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int 
alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c index cc7cdc416c..eb02683ecd 100644 --- a/av1/common/x86/warp_plane_sse4.c +++ b/av1/common/x86/warp_plane_sse4.c @@ -181,7 +181,7 @@ DECLARE_ALIGNED(16, static const uint8_t, shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 }; -static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, +static inline void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, const int offset_bits_horiz, const int reduce_bits_horiz, int k) { const __m128i src_even = @@ -223,7 +223,7 @@ static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); } -static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, +static inline void prepare_horizontal_filter_coeff(int alpha, int sx, __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadl_epi64( @@ -271,7 +271,7 @@ static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15); } -static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, +static inline void prepare_horizontal_filter_coeff_alpha0(int sx, __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = @@ -291,7 +291,7 @@ static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67)); } -static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, +static inline void horizontal_filter(__m128i src, __m128i *tmp, int sx, int alpha, int k, const int offset_bits_horiz, const int reduce_bits_horiz) { @@ -300,7 +300,7 @@ static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); } -static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, +static inline void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, @@ -323,7 +323,7 @@ static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, } } -static INLINE void warp_horizontal_filter_alpha0( +static inline void warp_horizontal_filter_alpha0( const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { @@ -347,7 +347,7 @@ static INLINE void warp_horizontal_filter_alpha0( } } -static INLINE void warp_horizontal_filter_beta0( +static inline void warp_horizontal_filter_beta0( const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { @@ -370,7 +370,7 @@ static INLINE void warp_horizontal_filter_beta0( } } -static INLINE void warp_horizontal_filter_alpha0_beta0( +static inline void warp_horizontal_filter_alpha0_beta0( const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { @@ -395,7 +395,7 @@ static INLINE void warp_horizontal_filter_alpha0_beta0( } } -static INLINE void 
unpack_weights_and_set_round_const( +static inline void unpack_weights_and_set_round_const( ConvolveParams *conv_params, const int round_bits, const int offset_bits, __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { *res_sub_const = @@ -410,7 +410,7 @@ static INLINE void unpack_weights_and_set_round_const( *wt = _mm_unpacklo_epi16(wt0, wt1); } -static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, +static inline void prepare_vertical_filter_coeffs(int gamma, int sy, __m128i *coeffs) { const __m128i tmp_0 = _mm_loadu_si128((__m128i *)(av1_warped_filter + @@ -461,7 +461,7 @@ static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); } -static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, +static inline void prepare_vertical_filter_coeffs_gamma0(int sy, __m128i *coeffs) { const __m128i tmp_0 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); @@ -483,7 +483,7 @@ static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, coeffs[7] = coeffs[3]; } -static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, +static inline void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, __m128i *res_lo, __m128i *res_hi, int k) { // Load from tmp and rearrange pairs of consecutive rows into the @@ -521,7 +521,7 @@ static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, *res_hi = _mm_unpackhi_epi32(res_even, res_odd); } -static INLINE void store_vertical_filter_output( +static inline void store_vertical_filter_output( __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const, const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, @@ -617,7 +617,7 @@ static INLINE void store_vertical_filter_output( } } -static INLINE void warp_vertical_filter( +static inline void warp_vertical_filter( uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, @@ -644,7 +644,7 @@ static INLINE void warp_vertical_filter( } } -static INLINE void warp_vertical_filter_gamma0( +static inline void warp_vertical_filter_gamma0( uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, @@ -672,7 +672,7 @@ static INLINE void warp_vertical_filter_gamma0( } } -static INLINE void warp_vertical_filter_delta0( +static inline void warp_vertical_filter_delta0( uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, @@ -698,7 +698,7 @@ static INLINE void warp_vertical_filter_delta0( } } -static INLINE void warp_vertical_filter_gamma0_delta0( +static inline void warp_vertical_filter_gamma0_delta0( uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, @@ -725,7 +725,7 @@ static INLINE void warp_vertical_filter_gamma0_delta0( } } -static INLINE void prepare_warp_vertical_filter( +static inline void prepare_warp_vertical_filter( uint8_t *pred, __m128i *tmp, ConvolveParams 
*conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, @@ -748,7 +748,7 @@ static INLINE void prepare_warp_vertical_filter( res_add_const, round_bits, offset_bits); } -static INLINE void prepare_warp_horizontal_filter( +static inline void prepare_warp_horizontal_filter( const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index 759a95c21d..95f6543666 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -481,7 +481,7 @@ static AOM_INLINE void build_mc_border(const uint8_t *src, int src_stride, } while (--b_h); } -static INLINE int update_extend_mc_border_params( +static inline int update_extend_mc_border_params( const struct scale_factors *const sf, struct buf_2d *const pre_buf, MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv, int do_warp, int is_intrabc, int *x_pad, int *y_pad) { @@ -516,7 +516,7 @@ static INLINE int update_extend_mc_border_params( return 0; } -static INLINE void extend_mc_border(const struct scale_factors *const sf, +static inline void extend_mc_border(const struct scale_factors *const sf, struct buf_2d *const pre_buf, MV32 scaled_mv, PadBlock block, int subpel_x_mv, int subpel_y_mv, @@ -696,7 +696,7 @@ static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm, } } -static INLINE void dec_build_prediction_by_above_pred( +static inline void dec_build_prediction_by_above_pred( MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; @@ -752,7 +752,7 @@ static AOM_INLINE void dec_build_prediction_by_above_preds( xd->mb_to_bottom_edge -= GET_MV_SUBPEL(this_height - pred_height); } -static INLINE void dec_build_prediction_by_left_pred( +static inline void dec_build_prediction_by_left_pred( MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; @@ -1774,7 +1774,7 @@ static AOM_INLINE void setup_cdef(AV1_COMMON *cm, } } -static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) { +static inline int read_delta_q(struct aom_read_bit_buffer *rb) { return aom_rb_read_bit(rb) ? aom_rb_read_inv_signed_literal(rb, 6) : 0; } @@ -1998,7 +1998,7 @@ static AOM_INLINE void setup_sb_size(SequenceHeader *seq_params, set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64); } -static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth, +static inline int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth, int ref_xss, int ref_yss, aom_bit_depth_t this_bit_depth, int this_xss, int this_yss) { @@ -2481,7 +2481,7 @@ static AOM_INLINE void decoder_alloc_tile_data(AV1Decoder *pbi, } // Set up nsync by width. -static INLINE int get_sync_range(int width) { +static inline int get_sync_range(int width) { // nsync numbers are picked by testing. 
#if 0 if (width < 640) @@ -2557,7 +2557,7 @@ void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) { } } -static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r, +static inline void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r, int c) { #if CONFIG_MULTITHREAD const int nsync = dec_row_mt_sync->sync_range; @@ -2579,7 +2579,7 @@ static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r, #endif // CONFIG_MULTITHREAD } -static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r, +static inline void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r, int c, const int sb_cols) { #if CONFIG_MULTITHREAD const int nsync = dec_row_mt_sync->sync_range; @@ -2609,7 +2609,7 @@ static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r, #endif // CONFIG_MULTITHREAD } -static INLINE void signal_decoding_done_for_erroneous_row( +static inline void signal_decoding_done_for_erroneous_row( AV1Decoder *const pbi, const MACROBLOCKD *const xd) { AV1_COMMON *const cm = &pbi->common; const TileInfo *const tile = &xd->tile; @@ -3001,7 +3001,7 @@ static int tile_worker_hook(void *arg1, void *arg2) { return !td->dcb.corrupted; } -static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm, +static inline int get_max_row_mt_workers_per_tile(AV1_COMMON *cm, const TileInfo *tile) { // NOTE: Currently value of max workers is calculated based // on the parse and decode time. As per the theoretical estimate @@ -3129,7 +3129,7 @@ static int get_next_job_info(AV1Decoder *const pbi, return 1; } -static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi, +static inline void signal_parse_sb_row_done(AV1Decoder *const pbi, TileDataDec *const tile_data, const int sb_mi_size) { AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; @@ -4458,7 +4458,7 @@ static AOM_INLINE void show_existing_frame_reset(AV1Decoder *const pbi, cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; } -static INLINE void reset_frame_buffers(AV1_COMMON *cm) { +static inline void reset_frame_buffers(AV1_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c index 41f07566a5..7d8971d1ce 100644 --- a/av1/decoder/decodemv.c +++ b/av1/decoder/decodemv.c @@ -668,12 +668,12 @@ void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, } } -static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, +static inline void read_mv(aom_reader *r, MV *mv, const MV *ref, nmv_context *ctx, MvSubpelPrecision precision); -static INLINE int is_mv_valid(const MV *mv); +static inline int is_mv_valid(const MV *mv); -static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, +static inline int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, const int_mv *ref_mv, int mi_row, int mi_col, BLOCK_SIZE bsize, aom_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; @@ -882,7 +882,7 @@ static int read_mv_component(aom_reader *r, nmv_component *mvcomp, return sign ? 
-mag : mag; } -static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, +static inline void read_mv(aom_reader *r, MV *mv, const MV *ref, nmv_context *ctx, MvSubpelPrecision precision) { MV diff = kZeroMv; const MV_JOINT_TYPE joint_type = @@ -1030,7 +1030,7 @@ static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd, } } -static INLINE void read_mb_interp_filter(const MACROBLOCKD *const xd, +static inline void read_mb_interp_filter(const MACROBLOCKD *const xd, InterpFilter interp_filter, bool enable_dual_filter, MB_MODE_INFO *const mbmi, @@ -1105,12 +1105,12 @@ static void read_intra_block_mode_info(AV1_COMMON *const cm, read_filter_intra_mode_info(cm, xd, r); } -static INLINE int is_mv_valid(const MV *mv) { +static inline int is_mv_valid(const MV *mv) { return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW && mv->col < MV_UPP; } -static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd, +static inline int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd, PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2], int_mv ref_mv[2], int_mv nearest_mv[2], diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h index a584753223..d6610cd03e 100644 --- a/av1/decoder/decoder.h +++ b/av1/decoder/decoder.h @@ -401,7 +401,7 @@ void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync); void av1_dec_free_cb_buf(AV1Decoder *pbi); -static INLINE void decrease_ref_count(RefCntBuffer *const buf, +static inline void decrease_ref_count(RefCntBuffer *const buf, BufferPool *const pool) { if (buf != NULL) { --buf->ref_count; @@ -422,7 +422,7 @@ static INLINE void decrease_ref_count(RefCntBuffer *const buf, } #define ACCT_STR __func__ -static INLINE int av1_read_uniform(aom_reader *r, int n) { +static inline int av1_read_uniform(aom_reader *r, int n) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; const int v = aom_read_literal(r, l - 1, ACCT_STR); diff --git a/av1/decoder/decodetxb.c b/av1/decoder/decodetxb.c index fbaeb8cc6e..465af7e06b 100644 --- a/av1/decoder/decodetxb.c +++ b/av1/decoder/decodetxb.c @@ -42,7 +42,7 @@ static int read_golomb(MACROBLOCKD *xd, aom_reader *r) { return x - 1; } -static INLINE int rec_eob_pos(const int eob_token, const int extra) { +static inline int rec_eob_pos(const int eob_token, const int extra) { int eob = av1_eob_group_start[eob_token]; if (eob > 2) { eob += extra; @@ -50,7 +50,7 @@ static INLINE int rec_eob_pos(const int eob_token, const int extra) { return eob; } -static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, +static inline int get_dqv(const int16_t *dequant, int coeff_idx, const qm_val_t *iqmatrix) { int dqv = dequant[!!coeff_idx]; if (iqmatrix != NULL) @@ -59,7 +59,7 @@ static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, return dqv; } -static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size, +static inline void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size, int start_si, int end_si, const int16_t *scan, int bhl, uint8_t *levels, @@ -83,7 +83,7 @@ static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size, } } -static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size, +static inline void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size, TX_CLASS tx_class, int start_si, int end_si, const int16_t *scan, int bhl, uint8_t *levels, base_cdf_arr base_cdf, diff --git a/av1/decoder/grain_synthesis.c b/av1/decoder/grain_synthesis.c index e18bdfb82f..49fc04d7e4 100644 --- a/av1/decoder/grain_synthesis.c +++ 
b/av1/decoder/grain_synthesis.c @@ -430,7 +430,7 @@ static bool init_arrays(const aom_film_grain_t *params, int luma_stride, } // get a number between 0 and 2^bits - 1 -static INLINE int get_random_number(int bits) { +static inline int get_random_number(int bits) { uint16_t bit; bit = ((random_register >> 0) ^ (random_register >> 1) ^ (random_register >> 3) ^ (random_register >> 12)) & diff --git a/av1/encoder/aq_cyclicrefresh.h b/av1/encoder/aq_cyclicrefresh.h index e2e24fab30..245af6ccc6 100644 --- a/av1/encoder/aq_cyclicrefresh.h +++ b/av1/encoder/aq_cyclicrefresh.h @@ -311,12 +311,12 @@ void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi); int av1_cyclic_refresh_disable_lf_cdef(struct AV1_COMP *const cpi); -static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) { +static inline int cyclic_refresh_segment_id_boosted(int segment_id) { return segment_id == CR_SEGMENT_ID_BOOST1 || segment_id == CR_SEGMENT_ID_BOOST2; } -static INLINE int cyclic_refresh_segment_id(int segment_id) { +static inline int cyclic_refresh_segment_id(int segment_id) { if (segment_id == CR_SEGMENT_ID_BOOST1) return CR_SEGMENT_ID_BOOST1; else if (segment_id == CR_SEGMENT_ID_BOOST2) diff --git a/av1/encoder/arm/av1_highbd_quantize_neon.c b/av1/encoder/arm/av1_highbd_quantize_neon.c index 6710f84023..c1016db5b8 100644 --- a/av1/encoder/arm/av1_highbd_quantize_neon.c +++ b/av1/encoder/arm/av1_highbd_quantize_neon.c @@ -18,7 +18,7 @@ #include "av1/common/quant_common.h" #include "av1/encoder/av1_quantize.h" -static INLINE uint16x4_t quantize_4(const tran_low_t *coeff_ptr, +static inline uint16x4_t quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, @@ -57,7 +57,7 @@ static INLINE uint16x4_t quantize_4(const tran_low_t *coeff_ptr, return vmovn_u32(nz_qcoeff_mask); } -static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, +static inline int16x8_t get_max_lane_eob(const int16_t *iscan, int16x8_t v_eobmax, uint16x8_t v_mask) { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); @@ -66,7 +66,7 @@ static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, return vmaxq_s16(v_eobmax, v_nz_iscan); } -static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +static inline uint16_t get_max_eob(int16x8_t v_eobmax) { #if AOM_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else diff --git a/av1/encoder/arm/av1_temporal_denoiser_neon.c b/av1/encoder/arm/av1_temporal_denoiser_neon.c index a3f60a4442..dbe9f9e4ef 100644 --- a/av1/encoder/arm/av1_temporal_denoiser_neon.c +++ b/av1/encoder/arm/av1_temporal_denoiser_neon.c @@ -23,7 +23,7 @@ #include "av1/encoder/av1_temporal_denoiser.h" // Compute the sum of all pixel differences of this MB. -static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { +static inline int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { #if AOM_ARCH_AARCH64 return vaddlvq_s8(v_sum_diff_total); #else @@ -38,7 +38,7 @@ static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { } // Denoise a 16x1 vector. 
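The #else branch of horizontal_add_s8x16 is elided by the hunk context above. A plausible Armv7 fallback (an assumption about the shape of the elided code, not a quote of it) widens the 16 int8 lanes with pairwise additions until the total fits in two 64-bit lanes:

#include <arm_neon.h>

// Hypothetical non-AArch64 reduction: each vpaddlq_* halves the lane count
// while doubling the lane width, so no intermediate sum can overflow.
static inline int horizontal_add_s8x16_fallback(int8x16_t v) {
  const int16x8_t sum16 = vpaddlq_s8(v);      // 16 x s8  -> 8 x s16
  const int32x4_t sum32 = vpaddlq_s16(sum16); // 8 x s16  -> 4 x s32
  const int64x2_t sum64 = vpaddlq_s32(sum32); // 4 x s32  -> 2 x s64
  return (int)(vgetq_lane_s64(sum64, 0) + vgetq_lane_s64(sum64, 1));
}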
-static INLINE int8x16_t denoiser_16x1_neon( +static inline int8x16_t denoiser_16x1_neon( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, @@ -100,7 +100,7 @@ static INLINE int8x16_t denoiser_16x1_neon( return v_sum_diff_total; } -static INLINE int8x16_t denoiser_adjust_16x1_neon( +static inline int8x16_t denoiser_adjust_16x1_neon( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); diff --git a/av1/encoder/arm/cnn_neon.c b/av1/encoder/arm/cnn_neon.c index 041d86525b..5e24b6916e 100644 --- a/av1/encoder/arm/cnn_neon.c +++ b/av1/encoder/arm/cnn_neon.c @@ -768,13 +768,13 @@ static const float weights_layer_5[] = { 0.565984f, 0.592690f, }; -static INLINE float32x4_t add_f32x4_x4(const float32x4_t a[4]) { +static inline float32x4_t add_f32x4_x4(const float32x4_t a[4]) { float32x4_t sum01 = vaddq_f32(a[0], a[1]); float32x4_t sum23 = vaddq_f32(a[2], a[3]); return vaddq_f32(sum01, sum23); } -static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( +static inline void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( const float **input, int in_width, int in_height, int in_stride, const float *bias, const int skip_width, const int skip_height, const int filter_width, const int filter_height, const int in_channels, @@ -874,7 +874,7 @@ static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( } while (++start_idx < out_channels); } -static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( +static inline void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( const float **input, int in_width, int in_height, int in_stride, const float *bias, const int skip_width, const int skip_height, const int filter_width, const int filter_height, const int in_channels, @@ -951,7 +951,7 @@ static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( } while (++start_idx < out_channels); } -static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( +static inline void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( const float **input, int in_width, int in_height, int in_stride, const float *bias, const int skip_width, const int skip_height, const int filter_width, const int filter_height, const int in_channels, diff --git a/av1/encoder/arm/encodetxb_neon.c b/av1/encoder/arm/encodetxb_neon.c index 29ca087ede..8486c76abb 100644 --- a/av1/encoder/arm/encodetxb_neon.c +++ b/av1/encoder/arm/encodetxb_neon.c @@ -179,7 +179,7 @@ static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = { // end of coefficients declaration area -static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src, +static inline uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src, const int byte_stride) { #if AOM_ARCH_AARCH64 uint32x4_t v_data = vld1q_u32((uint32_t *)src); @@ -193,7 +193,7 @@ static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src, #endif } -static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src, +static inline uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src, const int byte_stride) { #if AOM_ARCH_AARCH64 uint64x2_t v_data = vld1q_u64((uint64_t *)src); @@ -208,13 +208,13 @@ static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src, #endif } -static INLINE 
uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src, +static inline uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src, const int byte_stride) { (void)byte_stride; return vld1q_u8(src); } -static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride, +static inline void load_levels_4x4x5(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, uint8x16_t *const level) { level[0] = load_8bit_4x4_to_1_reg(&src[1], stride); @@ -224,7 +224,7 @@ static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride, level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride); } -static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride, +static inline void load_levels_8x2x5(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, uint8x16_t *const level) { level[0] = load_8bit_8x2_to_1_reg(&src[1], stride); @@ -234,7 +234,7 @@ static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride, level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride); } -static INLINE void load_levels_16x1x5(const uint8_t *const src, +static inline void load_levels_16x1x5(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, uint8x16_t *const level) { @@ -245,7 +245,7 @@ static INLINE void load_levels_16x1x5(const uint8_t *const src, level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride); } -static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) { +static inline uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) { const uint8x16_t const_3 = vdupq_n_u8(3); const uint8x16_t const_4 = vdupq_n_u8(4); uint8x16_t count; @@ -265,7 +265,7 @@ static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) { return count; } -static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, +static inline void get_4_nz_map_contexts_2d(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *const coeff_contexts) { @@ -296,7 +296,7 @@ static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, coeff_contexts[0] = 0; } -static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, +static inline void get_4_nz_map_contexts_ver(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { @@ -322,7 +322,7 @@ static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, } while (col); } -static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, +static inline void get_4_nz_map_contexts_hor(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { @@ -349,7 +349,7 @@ static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, } while (col); } -static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, +static inline void get_8_coeff_contexts_2d(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { @@ -389,7 +389,7 @@ static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, coeff_contexts[0] = 0; } -static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, +static inline void get_8_coeff_contexts_ver(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { @@ -414,7 +414,7 @@ static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, } while (col); } -static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, +static inline void get_8_coeff_contexts_hor(const uint8_t *levels, 
const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { @@ -442,7 +442,7 @@ static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, } while (col); } -static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, +static inline void get_16n_coeff_contexts_2d(const uint8_t *levels, const int real_width, const int real_height, const int width, const int height, @@ -506,7 +506,7 @@ static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, coeff_contexts[0] = 0; } -static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, +static inline void get_16n_coeff_contexts_ver(const uint8_t *levels, const int width, const int height, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { @@ -539,7 +539,7 @@ static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, } while (--col); } -static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, +static inline void get_16n_coeff_contexts_hor(const uint8_t *levels, const int width, const int height, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { diff --git a/av1/encoder/arm/highbd_pickrst_neon.c b/av1/encoder/arm/highbd_pickrst_neon.c index cfc5e0c7e8..3d69e200cf 100644 --- a/av1/encoder/arm/highbd_pickrst_neon.c +++ b/av1/encoder/arm/highbd_pickrst_neon.c @@ -18,7 +18,7 @@ #include "av1/encoder/arm/pickrst_neon.h" #include "av1/encoder/pickrst.h" -static INLINE void highbd_calc_proj_params_r0_r1_neon( +static inline void highbd_calc_proj_params_r0_r1_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { @@ -115,7 +115,7 @@ static INLINE void highbd_calc_proj_params_r0_r1_neon( C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; } -static INLINE void highbd_calc_proj_params_r0_neon( +static inline void highbd_calc_proj_params_r0_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { @@ -180,7 +180,7 @@ static INLINE void highbd_calc_proj_params_r0_neon( C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; } -static INLINE void highbd_calc_proj_params_r1_neon( +static inline void highbd_calc_proj_params_r1_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { @@ -272,7 +272,7 @@ void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width, } } -static INLINE int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) { +static inline int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) { #if AOM_ARCH_AARCH64 uint8x16x2_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b) } }; return vreinterpretq_s16_u8(vqtbl2q_u8(table, idx)); @@ -286,7 +286,7 @@ static INLINE int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) { #endif } -static INLINE int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c, +static inline int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c, uint8x16_t idx) { #if AOM_ARCH_AARCH64 uint8x16x3_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b), @@ -307,7 +307,7 @@ static INLINE int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c, #endif } -static INLINE int64_t div_shift_s64(int64_t x, int power) { +static inline int64_t div_shift_s64(int64_t x, int power) { return (x < 0 ? 
x + (1ll << power) - 1 : x) >> power; } @@ -315,7 +315,7 @@ static INLINE int64_t div_shift_s64(int64_t x, int power) { // speed up the computation. This function computes the final M from the // accumulated (src_s64) and the residual parts (src_s32). It also transposes // the result as the output needs to be column-major. -static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, +static inline void acc_transpose_M(int64_t *dst, const int64_t *src_s64, const int32_t *src_s32, const int wiener_win, int shift) { for (int i = 0; i < wiener_win; ++i) { @@ -335,7 +335,7 @@ static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, // the accumulated (src_s64) and the residual parts (src_s32). The computed H is // only an upper triangle matrix, this function also fills the lower triangle of // the resulting matrix. -static INLINE void update_H(int64_t *dst, const int64_t *src_s64, +static inline void update_H(int64_t *dst, const int64_t *src_s64, const int32_t *src_s32, const int wiener_win, int stride, int shift) { // For a simplified theoretical 3x3 case where `wiener_win` is 3 and @@ -388,7 +388,7 @@ static INLINE void update_H(int64_t *dst, const int64_t *src_s64, // Load 7x7 matrix into 7 128-bit vectors from consecutive rows, the last load // address is offset to prevent out-of-bounds access. -static INLINE void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src, +static inline void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src, ptrdiff_t stride) { dst[0] = vld1q_s16(src); src += stride; @@ -405,7 +405,7 @@ static INLINE void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src, dst[6] = vld1q_s16(src - 1); } -static INLINE void highbd_compute_stats_win7_neon( +static inline void highbd_compute_stats_win7_neon( const uint16_t *dgd, const uint16_t *src, int avg, int width, int height, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { @@ -684,7 +684,7 @@ static INLINE void highbd_compute_stats_win7_neon( // Load 5x5 matrix into 5 128-bit vectors from consecutive rows, the last load // address is offset to prevent out-of-bounds access. 
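The "offset to prevent out-of-bounds access" comments describe a recurring trick in these loaders. A minimal sketch, assuming a 7-element final row in an int16_t buffer as in load_and_pack_s16_8x7 above (the helper name is hypothetical):

#include <arm_neon.h>

// An 8-lane vld1q_s16 of a 7-element final row would read one int16_t past
// the end of the buffer, so the load starts one element early; lanes 1..7
// hold row[0..6] and the duplicated lane 0 is never consumed downstream.
static inline int16x8_t load_final_row_w7(const int16_t *row) {
  return vld1q_s16(row - 1);
}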
-static INLINE void load_and_pack_s16_6x5(int16x8_t dst[5], const int16_t *src, +static inline void load_and_pack_s16_6x5(int16x8_t dst[5], const int16_t *src, ptrdiff_t stride) { dst[0] = vld1q_s16(src); src += stride; diff --git a/av1/encoder/arm/highbd_pickrst_sve.c b/av1/encoder/arm/highbd_pickrst_sve.c index fc2c24d917..ef7089ecab 100644 --- a/av1/encoder/arm/highbd_pickrst_sve.c +++ b/av1/encoder/arm/highbd_pickrst_sve.c @@ -24,7 +24,7 @@ #include "av1/encoder/pickrst.h" #include "av1/encoder/arm/pickrst_sve.h" -static INLINE uint16_t find_average_sve(const uint16_t *src, int src_stride, +static inline uint16_t find_average_sve(const uint16_t *src, int src_stride, int width, int height) { uint64x2_t avg_u64 = vdupq_n_u64(0); uint16x8_t ones = vdupq_n_u16(1); @@ -51,7 +51,7 @@ static INLINE uint16_t find_average_sve(const uint16_t *src, int src_stride, return (uint16_t)(vaddvq_u64(avg_u64) / (width * height)); } -static INLINE void compute_sub_avg(const uint16_t *buf, int buf_stride, +static inline void compute_sub_avg(const uint16_t *buf, int buf_stride, int16_t avg, int16_t *buf_avg, int buf_avg_stride, int width, int height) { uint16x8_t avg_u16 = vdupq_n_u16(avg); @@ -81,7 +81,7 @@ static INLINE void compute_sub_avg(const uint16_t *buf, int buf_stride, } while (--height > 0); } -static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp, +static inline void copy_upper_triangle(int64_t *H, int64_t *H_tmp, const int wiener_win2, const int divider) { for (int i = 0; i < wiener_win2 - 2; i = i + 2) { @@ -113,7 +113,7 @@ static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp, } // Transpose the matrix that has just been computed and accumulate it in M. -static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn, +static inline void acc_transpose_M(int64_t *M, const int64_t *M_trn, const int wiener_win, const int divider) { for (int i = 0; i < wiener_win; ++i) { for (int j = 0; j < wiener_win; ++j) { @@ -140,7 +140,7 @@ static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn, // by taking each different pair of columns, and multiplying all the elements of // the first one with all the elements of the second one, with a special case // when multiplying a column by itself. -static INLINE void highbd_compute_stats_win7_sve( +static inline void highbd_compute_stats_win7_sve( int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride, int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) { const int wiener_win = 7; @@ -270,7 +270,7 @@ static INLINE void highbd_compute_stats_win7_sve( // by taking each different pair of columns, and multiplying all the elements of // the first one with all the elements of the second one, with a special case // when multiplying a column by itself. 
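A scalar model of the column-pair accumulation described above (hypothetical helper, for exposition): for one window position, H receives every pairwise product of the mean-subtracted samples, i.e. it accumulates the Gram matrix of the window vectors.

#include <stdint.h>

// win[] holds the wiener_win2 mean-subtracted samples of one window.
// Only the upper triangle is accumulated; the lower triangle is mirrored
// afterwards, as copy_upper_triangle does in the SVE implementation.
static void update_H_scalar(int64_t *H, const int16_t *win, int wiener_win2) {
  for (int i = 0; i < wiener_win2; ++i) {
    for (int j = i; j < wiener_win2; ++j) {
      H[i * wiener_win2 + j] += (int64_t)win[i] * win[j];
    }
  }
}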
-static INLINE void highbd_compute_stats_win5_sve( +static inline void highbd_compute_stats_win5_sve( int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride, int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) { const int wiener_win = 5; diff --git a/av1/encoder/arm/highbd_temporal_filter_neon.c b/av1/encoder/arm/highbd_temporal_filter_neon.c index 2b2b189a48..59cd3fc7fc 100644 --- a/av1/encoder/arm/highbd_temporal_filter_neon.c +++ b/av1/encoder/arm/highbd_temporal_filter_neon.c @@ -19,7 +19,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" -static INLINE void get_squared_error( +static inline void get_squared_error( const uint16_t *frame1, const uint32_t stride1, const uint16_t *frame2, const uint32_t stride2, const uint32_t block_width, const uint32_t block_height, uint32_t *frame_sse, diff --git a/av1/encoder/arm/pickrst_neon.c b/av1/encoder/arm/pickrst_neon.c index b6fba99d8d..63551cc556 100644 --- a/av1/encoder/arm/pickrst_neon.c +++ b/av1/encoder/arm/pickrst_neon.c @@ -181,7 +181,7 @@ int64_t av1_lowbd_pixel_proj_error_neon( // can be as high as 16384 for the compute stats. #define STAT_ACCUMULATOR_MAX 16384 -static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { +static inline uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { #if AOM_ARCH_AARCH64 uint8x16x2_t table = { { a, b } }; return vqtbl2_u8(table, idx); @@ -192,7 +192,7 @@ static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { #endif } -static INLINE uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { +static inline uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { #if AOM_ARCH_AARCH64 uint8x16x2_t table = { { a, b } }; return vqtbl2q_u8(table, idx); @@ -208,7 +208,7 @@ static INLINE uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { // computation. This function computes the final M from the accumulated // (src_s64) and the residual parts (src_s32). It also transposes the result as // the output needs to be column-major. -static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, +static inline void acc_transpose_M(int64_t *dst, const int64_t *src_s64, const int32_t *src_s32, const int wiener_win, int scale) { for (int i = 0; i < wiener_win; ++i) { @@ -281,7 +281,7 @@ static void update_H(int64_t *dst, const int64_t *src_s64, // Load 7x7 matrix into 3 and a half 128-bit vectors from consecutive rows, the // last load address is offset to prevent out-of-bounds access. -static INLINE void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src, +static inline void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src, ptrdiff_t stride) { dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); src += 2 * stride; @@ -292,7 +292,7 @@ static INLINE void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src, dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0)); } -static INLINE void compute_stats_win7_neon(const uint8_t *dgd, +static inline void compute_stats_win7_neon(const uint8_t *dgd, const uint8_t *src, int width, int height, int dgd_stride, int src_stride, int avg, int64_t *M, @@ -580,7 +580,7 @@ static INLINE void compute_stats_win7_neon(const uint8_t *dgd, // Load 5x5 matrix into 2 and a half 128-bit vectors from consecutive rows, the // last load address is offset to prevent out-of-bounds access. 
-static INLINE void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src, +static inline void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src, ptrdiff_t stride) { dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); src += 2 * stride; @@ -589,7 +589,7 @@ static INLINE void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src, dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0)); } -static INLINE void compute_stats_win5_neon(const uint8_t *dgd, +static inline void compute_stats_win5_neon(const uint8_t *dgd, const uint8_t *src, int width, int height, int dgd_stride, int src_stride, int avg, int64_t *M, @@ -826,7 +826,7 @@ static INLINE void compute_stats_win5_neon(const uint8_t *dgd, downsample_factor); } -static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride, +static inline uint8_t find_average_neon(const uint8_t *src, int src_stride, int width, int height) { uint64_t sum = 0; @@ -986,7 +986,7 @@ void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd, } } -static INLINE void calc_proj_params_r0_r1_neon( +static inline void calc_proj_params_r0_r1_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { @@ -1075,7 +1075,7 @@ static INLINE void calc_proj_params_r0_r1_neon( C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; } -static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width, +static inline void calc_proj_params_r0_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, @@ -1133,7 +1133,7 @@ static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width, C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; } -static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width, +static inline void calc_proj_params_r1_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, diff --git a/av1/encoder/arm/pickrst_neon.h b/av1/encoder/arm/pickrst_neon.h index fd6fedb4e9..945593008c 100644 --- a/av1/encoder/arm/pickrst_neon.h +++ b/av1/encoder/arm/pickrst_neon.h @@ -25,7 +25,7 @@ // Compute 8 values of M (cross correlation) for a single source pixel and // accumulate. -static INLINE void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg, +static inline void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg, int16x8_t dgd_avg) { int32x4_t lo = vld1q_s32(M_s32 + 0); int32x4_t hi = vld1q_s32(M_s32 + 4); @@ -39,7 +39,7 @@ static INLINE void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg, // Compute 8 values of M (cross correlation) for two source pixels and // accumulate. 
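
Stated in scalar form, the two-pixel variant below does the following: each source sample multiplies eight window samples, and the products accumulate into 32-bit intermediates that a later pass widens into the 64-bit M. A minimal model with the lane packing stripped away:

#include <stdint.h>

static void update_M_2pixels_sketch(int32_t *M_s32, int16_t src0, int16_t src1,
                                    const int16_t dgd0[8],
                                    const int16_t dgd1[8]) {
  for (int k = 0; k < 8; k++) {
    M_s32[k] += (int32_t)src0 * dgd0[k];
    M_s32[k] += (int32_t)src1 * dgd1[k];
  }
}
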
-static INLINE void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0, +static inline void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0, int16x4_t src_avg1, int16x8_t dgd_avg0, int16x8_t dgd_avg1) { int32x4_t lo = vld1q_s32(M_s32 + 0); @@ -54,7 +54,7 @@ static INLINE void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0, vst1q_s32(M_s32 + 4, hi); } -static INLINE void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg, +static inline void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg, int width, int height) { for (int i = 0; i < height; i += 4) { int16x4_t di = vld1_s16(dgd_avg + i); @@ -80,7 +80,7 @@ static INLINE void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg, } } -static INLINE void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, +static inline void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, const int16_t *dgd_avg1) { for (int i = 0; i < 24; i += 4) { int16x4_t di0 = vld1_s16(dgd_avg0 + i); @@ -112,7 +112,7 @@ static INLINE void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, } } -static INLINE void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, +static inline void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, const int16_t *dgd_avg1) { for (int i = 0; i < 48; i += 4) { int16x4_t di0 = vld1_s16(dgd_avg0 + i); @@ -164,7 +164,7 @@ static INLINE void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, } // Widen 32-bit src data and accumulate into 64-bit dst. Clear src data. -static INLINE void accumulate_and_clear(int64_t *dst, int32_t *src, +static inline void accumulate_and_clear(int64_t *dst, int32_t *src, int length) { do { int32x4_t s32 = vld1q_s32(src); diff --git a/av1/encoder/arm/pickrst_sve.c b/av1/encoder/arm/pickrst_sve.c index ed3cb5223d..50d4961bc7 100644 --- a/av1/encoder/arm/pickrst_sve.c +++ b/av1/encoder/arm/pickrst_sve.c @@ -25,7 +25,7 @@ #include "av1/encoder/pickrst.h" #include "av1/encoder/arm/pickrst_sve.h" -static INLINE uint8_t find_average_sve(const uint8_t *src, int src_stride, +static inline uint8_t find_average_sve(const uint8_t *src, int src_stride, int width, int height) { uint32x4_t avg_u32 = vdupq_n_u32(0); uint8x16_t ones = vdupq_n_u8(1); @@ -52,7 +52,7 @@ static INLINE uint8_t find_average_sve(const uint8_t *src, int src_stride, return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height)); } -static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, +static inline void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, int16_t *buf_avg, int buf_avg_stride, int width, int height, int downsample_factor) { @@ -84,7 +84,7 @@ static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, } while (height > 0); } -static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp, +static inline void copy_upper_triangle(int64_t *H, int64_t *H_tmp, const int wiener_win2, const int scale) { for (int i = 0; i < wiener_win2 - 2; i = i + 2) { // Transpose the first 2x2 square. It needs a special case as the element @@ -115,7 +115,7 @@ static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp, } // Transpose the matrix that has just been computed and accumulate it in M. 
-static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn, +static inline void acc_transpose_M(int64_t *M, const int64_t *M_trn, const int wiener_win, int scale) { for (int i = 0; i < wiener_win; ++i) { for (int j = 0; j < wiener_win; ++j) { @@ -142,7 +142,7 @@ static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn, // by taking each different pair of columns, and multiplying all the elements of // the first one with all the elements of the second one, with a special case // when multiplying a column by itself. -static INLINE void compute_stats_win7_sve(int16_t *dgd_avg, int dgd_avg_stride, +static inline void compute_stats_win7_sve(int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride, int width, int height, int64_t *M, int64_t *H, int downsample_factor) { @@ -276,7 +276,7 @@ static INLINE void compute_stats_win7_sve(int16_t *dgd_avg, int dgd_avg_stride, // by taking each different pair of columns, and multiplying all the elements of // the first one with all the elements of the second one, with a special case // when multiplying a column by itself. -static INLINE void compute_stats_win5_sve(int16_t *dgd_avg, int dgd_avg_stride, +static inline void compute_stats_win5_sve(int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride, int width, int height, int64_t *M, int64_t *H, int downsample_factor) { diff --git a/av1/encoder/arm/pickrst_sve.h b/av1/encoder/arm/pickrst_sve.h index 5d629ee369..d5b5330300 100644 --- a/av1/encoder/arm/pickrst_sve.h +++ b/av1/encoder/arm/pickrst_sve.h @@ -19,7 +19,7 @@ // Swap each half of the dgd vectors so that we can accumulate the result of // the dot-products directly in the destination matrix. -static INLINE int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { +static inline int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { int16x8_t dgd_trn0 = vreinterpretq_s16_s64( vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); int16x8_t dgd_trn1 = vreinterpretq_s16_s64( @@ -28,7 +28,7 @@ static INLINE int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 }; } -static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], +static inline void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], int64_t *M, int row) { const int wiener_win = 5; @@ -50,7 +50,7 @@ static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], M[row * wiener_win + 4] += vaddvq_s64(m4); } -static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], +static inline void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], int64_t *M, int row) { const int wiener_win = 7; @@ -79,7 +79,7 @@ static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], M[row * wiener_win + 6] += vaddvq_s64(m6); } -static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, +static inline void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, const int wiener_win, const int wiener_win2) { for (int row0 = 0; row0 < wiener_win; row0++) { @@ -93,7 +93,7 @@ static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, } } -static INLINE void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, +static inline void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, int row0, int row1, int64_t *H) { for (int col0 = 0; col0 < 5; col0++) { int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5); @@ -117,7 +117,7 @@ static INLINE void compute_H_two_rows_win5(int16x8_t 
*dgd0, int16x8_t *dgd1, } } -static INLINE void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, +static inline void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, int row0, int row1, int64_t *H) { for (int col0 = 0; col0 < 7; col0++) { int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7); diff --git a/av1/encoder/arm/quantize_neon.c b/av1/encoder/arm/quantize_neon.c index a9194a9b59..cbeafc798a 100644 --- a/av1/encoder/arm/quantize_neon.c +++ b/av1/encoder/arm/quantize_neon.c @@ -27,7 +27,7 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/rd.h" -static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +static inline uint16_t get_max_eob(int16x8_t v_eobmax) { #if AOM_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else @@ -45,7 +45,7 @@ static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { #endif } -static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, +static inline int16x8_t get_max_lane_eob(const int16_t *iscan, int16x8_t v_eobmax, uint16x8_t v_mask) { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); @@ -54,7 +54,7 @@ static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, return vmaxq_s16(v_eobmax, v_nz_iscan); } -static INLINE uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr, +static inline uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, @@ -118,7 +118,7 @@ void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, *eob_ptr = get_max_eob(v_eobmax_76543210); } -static INLINE uint16x8_t quantize_lp_8(const int16_t *coeff_ptr, +static inline uint16x8_t quantize_lp_8(const int16_t *coeff_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, int16x8_t v_round, diff --git a/av1/encoder/arm/rdopt_neon.c b/av1/encoder/arm/rdopt_neon.c index e96c7ba23c..5199b2e7c8 100644 --- a/av1/encoder/arm/rdopt_neon.c +++ b/av1/encoder/arm/rdopt_neon.c @@ -21,7 +21,7 @@ // We actually use the 4x4 pixels to calculate correlations corresponding to // the top-left 3x3 pixels, so this function must be called with 1x1 overlap, // moving the window along/down by 3 pixels at a time. 
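
The 1x1-overlap requirement above translates, on the caller side, into stepping the 4x4 window by 3 in each direction so that neighbouring windows share exactly one row or column. A hypothetical caller shape for the kernel that follows (walk_diff_surface and visit are illustrative names; edge and remainder handling are omitted):

#include <stdint.h>

static void walk_diff_surface(const int16_t *diff, int stride, int width,
                              int height,
                              void (*visit)(const int16_t *win, int stride)) {
  // Step by 3 so adjacent 4x4 windows overlap by exactly one row/column.
  for (int i = 0; i + 4 <= height; i += 3)
    for (int j = 0; j + 4 <= width; j += 3)
      visit(diff + i * stride + j, stride);
}
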
-INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, +inline static void horver_correlation_4x4(const int16_t *diff, int stride, int32x4_t *xy_sum_32, int32x4_t *xz_sum_32, int32x4_t *x_sum_32, diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c index 08746b5a9b..f91f4d0a7a 100644 --- a/av1/encoder/arm/temporal_filter_neon.c +++ b/av1/encoder/arm/temporal_filter_neon.c @@ -30,7 +30,7 @@ DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = { 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }; -static INLINE void get_squared_error( +static inline void get_squared_error( const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2, const uint32_t stride2, const uint32_t block_width, const uint32_t block_height, uint16_t *frame_sse, @@ -60,7 +60,7 @@ static INLINE void get_squared_error( } while (++i < block_height); } -static INLINE uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col, +static inline uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col, const uint32_t block_width) { uint16x8_t s = vld1q_u16(src); diff --git a/av1/encoder/arm/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c index 5fadeb9dc8..63e5602a91 100644 --- a/av1/encoder/arm/temporal_filter_neon_dotprod.c +++ b/av1/encoder/arm/temporal_filter_neon_dotprod.c @@ -41,7 +41,7 @@ DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = { // clang-format on -static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1, +static inline void get_abs_diff(const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2, const uint32_t stride2, const uint32_t block_width, const uint32_t block_height, diff --git a/av1/encoder/av1_fwd_txfm2d.c b/av1/encoder/av1_fwd_txfm2d.c index 625b6298e9..0d44341db0 100644 --- a/av1/encoder/av1_fwd_txfm2d.c +++ b/av1/encoder/av1_fwd_txfm2d.c @@ -20,7 +20,7 @@ #include "av1/encoder/av1_fwd_txfm1d.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" -static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { +static inline TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT4: return av1_fdct4; case TXFM_TYPE_DCT8: return av1_fdct8; @@ -53,7 +53,7 @@ void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, } } -static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output, +static inline void fwd_txfm2d_c(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_FLIP_CFG *cfg, int32_t *buf, int bd) { int c, r; @@ -386,7 +386,7 @@ static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = { fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2 }; -static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) { +static inline void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) { av1_zero(cfg->stage_range_col); av1_zero(cfg->stage_range_row); diff --git a/av1/encoder/av1_noise_estimate.c b/av1/encoder/av1_noise_estimate.c index ca4c768d4e..52a87f202d 100644 --- a/av1/encoder/av1_noise_estimate.c +++ b/av1/encoder/av1_noise_estimate.c @@ -26,7 +26,7 @@ #if CONFIG_AV1_TEMPORAL_DENOISING // For SVC: only do noise estimation on top spatial layer. 
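
Before the SVC noise-estimation predicate below, one note on the temporal-filter kernels a few hunks up: get_squared_error() builds a per-pixel squared-error map that the later weighting passes read back. Its scalar equivalent is a short double loop; the real kernel also offsets its output columns for the sliding-window reads, which this sketch omits:

#include <stdint.h>

static void get_squared_error_sketch(const uint8_t *frame1, int stride1,
                                     const uint8_t *frame2, int stride2,
                                     int block_width, int block_height,
                                     uint16_t *frame_sse, int sse_stride) {
  for (int i = 0; i < block_height; i++) {
    for (int j = 0; j < block_width; j++) {
      const int diff = frame1[i * stride1 + j] - frame2[i * stride2 + j];
      // diff is in [-255, 255], so diff * diff fits comfortably in uint16_t.
      frame_sse[i * sse_stride + j] = (uint16_t)(diff * diff);
    }
  }
}
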
-static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) { +static inline int noise_est_svc(const struct AV1_COMP *const cpi) { return (!cpi->ppi->use_svc || (cpi->ppi->use_svc && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c index 382d07c5b0..b39be5b194 100644 --- a/av1/encoder/av1_quantize.c +++ b/av1/encoder/av1_quantize.c @@ -514,7 +514,7 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, } } -static INLINE void highbd_quantize_dc( +static inline void highbd_quantize_dc( const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr, @@ -674,7 +674,7 @@ void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, } } -static INLINE bool deltaq_params_have_changed( +static inline bool deltaq_params_have_changed( const DeltaQuantParams *prev_deltaq_params, const CommonQuantParams *quant_params) { return (prev_deltaq_params->y_dc_delta_q != quant_params->y_dc_delta_q || diff --git a/av1/encoder/av1_temporal_denoiser.h b/av1/encoder/av1_temporal_denoiser.h index b3d2e4f31e..982e43b470 100644 --- a/av1/encoder/av1_temporal_denoiser.h +++ b/av1/encoder/av1_temporal_denoiser.h @@ -103,7 +103,7 @@ int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser, // This function is used by both c and sse2 denoiser implementations. // Define it as a static function within the scope where av1_denoiser.h // is referenced. -static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs, +static inline int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) { return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 
3 : 2); } diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index a2497571ee..8d4dd53de2 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -53,7 +53,7 @@ #define SETUP_TIME_OH_CONST 5 // Setup time overhead constant per worker #define JOB_DISP_TIME_OH_CONST 1 // Job dispatch time overhead per tile -static INLINE void write_uniform(aom_writer *w, int n, int v) { +static inline void write_uniform(aom_writer *w, int n, int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; if (l == 0) return; @@ -416,7 +416,7 @@ static AOM_INLINE void pack_txb_tokens( } } -static INLINE void set_spatial_segment_id( +static inline void set_spatial_segment_id( const CommonModeInfoParams *const mi_params, uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col, uint8_t segment_id) { const int mi_offset = mi_row * mi_params->mi_cols + mi_col; @@ -1046,7 +1046,7 @@ static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm, write_filter_intra_mode_info(cm, xd, mbmi, w); } -static INLINE int16_t mode_context_analyzer( +static inline int16_t mode_context_analyzer( const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) { if (rf[1] <= INTRA_FRAME) return mode_context; @@ -1058,7 +1058,7 @@ static INLINE int16_t mode_context_analyzer( return comp_ctx; } -static INLINE int_mv get_ref_mv_from_stack( +static inline int_mv get_ref_mv_from_stack( int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) { const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); @@ -1076,7 +1076,7 @@ static INLINE int_mv get_ref_mv_from_stack( : mbmi_ext_frame->global_mvs[ref_frame_type]; } -static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { +static inline int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; int ref_mv_idx = mbmi->ref_mv_idx; @@ -2275,7 +2275,7 @@ static AOM_INLINE void write_ext_tile_info( } } -static INLINE int find_identical_tile( +static inline int find_identical_tile( const int tile_row, const int tile_col, TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) { const MV32 candidate_offset[1] = { { 1, 0 } }; @@ -3723,7 +3723,7 @@ static void write_large_scale_tile_obu( } // Packs information in the obu header for large scale tiles. -static INLINE uint32_t pack_large_scale_tiles_in_tg_obus( +static inline uint32_t pack_large_scale_tiles_in_tg_obus( AV1_COMP *const cpi, uint8_t *const dst, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, int *const largest_tile_id) { @@ -4075,7 +4075,7 @@ static int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles, return ideal_num_workers; } -static INLINE uint32_t pack_tiles_in_tg_obus( +static inline uint32_t pack_tiles_in_tg_obus( AV1_COMP *const cpi, uint8_t *const dst, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, const FrameHeaderInfo *fh_info, int *const largest_tile_id) { diff --git a/av1/encoder/block.h b/av1/encoder/block.h index 01e012a0f2..4e62312bd5 100644 --- a/av1/encoder/block.h +++ b/av1/encoder/block.h @@ -1414,7 +1414,7 @@ typedef struct macroblock { // Zeroes out 'n_stats' elements in the array x->winner_mode_stats. // It only zeroes out what is necessary in 'color_index_map' (just the block // size, not the whole array). 
-static INLINE void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats, +static inline void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats, WinnerModeStats *stats) { // When winner mode stats are not required, the memory allocation is avoided // for x->winner_mode_stats. The stats pointer will be NULL in such cases. @@ -1436,7 +1436,7 @@ static INLINE void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats, } } -static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { +static inline int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { static const char LUT[BLOCK_SIZES_ALL] = { 0, // BLOCK_4X4 1, // BLOCK_4X8 @@ -1465,13 +1465,13 @@ static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { return LUT[bsize]; } -static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd, +static inline int is_rect_tx_allowed(const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi) { return is_rect_tx_allowed_bsize(mbmi->bsize) && !xd->lossless[mbmi->segment_id]; } -static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { +static inline int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { TX_SIZE ctx_size = max_txsize_rect_lookup[bsize]; int depth = 0; while (tx_size != ctx_size) { @@ -1482,7 +1482,7 @@ static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { return depth; } -static INLINE void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx, +static inline void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx, int skip) { if (skip) txb_skip[blk_idx] |= 1UL << plane; @@ -1501,7 +1501,7 @@ static INLINE void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx, #endif } -static INLINE int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) { +static inline int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) { #ifndef NDEBUG // Check if this is initialized assert(!(txb_skip[blk_idx] & (1UL << (plane + 4)))); diff --git a/av1/encoder/cnn.c b/av1/encoder/cnn.c index 6593597470..f1ae43cc83 100644 --- a/av1/encoder/cnn.c +++ b/av1/encoder/cnn.c @@ -31,9 +31,9 @@ typedef struct { int th_step; } CONVOLVE_OPS; -static INLINE float softsign(float x) { return x / (fabsf(x) + 1.0f); } +static inline float softsign(float x) { return x / (fabsf(x) + 1.0f); } -static INLINE float relu(float x) { return (x < 0) ? 0 : x; } +static inline float relu(float x) { return (x < 0) ? 
0 : x; } typedef struct { int allocsize; @@ -222,7 +222,7 @@ static void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, } #if CONFIG_DEBUG -static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) { +static inline int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) { const int num_layers = cnn_config->num_layers; const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config; @@ -289,7 +289,7 @@ void av1_find_cnn_output_size(int in_width, int in_height, } } -static INLINE int get_start_shift_convolve(int width, int filt_width, +static inline int get_start_shift_convolve(int width, int filt_width, int stride) { const int mod = (width % stride); const int filt_off = (filt_width - 1) / 2; @@ -755,7 +755,7 @@ static void convolve_layer_mt(const float **input, int in_width, int in_height, } } -static INLINE int get_start_shift_deconvolve(int filt_width, int stride) { +static inline int get_start_shift_deconvolve(int filt_width, int stride) { const int dif = AOMMAX(filt_width - stride, 0); return dif / 2; } diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c index 99c3c6513d..5e6b923f64 100644 --- a/av1/encoder/compound_type.c +++ b/av1/encoder/compound_type.c @@ -25,7 +25,7 @@ typedef int64_t (*pick_interinter_mask_type)( uint64_t *best_sse); // Checks if characteristics of search match -static INLINE int is_comp_rd_match(const AV1_COMP *const cpi, +static inline int is_comp_rd_match(const AV1_COMP *const cpi, const MACROBLOCK *const x, const COMP_RD_STATS *st, const MB_MODE_INFO *const mi, @@ -79,7 +79,7 @@ static INLINE int is_comp_rd_match(const AV1_COMP *const cpi, // Checks if similar compound type search case is accounted earlier // If found, returns relevant rd data -static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi, +static inline int find_comp_rd_in_stats(const AV1_COMP *const cpi, const MACROBLOCK *x, const MB_MODE_INFO *const mbmi, int32_t *comp_rate, int64_t *comp_dist, @@ -97,21 +97,21 @@ static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi, return 0; // no match result found } -static INLINE bool enable_wedge_search( +static inline bool enable_wedge_search( MACROBLOCK *const x, const unsigned int disable_wedge_var_thresh) { // Enable wedge search if source variance and edge strength are above // the thresholds. 
return x->source_variance > disable_wedge_var_thresh; } -static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x, +static inline bool enable_wedge_interinter_search(MACROBLOCK *const x, const AV1_COMP *const cpi) { return enable_wedge_search( x, cpi->sf.inter_sf.disable_interinter_wedge_var_thresh) && cpi->oxcf.comp_type_cfg.enable_interinter_wedge; } -static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x, +static inline bool enable_wedge_interintra_search(MACROBLOCK *const x, const AV1_COMP *const cpi) { return enable_wedge_search( x, cpi->sf.inter_sf.disable_interintra_wedge_var_thresh) && @@ -458,7 +458,7 @@ static AOM_INLINE void get_inter_predictors_masked_compound( } // Computes the rd cost for the given interintra mode and updates the best -static INLINE void compute_best_interintra_mode( +static inline void compute_best_interintra_mode( const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, MACROBLOCK *const x, const int *const interintra_mode_cost, const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf, @@ -830,7 +830,7 @@ int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, } // Computes the valid compound_types to be evaluated -static INLINE int compute_valid_comp_types(MACROBLOCK *x, +static inline int compute_valid_comp_types(MACROBLOCK *x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, int masked_compound_used, @@ -873,7 +873,7 @@ static INLINE int compute_valid_comp_types(MACROBLOCK *x, } // Calculates the cost for compound type mask -static INLINE void calc_masked_type_cost( +static inline void calc_masked_type_cost( const ModeCosts *mode_costs, BLOCK_SIZE bsize, int comp_group_idx_ctx, int comp_index_ctx, int masked_compound_used, int *masked_type_cost) { av1_zero_array(masked_type_cost, COMPOUND_TYPES); @@ -901,7 +901,7 @@ static INLINE void calc_masked_type_cost( } // Updates mbmi structure with the relevant compound type info -static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi, +static inline void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi, COMPOUND_TYPE cur_type) { mbmi->interinter_comp.type = cur_type; mbmi->comp_group_idx = (cur_type >= COMPOUND_WEDGE); @@ -911,7 +911,7 @@ static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi, // When match is found, populate the compound type data // and calculate the rd cost using the stored stats and // update the mbmi appropriately. 
-static INLINE int populate_reuse_comp_type_data( +static inline int populate_reuse_comp_type_data( const MACROBLOCK *x, MB_MODE_INFO *mbmi, BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate, int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd, @@ -932,7 +932,7 @@ static INLINE int populate_reuse_comp_type_data( } // Updates rd cost and relevant compound type data for the best compound type -static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd, +static inline void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd, BEST_COMP_TYPE_STATS *best_type_stats, int64_t best_rd_cur, int64_t comp_model_rd_cur, int rs2) { @@ -943,7 +943,7 @@ static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd, } // Updates best_mv for masked compound types -static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi, +static inline void update_mask_best_mv(const MB_MODE_INFO *const mbmi, int_mv *best_mv, int *best_tmp_rate_mv, int tmp_rate_mv) { *best_tmp_rate_mv = tmp_rate_mv; @@ -951,7 +951,7 @@ static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi, best_mv[1].as_int = mbmi->mv[1].as_int; } -static INLINE void save_comp_rd_search_stat( +static inline void save_comp_rd_search_stat( MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate, const int64_t *comp_dist, const int32_t *comp_model_rate, const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) { @@ -980,7 +980,7 @@ static INLINE void save_comp_rd_search_stat( } } -static INLINE int get_interinter_compound_mask_rate( +static inline int get_interinter_compound_mask_rate( const ModeCosts *const mode_costs, const MB_MODE_INFO *const mbmi) { const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD @@ -998,7 +998,7 @@ static INLINE int get_interinter_compound_mask_rate( } // Takes a backup of rate, distortion and model_rd for future reuse -static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate, +static inline void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate, int64_t *comp_model_dist, int rate_sum, int64_t dist_sum, RD_STATS *rd_stats, @@ -1010,7 +1010,7 @@ static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate, comp_rs2[cur_type] = rs2; } -static INLINE int save_mask_search_results(const PREDICTION_MODE this_mode, +static inline int save_mask_search_results(const PREDICTION_MODE this_mode, const int reuse_level) { if (reuse_level || (this_mode == NEW_NEWMV)) return 1; @@ -1018,7 +1018,7 @@ static INLINE int save_mask_search_results(const PREDICTION_MODE this_mode, return 0; } -static INLINE int prune_mode_by_skip_rd(const AV1_COMP *const cpi, +static inline int prune_mode_by_skip_rd(const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *xd, const BLOCK_SIZE bsize, int64_t ref_skip_rd, int mode_rate) { diff --git a/av1/encoder/cost.h b/av1/encoder/cost.h index 0333fd4630..e51e6b8d0a 100644 --- a/av1/encoder/cost.h +++ b/av1/encoder/cost.h @@ -29,7 +29,7 @@ extern const uint16_t av1_prob_cost[128]; #define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) // Calculate the cost of a symbol with probability p15 / 2^15 -static INLINE int av1_cost_symbol(aom_cdf_prob p15) { +static inline int av1_cost_symbol(aom_cdf_prob p15) { // p15 can be out of range [1, CDF_PROB_TOP - 1]. 
Clamping it, so that the // following cost calculation works correctly. Otherwise, if p15 = // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong. diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c index f9dd15ddab..d34a84c333 100644 --- a/av1/encoder/encode_strategy.c +++ b/av1/encoder/encode_strategy.c @@ -44,7 +44,7 @@ #define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1) -static INLINE void set_refresh_frame_flags( +static inline void set_refresh_frame_flags( RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref, bool refresh_arf) { refresh_frame->golden_frame = refresh_gf; diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index de09b70887..80aa2d125c 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -583,7 +583,7 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, } // This function initializes the stats for encode_rd_sb. -static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td, +static inline void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td, const TileDataEnc *tile_data, SIMPLE_MOTION_DATA_TREE *sms_root, RD_STATS *rd_cost, int mi_row, int mi_col, @@ -1551,7 +1551,7 @@ static AOM_INLINE void set_rel_frame_dist( } } -static INLINE int refs_are_one_sided(const AV1_COMMON *cm) { +static inline int refs_are_one_sided(const AV1_COMMON *cm) { assert(!frame_is_intra_only(cm)); int one_sided_refs = 1; @@ -1568,7 +1568,7 @@ static INLINE int refs_are_one_sided(const AV1_COMMON *cm) { return one_sided_refs; } -static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm, +static inline void get_skip_mode_ref_offsets(const AV1_COMMON *cm, int ref_order_hint[2]) { const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; ref_order_hint[0] = ref_order_hint[1] = 0; diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c index 6e268e6b63..4201728f63 100644 --- a/av1/encoder/encodeframe_utils.c +++ b/av1/encoder/encodeframe_utils.c @@ -233,7 +233,7 @@ static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts, // This function will copy the best reference mode information from // MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT. 
-static INLINE void copy_mbmi_ext_frame_to_mbmi_ext( +static inline void copy_mbmi_ext_frame_to_mbmi_ext( MB_MODE_INFO_EXT *mbmi_ext, const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) { memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h index 6f084eb938..477ed6f25f 100644 --- a/av1/encoder/encodeframe_utils.h +++ b/av1/encoder/encodeframe_utils.h @@ -538,7 +538,7 @@ static AOM_INLINE unsigned int get_num_refs_to_disable( return num_refs_to_disable; } -static INLINE int get_max_allowed_ref_frames( +static inline int get_max_allowed_ref_frames( const AV1_COMP *cpi, const int *ref_frame_flags, const unsigned int *ref_display_order_hint, unsigned int cur_frame_display_index) { diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h index 721210a206..5f539c48b0 100644 --- a/av1/encoder/encodemb.h +++ b/av1/encoder/encodemb.h @@ -138,7 +138,7 @@ void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane); -static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block, +static inline void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block]; @@ -153,7 +153,7 @@ void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, TRELLIS_OPT_TYPE enable_optimize_b); -static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b, +static inline int is_trellis_used(TRELLIS_OPT_TYPE optimize_b, RUN_TYPE dry_run) { if (optimize_b == NO_TRELLIS_OPT) return false; if (optimize_b == FINAL_PASS_TRELLIS_OPT && dry_run != OUTPUT_ENABLED) diff --git a/av1/encoder/encodemv.h b/av1/encoder/encodemv.h index f37cd5d13f..ded1fe20b1 100644 --- a/av1/encoder/encodemv.h +++ b/av1/encoder/encodemv.h @@ -46,7 +46,7 @@ void av1_find_best_ref_mvs_from_stack(int allow_hp, int_mv *nearest_mv, int_mv *near_mv, int is_integer); -static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { +static inline MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { // row: Z col: Z | MV_JOINT_ZERO (0) // row: Z col: NZ | MV_JOINT_HNZVZ (1) // row: NZ col: Z | MV_JOINT_HZVNZ (2) @@ -54,17 +54,17 @@ static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { return (!!mv->col) | ((!!mv->row) << 1); } -static INLINE int av1_mv_class_base(MV_CLASS_TYPE c) { +static inline int av1_mv_class_base(MV_CLASS_TYPE c) { return c ? CLASS0_SIZE << (c + 2) : 0; } // If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0. -static INLINE uint8_t av1_log_in_base_2(unsigned int n) { +static inline uint8_t av1_log_in_base_2(unsigned int n) { // get_msb() is only valid when n != 0. return n == 0 ? 
0 : get_msb(n); } -static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { +static inline MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { assert(z >= 0); const MV_CLASS_TYPE c = (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3); assert(c <= MV_CLASS_10); @@ -72,7 +72,7 @@ static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { return c; } -static INLINE int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm, +static inline int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm, MACROBLOCK *const x) { (void)cm; MACROBLOCKD *xd = &x->e_mbd; diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 352bb64c9a..f845f395cc 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -106,7 +106,7 @@ FILE *yuv_rec_file; FILE *yuv_denoised_file = NULL; #endif -static INLINE void Scale2Ratio(AOM_SCALING_MODE mode, int *hr, int *hs) { +static inline void Scale2Ratio(AOM_SCALING_MODE mode, int *hr, int *hs) { switch (mode) { case AOME_NORMAL: *hr = 1; @@ -376,7 +376,7 @@ void av1_update_frame_size(AV1_COMP *cpi) { set_tile_info(cm, &cpi->oxcf.tile_cfg); } -static INLINE int does_level_match(int width, int height, double fps, +static inline int does_level_match(int width, int height, double fps, int lvl_width, int lvl_height, double lvl_fps, int lvl_dim_mult) { const int64_t lvl_luma_pels = (int64_t)lvl_width * lvl_height; @@ -969,7 +969,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf, #endif // CONFIG_REALTIME_ONLY } -static INLINE void init_frame_info(FRAME_INFO *frame_info, +static inline void init_frame_info(FRAME_INFO *frame_info, const AV1_COMMON *const cm) { const CommonModeInfoParams *const mi_params = &cm->mi_params; const SequenceHeader *const seq_params = cm->seq_params; @@ -985,11 +985,11 @@ static INLINE void init_frame_info(FRAME_INFO *frame_info, frame_info->subsampling_y = seq_params->subsampling_y; } -static INLINE void init_frame_index_set(FRAME_INDEX_SET *frame_index_set) { +static inline void init_frame_index_set(FRAME_INDEX_SET *frame_index_set) { frame_index_set->show_frame_count = 0; } -static INLINE void update_counters_for_show_frame(AV1_COMP *const cpi) { +static inline void update_counters_for_show_frame(AV1_COMP *const cpi) { assert(cpi->common.show_frame); cpi->frame_index_set.show_frame_count++; cpi->common.current_frame.frame_number++; @@ -2269,7 +2269,7 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -static INLINE int extend_borders_mt(const AV1_COMP *cpi, +static inline int extend_borders_mt(const AV1_COMP *cpi, MULTI_THREADED_MODULES stage, int plane) { const AV1_COMMON *const cm = &cpi->common; if (cpi->mt_info.num_mod_workers[stage] < 2) return 0; diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h index e071b0496b..ef2d53ad8f 100644 --- a/av1/encoder/encoder.h +++ b/av1/encoder/encoder.h @@ -136,7 +136,7 @@ enum { // 0 level frames are sometimes used for rate control purposes, but for // reference mapping purposes, the minimum level should be 1. 
#define MIN_PYR_LEVEL 1 -static INLINE int get_true_pyr_level(int frame_level, int frame_order, +static inline int get_true_pyr_level(int frame_level, int frame_order, int max_layer_depth) { if (frame_order == 0) { // Keyframe case @@ -1100,7 +1100,7 @@ typedef struct AV1EncoderConfig { } AV1EncoderConfig; /*!\cond */ -static INLINE int is_lossless_requested(const RateControlCfg *const rc_cfg) { +static inline int is_lossless_requested(const RateControlCfg *const rc_cfg) { return rc_cfg->best_allowed_q == 0 && rc_cfg->worst_allowed_q == 0; } /*!\endcond */ @@ -1980,7 +1980,7 @@ enum { kTimingComponents, } UENUM1BYTE(TIMING_COMPONENT); -static INLINE char const *get_component_name(int index) { +static inline char const *get_component_name(int index) { switch (index) { case av1_encode_strategy_time: return "av1_encode_strategy_time"; case av1_get_one_pass_rt_params_time: @@ -3911,7 +3911,7 @@ typedef struct { int disp_order; } RefFrameMapPair; -static INLINE void init_ref_map_pair( +static inline void init_ref_map_pair( AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) { if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) { memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES); @@ -3974,19 +3974,19 @@ static AOM_INLINE void calc_frame_data_update_flag( // av1 uses 10,000,000 ticks/second as time stamp #define TICKS_PER_SEC 10000000LL -static INLINE int64_t -timebase_units_to_ticks(const aom_rational64_t *timestamp_ratio, int64_t n) { +static inline int64_t timebase_units_to_ticks( + const aom_rational64_t *timestamp_ratio, int64_t n) { return n * timestamp_ratio->num / timestamp_ratio->den; } -static INLINE int64_t -ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) { +static inline int64_t ticks_to_timebase_units( + const aom_rational64_t *timestamp_ratio, int64_t n) { int64_t round = timestamp_ratio->num / 2; if (round > 0) --round; return (n * timestamp_ratio->den + round) / timestamp_ratio->num; } -static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { +static inline int frame_is_kf_gf_arf(const AV1_COMP *cpi) { const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const FRAME_UPDATE_TYPE update_type = gf_group->update_type[cpi->gf_frame_index]; @@ -3996,19 +3996,19 @@ static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { } // TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD. -static INLINE int av1_use_hash_me(const AV1_COMP *const cpi) { +static inline int av1_use_hash_me(const AV1_COMP *const cpi) { return (cpi->common.features.allow_screen_content_tools && cpi->common.features.allow_intrabc && frame_is_intra_only(&cpi->common)); } -static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf( +static inline const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf( const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); return buf != NULL ? &buf->buf : NULL; } -static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) { +static inline void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) { assert(buf != NULL); ensure_mv_buffer(buf, cm); buf->width = cm->width; @@ -4017,7 +4017,7 @@ static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) { // Get the allocated token size for a tile. It does the same calculation as in // the frame token allocation. 
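
Before the token-allocation helper below, a quick numeric check of the tick-conversion helpers a few hunks up. With an illustrative 1/30-second timebase the ratio is 10,000,000/30, and the truncating forward conversion and the rounded inverse round-trip exactly; RationalSketch and the values here are stand-ins chosen for the example, not taken from the encoder:

#include <assert.h>
#include <stdint.h>

typedef struct { int64_t num, den; } RationalSketch;  // stands in for aom_rational64_t

static int64_t to_ticks_sketch(const RationalSketch *r, int64_t n) {
  return n * r->num / r->den;  // truncating, as in timebase_units_to_ticks
}

static int64_t to_units_sketch(const RationalSketch *r, int64_t n) {
  int64_t round = r->num / 2;
  if (round > 0) --round;
  return (n * r->den + round) / r->num;  // rounded, as in ticks_to_timebase_units
}

int main(void) {
  const RationalSketch r = { 10000000, 30 };  // TICKS_PER_SEC * (1/30) timebase
  assert(to_ticks_sketch(&r, 1) == 333333);   // truncated from 333333.33
  assert(to_units_sketch(&r, to_ticks_sketch(&r, 7)) == 7);  // round-trips
  return 0;
}
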
-static INLINE unsigned int allocated_tokens(const TileInfo *tile, +static inline unsigned int allocated_tokens(const TileInfo *tile, int sb_size_log2, int num_planes) { int tile_mb_rows = ROUND_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start, 2); @@ -4027,7 +4027,7 @@ static INLINE unsigned int allocated_tokens(const TileInfo *tile, return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes); } -static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col, +static inline void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col, int mi_row, TokenExtra **tok, int sb_size_log2, int num_planes) { AV1_COMMON *const cm = &cpi->common; @@ -4046,7 +4046,7 @@ static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col, void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags); #define ALT_MIN_LAG 3 -static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) { +static inline int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) { return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf; } @@ -4056,24 +4056,24 @@ static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) { } // Helper function to compute number of blocks on either side of the frame. -static INLINE int get_num_blocks(const int frame_length, const int mb_length) { +static inline int get_num_blocks(const int frame_length, const int mb_length) { return (frame_length + mb_length - 1) / mb_length; } // Check if statistics generation stage -static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) { +static inline int is_stat_generation_stage(const AV1_COMP *const cpi) { assert(IMPLIES(cpi->compressor_stage == LAP_STAGE, cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->ppi->lap_enabled)); return (cpi->oxcf.pass == AOM_RC_FIRST_PASS || (cpi->compressor_stage == LAP_STAGE)); } // Check if statistics consumption stage -static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) { +static inline int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) { return (cpi->oxcf.pass >= AOM_RC_SECOND_PASS); } // Check if statistics consumption stage -static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) { +static inline int is_stat_consumption_stage(const AV1_COMP *const cpi) { return (is_stat_consumption_stage_twopass(cpi) || (cpi->oxcf.pass == AOM_RC_ONE_PASS && (cpi->compressor_stage == ENCODE_STAGE) && cpi->ppi->lap_enabled)); @@ -4094,7 +4094,7 @@ static AOM_INLINE bool av1_need_dv_costs(const AV1_COMP *const cpi) { * * \return 0 if no stats for current stage else 1 */ -static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) { +static inline int has_no_stats_stage(const AV1_COMP *const cpi) { assert( IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE)); return (cpi->oxcf.pass == AOM_RC_ONE_PASS && !cpi->ppi->lap_enabled); @@ -4102,20 +4102,20 @@ static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) { /*!\cond */ -static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) { +static inline int is_one_pass_rt_params(const AV1_COMP *cpi) { return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && cpi->oxcf.gf_cfg.lag_in_frames == 0; } // Use default/internal reference structure for single-layer RTC. 
-static INLINE int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) { +static inline int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) { return is_one_pass_rt_params(cpi) && cpi->ppi->number_spatial_layers == 1 && cpi->ppi->number_temporal_layers == 1 && !cpi->ppi->rtc_ref.set_ref_frame_config; } // Check if postencode drop is allowed. -static INLINE int allow_postencode_drop_rtc(const AV1_COMP *cpi) { +static inline int allow_postencode_drop_rtc(const AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; return is_one_pass_rt_params(cpi) && cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->oxcf.rc_cfg.drop_frames_water_mark > 0 && @@ -4124,14 +4124,14 @@ static INLINE int allow_postencode_drop_rtc(const AV1_COMP *cpi) { } // Function return size of frame stats buffer -static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) { +static inline int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) { /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */ return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer); } // TODO(zoeliu): To set up cpi->oxcf.gf_cfg.enable_auto_brf -static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd, +static inline void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) { xd->block_ref_scale_factors[0] = @@ -4140,18 +4140,18 @@ static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd, get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1); } -static INLINE int get_chessboard_index(int frame_index) { +static inline int get_chessboard_index(int frame_index) { return frame_index & 0x1; } -static INLINE const int *cond_cost_list_const(const struct AV1_COMP *cpi, +static inline const int *cond_cost_list_const(const struct AV1_COMP *cpi, const int *cost_list) { const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE && cpi->sf.mv_sf.use_fullpel_costlist; return use_cost_list ? cost_list : NULL; } -static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) { +static inline int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) { const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE && cpi->sf.mv_sf.use_fullpel_costlist; return use_cost_list ? cost_list : NULL; @@ -4168,26 +4168,26 @@ void av1_setup_frame_size(AV1_COMP *cpi); #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) // Returns 1 if a frame is scaled and 0 otherwise. -static INLINE int av1_resize_scaled(const AV1_COMMON *cm) { +static inline int av1_resize_scaled(const AV1_COMMON *cm) { return cm->superres_upscaled_width != cm->render_width || cm->superres_upscaled_height != cm->render_height; } -static INLINE int av1_frame_scaled(const AV1_COMMON *cm) { +static inline int av1_frame_scaled(const AV1_COMMON *cm) { return av1_superres_scaled(cm) || av1_resize_scaled(cm); } // Don't allow a show_existing_frame to coincide with an error resilient // frame. An exception can be made for a forward keyframe since it has no // previous dependencies. -static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) { +static inline int encode_show_existing_frame(const AV1_COMMON *cm) { return cm->show_existing_frame && (!cm->features.error_resilient_mode || cm->current_frame.frame_type == KEY_FRAME); } // Get index into the 'cpi->mbmi_ext_info.frame_base' array for the given // 'mi_row' and 'mi_col'. 
-static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col, +static inline int get_mi_ext_idx(const int mi_row, const int mi_col, const BLOCK_SIZE mi_alloc_bsize, const int mbmi_ext_stride) { const int mi_ext_size_1d = mi_size_wide[mi_alloc_bsize]; @@ -4198,7 +4198,7 @@ static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col, // Lighter version of set_offsets that only sets the mode info // pointers. -static INLINE void set_mode_info_offsets( +static inline void set_mode_info_offsets( const CommonModeInfoParams *const mi_params, const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x, MACROBLOCKD *const xd, int mi_row, int mi_col) { @@ -4211,7 +4211,7 @@ static INLINE void set_mode_info_offsets( // Check to see if the given partition size is allowed for a specified number // of mi block rows and columns remaining in the image. // If not then return the largest allowed partition size -static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, +static inline BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, int cols_left, int *bh, int *bw) { int int_size = (int)bsize; if (rows_left <= 0 || cols_left <= 0) { @@ -4252,7 +4252,7 @@ static const MV_REFERENCE_FRAME ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME, }; -static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf, +static inline int get_ref_frame_flags(const SPEED_FEATURES *const sf, const int use_one_pass_rt_params, const YV12_BUFFER_CONFIG **ref_frames, const int ext_ref_frame_flags) { @@ -4299,14 +4299,14 @@ aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi); #define MAX_GFUBOOST_FACTOR 10.0 #define MIN_GFUBOOST_FACTOR 4.0 -static INLINE int is_frame_tpl_eligible(const GF_GROUP *const gf_group, +static inline int is_frame_tpl_eligible(const GF_GROUP *const gf_group, uint8_t index) { const FRAME_UPDATE_TYPE update_type = gf_group->update_type[index]; return update_type == ARF_UPDATE || update_type == GF_UPDATE || update_type == KF_UPDATE; } -static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group, +static inline int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group, int selective_ref_frame, int prune_ref_frames, int gf_index) { @@ -4315,12 +4315,12 @@ static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group, } // Get update type of the current frame. -static INLINE FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group, +static inline FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group, int gf_frame_index) { return gf_group->update_type[gf_frame_index]; } -static INLINE int av1_pixels_to_mi(int pixels) { +static inline int av1_pixels_to_mi(int pixels) { return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2; } @@ -4331,7 +4331,7 @@ static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) { cm->show_frame && !cpi->is_dropped_frame; } -static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) { +static inline int is_frame_resize_pending(const AV1_COMP *const cpi) { const ResizePendingParams *const resize_pending_params = &cpi->resize_pending_params; return (resize_pending_params->width && resize_pending_params->height && @@ -4340,18 +4340,18 @@ static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) { } // Check if loop filter is used. 
-static INLINE int is_loopfilter_used(const AV1_COMMON *const cm) { +static inline int is_loopfilter_used(const AV1_COMMON *const cm) { return !cm->features.coded_lossless && !cm->tiles.large_scale; } // Check if CDEF is used. -static INLINE int is_cdef_used(const AV1_COMMON *const cm) { +static inline int is_cdef_used(const AV1_COMMON *const cm) { return cm->seq_params->enable_cdef && !cm->features.coded_lossless && !cm->tiles.large_scale; } // Check if loop restoration filter is used. -static INLINE int is_restoration_used(const AV1_COMMON *const cm) { +static inline int is_restoration_used(const AV1_COMMON *const cm) { return cm->seq_params->enable_restoration && !cm->features.all_lossless && !cm->tiles.large_scale; } @@ -4361,7 +4361,7 @@ static INLINE int is_restoration_used(const AV1_COMMON *const cm) { // filters on the reconstructed frame can be skipped at the encoder side. // However the computation of different filter parameters that are signaled in // the bitstream is still required. -static INLINE unsigned int derive_skip_apply_postproc_filters( +static inline unsigned int derive_skip_apply_postproc_filters( const AV1_COMP *cpi, int use_loopfilter, int use_cdef, int use_superres, int use_restoration) { // Though CDEF parameter selection should be dependent on @@ -4401,7 +4401,7 @@ static INLINE unsigned int derive_skip_apply_postproc_filters( return 0; } -static INLINE void set_postproc_filter_default_params(AV1_COMMON *cm) { +static inline void set_postproc_filter_default_params(AV1_COMMON *cm) { struct loopfilter *const lf = &cm->lf; CdefInfo *const cdef_info = &cm->cdef_info; RestorationInfo *const rst_info = cm->rst_info; @@ -4417,13 +4417,13 @@ static INLINE void set_postproc_filter_default_params(AV1_COMMON *cm) { rst_info[2].frame_restoration_type = RESTORE_NONE; } -static INLINE int is_inter_tx_size_search_level_one( +static inline int is_inter_tx_size_search_level_one( const TX_SPEED_FEATURES *tx_sf) { return (tx_sf->inter_tx_size_search_init_depth_rect >= 1 && tx_sf->inter_tx_size_search_init_depth_sqr >= 1); } -static INLINE int get_lpf_opt_level(const SPEED_FEATURES *sf) { +static inline int get_lpf_opt_level(const SPEED_FEATURES *sf) { int lpf_opt_level = 0; if (is_inter_tx_size_search_level_one(&sf->tx_sf)) lpf_opt_level = (sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 
2 : 1; @@ -4431,13 +4431,13 @@ static INLINE int get_lpf_opt_level(const SPEED_FEATURES *sf) { } // Enable switchable motion mode only if warp and OBMC tools are allowed -static INLINE bool is_switchable_motion_mode_allowed(bool allow_warped_motion, +static inline bool is_switchable_motion_mode_allowed(bool allow_warped_motion, bool enable_obmc) { return (allow_warped_motion || enable_obmc); } #if CONFIG_AV1_TEMPORAL_DENOISING -static INLINE int denoise_svc(const struct AV1_COMP *const cpi) { +static inline int denoise_svc(const struct AV1_COMP *const cpi) { return (!cpi->ppi->use_svc || (cpi->ppi->use_svc && cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); @@ -4445,7 +4445,7 @@ static INLINE int denoise_svc(const struct AV1_COMP *const cpi) { #endif #if CONFIG_COLLECT_PARTITION_STATS == 2 -static INLINE void av1_print_fr_partition_timing_stats( +static inline void av1_print_fr_partition_timing_stats( const FramePartitionTimingStats *part_stats, const char *filename) { FILE *f = fopen(filename, "w"); if (!f) { @@ -4484,7 +4484,7 @@ static INLINE void av1_print_fr_partition_timing_stats( #endif // CONFIG_COLLECT_PARTITION_STATS == 2 #if CONFIG_COLLECT_PARTITION_STATS -static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) { +static inline int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) { assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 || bsize == BLOCK_4X4); @@ -4501,15 +4501,15 @@ static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) { #endif // CONFIG_COLLECT_PARTITION_STATS #if CONFIG_COLLECT_COMPONENT_TIMING -static INLINE void start_timing(AV1_COMP *cpi, int component) { +static inline void start_timing(AV1_COMP *cpi, int component) { aom_usec_timer_start(&cpi->component_timer[component]); } -static INLINE void end_timing(AV1_COMP *cpi, int component) { +static inline void end_timing(AV1_COMP *cpi, int component) { aom_usec_timer_mark(&cpi->component_timer[component]); cpi->frame_component_time[component] += aom_usec_timer_elapsed(&cpi->component_timer[component]); } -static INLINE char const *get_frame_type_enum(int type) { +static inline char const *get_frame_type_enum(int type) { switch (type) { case 0: return "KEY_FRAME"; case 1: return "INTER_FRAME"; diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c index 32d67463fb..0f188e1e38 100644 --- a/av1/encoder/encodetxb.c +++ b/av1/encoder/encodetxb.c @@ -242,7 +242,7 @@ static void update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, } } -static INLINE int get_nz_map_ctx(const uint8_t *const levels, +static inline int get_nz_map_ctx(const uint8_t *const levels, const int coeff_idx, const int bhl, const int width, const int scan_idx, const int is_eob, const TX_SIZE tx_size, diff --git a/av1/encoder/encodetxb.h b/av1/encoder/encodetxb.h index b9c2031027..5e711e522c 100644 --- a/av1/encoder/encodetxb.h +++ b/av1/encoder/encodetxb.h @@ -251,7 +251,7 @@ CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, * \param[in] plane The index of the current plane * \param[in] tx_size The transform size */ -static INLINE int av1_cost_skip_txb(const CoeffCosts *coeff_costs, +static inline int av1_cost_skip_txb(const CoeffCosts *coeff_costs, const TXB_CTX *const txb_ctx, int plane, TX_SIZE tx_size) { const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c index 9d151cb254..c497eeba60 100644 --- 
a/av1/encoder/firstpass.c +++ b/av1/encoder/firstpass.c @@ -399,7 +399,7 @@ typedef struct intra_pred_block_pass1_args { MACROBLOCK *x; } intra_pred_block_pass1_args; -static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src, +static inline void copy_rect(uint8_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height, int use_hbd) { #if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h index 75ea618d24..0832494ee1 100644 --- a/av1/encoder/firstpass.h +++ b/av1/encoder/firstpass.h @@ -548,13 +548,13 @@ struct EncodeFrameParams; struct AV1EncoderConfig; struct TileDataEnc; -static INLINE int is_fp_wavelet_energy_invalid( +static inline int is_fp_wavelet_energy_invalid( const FIRSTPASS_STATS *fp_stats) { assert(fp_stats != NULL); return (fp_stats->frame_avg_wavelet_energy < 0); } -static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) { +static inline BLOCK_SIZE get_fp_block_size(int is_screen_content_type) { return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16); } diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c index 7a98c8c183..4d216d051e 100644 --- a/av1/encoder/global_motion.c +++ b/av1/encoder/global_motion.c @@ -110,7 +110,7 @@ static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE int generic_sad_highbd(const uint16_t *const ref, int ref_stride, +static inline int generic_sad_highbd(const uint16_t *const ref, int ref_stride, const uint16_t *const dst, int dst_stride, int p_width, int p_height) { // This function should only be called for patches smaller than @@ -215,7 +215,7 @@ static int64_t highbd_warp_error(WarpedMotionParams *wm, } #endif -static INLINE int generic_sad(const uint8_t *const ref, int ref_stride, +static inline int generic_sad(const uint8_t *const ref, int ref_stride, const uint8_t *const dst, int dst_stride, int p_width, int p_height) { // This function should only be called for patches smaller than diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c index 54f2c7f97e..4b7fec5b27 100644 --- a/av1/encoder/hybrid_fwd_txfm.c +++ b/av1/encoder/hybrid_fwd_txfm.c @@ -313,7 +313,7 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, +static inline void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { switch (tx_size) { @@ -333,7 +333,7 @@ static INLINE void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, } #endif // CONFIG_AV1_HIGHBITDEPTH -static INLINE void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, +static inline void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { switch (tx_size) { case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break; diff --git a/av1/encoder/interp_search.c b/av1/encoder/interp_search.c index d64cae45dc..8011f3c0d1 100644 --- a/av1/encoder/interp_search.c +++ b/av1/encoder/interp_search.c @@ -16,7 +16,7 @@ #include "av1/encoder/reconinter_enc.h" // return mv_diff -static INLINE int is_interp_filter_good_match( +static inline int is_interp_filter_good_match( const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi, int skip_level) { const int is_comp = has_second_ref(mi); @@ -39,7 +39,7 @@ static INLINE int is_interp_filter_good_match( return mv_diff; } 
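// Illustrative sketch, not part of this patch: the generic_sad() helpers
// renamed in the hunk above compute a plain sum of absolute differences over
// a p_width x p_height patch. A minimal stand-alone version (hypothetical,
// for reference only) could look like this:
#include <stdint.h>
#include <stdlib.h>
static int generic_sad_sketch(const uint8_t *ref, int ref_stride,
                              const uint8_t *dst, int dst_stride, int p_width,
                              int p_height) {
  int sad = 0;
  for (int i = 0; i < p_height; ++i) {
    for (int j = 0; j < p_width; ++j)
      sad += abs(ref[i * ref_stride + j] - dst[i * dst_stride + j]);
  }
  return sad;
}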
-static INLINE int save_interp_filter_search_stat( +static inline int save_interp_filter_search_stat( MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse, INTERPOLATION_FILTER_STATS *interp_filter_stats, int interp_filter_stats_idx) { @@ -58,7 +58,7 @@ static INLINE int save_interp_filter_search_stat( return interp_filter_stats_idx; } -static INLINE int find_interp_filter_in_stats( +static inline int find_interp_filter_in_stats( MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats, int interp_filter_stats_idx, int skip_level) { // [skip_levels][single or comp] @@ -106,7 +106,7 @@ int av1_find_interp_filter_match( return match_found_idx; } -static INLINE int get_switchable_rate(MACROBLOCK *const x, +static inline int get_switchable_rate(MACROBLOCK *const x, const int_interpfilters filters, const int ctx[2], int dual_filter) { const InterpFilter filter0 = filters.as_filters.y_filter; @@ -121,7 +121,7 @@ static INLINE int get_switchable_rate(MACROBLOCK *const x, // Build inter predictor and calculate model rd // for a given plane. -static INLINE void interp_model_rd_eval( +static inline void interp_model_rd_eval( MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int plane_from, int plane_to, RD_STATS *rd_stats, int is_skip_build_pred) { @@ -149,7 +149,7 @@ static INLINE void interp_model_rd_eval( } // calculate the rdcost of given interpolation_filter -static INLINE int64_t interpolation_filter_rd( +static inline int64_t interpolation_filter_rd( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t *const rd, @@ -266,7 +266,7 @@ static INLINE int64_t interpolation_filter_rd( return 0; } -static INLINE INTERP_PRED_TYPE is_pred_filter_search_allowed( +static inline INTERP_PRED_TYPE is_pred_filter_search_allowed( const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize, int_interpfilters *af, int_interpfilters *lf) { const AV1_COMMON *cm = &cpi->common; @@ -335,7 +335,7 @@ static DUAL_FILTER_TYPE find_best_interp_rd_facade( return best_filt_type; } -static INLINE void pred_dual_interp_filter_rd( +static inline void pred_dual_interp_filter_rd( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, @@ -372,7 +372,7 @@ static INLINE void pred_dual_interp_filter_rd( // a) Using above, left block interp filter // b) Find the best horizontal filter and // then evaluate corresponding vertical filters. 
-static INLINE void fast_dual_interp_filter_rd( +static inline void fast_dual_interp_filter_rd( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, @@ -425,7 +425,7 @@ static INLINE void fast_dual_interp_filter_rd( } // Find the best interp filter if dual_interp_filter = 0 -static INLINE void find_best_non_dual_interp_filter( +static inline void find_best_non_dual_interp_filter( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, @@ -562,7 +562,7 @@ static INLINE void find_best_non_dual_interp_filter( } } -static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x, +static inline void calc_interp_skip_pred_flag(MACROBLOCK *const x, const AV1_COMP *const cpi, int *skip_hor, int *skip_ver) { const AV1_COMMON *cm = &cpi->common; diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c index c4f914f4b6..3e9ae6bac3 100644 --- a/av1/encoder/intra_mode_search.c +++ b/av1/encoder/intra_mode_search.c @@ -1213,7 +1213,7 @@ static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, * * \remark Returns nothing, but updates the mbmi and rd_stats. */ -static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x, +static inline void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, int mode_cost, diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h index 6dc9826e95..7940757213 100644 --- a/av1/encoder/intra_mode_search_utils.h +++ b/av1/encoder/intra_mode_search_utils.h @@ -403,7 +403,7 @@ static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x, normalize_hog(total, hist); } -static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, +static inline void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, int plane, float *hog) { const MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; diff --git a/av1/encoder/level.h b/av1/encoder/level.h index 9c1dadd211..d9d642fb04 100644 --- a/av1/encoder/level.h +++ b/av1/encoder/level.h @@ -166,7 +166,7 @@ typedef struct AV1LevelParams { AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS]; } AV1LevelParams; -static INLINE int is_in_operating_point(int operating_point, +static inline int is_in_operating_point(int operating_point, int temporal_layer_id, int spatial_layer_id) { if (!operating_point) return 1; diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c index 94fd17e4fa..cfe1dddebf 100644 --- a/av1/encoder/mcomp.c +++ b/av1/encoder/mcomp.c @@ -32,7 +32,7 @@ #include "av1/encoder/rdopt.h" #include "av1/encoder/reconinter_enc.h" -static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params, +static inline void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params, const MvCosts *mv_costs, const MV *ref_mv, int errorperbit, int sadperbit) { @@ -51,7 +51,7 @@ static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params, } } -static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) { +static inline void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) { ms_buffers->ref = &x->e_mbd.plane[0].pre[0]; ms_buffers->src = &x->plane[0].src; @@ -250,7 +250,7 @@ int av1_init_search_range(int size) { // 
joint_cost and comp_cost. joint_costs covers the cost of transmitting // JOINT_MV, and comp_cost covers the cost of transmitting the actual motion // vector. -static INLINE int mv_cost(const MV *mv, const int *joint_cost, +static inline int mv_cost(const MV *mv, const int *joint_cost, const int *const comp_cost[2]) { return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] + comp_cost[1][mv->col]; @@ -271,7 +271,7 @@ int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, // Returns the cost of using the current mv during the motion search. This is // used when var is used as the error metric. #define PIXEL_TRANSFORM_ERROR_SCALE 4 -static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv, +static inline int mv_err_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, const int *const mvcost[2], int error_per_bit, MV_COST_TYPE mv_cost_type) { const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; @@ -297,7 +297,7 @@ static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv, } } -static INLINE int mv_err_cost_(const MV *mv, +static inline int mv_err_cost_(const MV *mv, const MV_COST_PARAMS *mv_cost_params) { if (mv_cost_params->mv_cost_type == MV_COST_NONE) { return 0; @@ -310,7 +310,7 @@ static INLINE int mv_err_cost_(const MV *mv, // Returns the cost of using the current mv during the motion search. This is // only used during full pixel motion search when sad is used as the error // metric -static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv, +static inline int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv, const int *mvjcost, const int *const mvcost[2], int sad_per_bit, MV_COST_TYPE mv_cost_type) { const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row), @@ -333,7 +333,7 @@ static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv, } } -static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv, +static inline int mvsad_err_cost_(const FULLPEL_MV *mv, const MV_COST_PARAMS *mv_cost_params) { return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv, mv_cost_params->mvjcost, mv_cost_params->mvcost, @@ -645,7 +645,7 @@ const av1_init_search_site_config }; // Checks whether the mv is within range of the mv_limits -static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col, +static inline int check_bounds(const FullMvLimits *mv_limits, int row, int col, int range) { return ((row - range) >= mv_limits->row_min) & ((row + range) <= mv_limits->row_max) & @@ -653,7 +653,7 @@ static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col, ((col + range) <= mv_limits->col_max); } -static INLINE int get_mvpred_var_cost( +static inline int get_mvpred_var_cost( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv, FULLPEL_MV_STATS *mv_stats) { const aom_variance_fn_ptr_t *vfp = ms_params->vfp; @@ -676,7 +676,7 @@ static INLINE int get_mvpred_var_cost( return bestsme; } -static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, +static inline int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct buf_2d *const src, const uint8_t *const ref_address, const int ref_stride) { @@ -686,7 +686,7 @@ static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride); } -static INLINE int get_mvpred_compound_var_cost( +static inline int get_mvpred_compound_var_cost( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV 
*this_mv, FULLPEL_MV_STATS *mv_stats) { const aom_variance_fn_ptr_t *vfp = ms_params->vfp; @@ -722,7 +722,7 @@ static INLINE int get_mvpred_compound_var_cost( return bestsme; } -static INLINE int get_mvpred_compound_sad( +static inline int get_mvpred_compound_sad( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct buf_2d *const src, const uint8_t *const ref_address, const int ref_stride) { @@ -1476,7 +1476,7 @@ static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad, #undef UPDATE_SEARCH_STEP } -static INLINE unsigned int get_start_mvpred_sad_cost( +static inline unsigned int get_start_mvpred_sad_cost( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv) { const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; @@ -2269,7 +2269,7 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, // ============================================================================= // Fullpixel Motion Search: OBMC // ============================================================================= -static INLINE int get_obmc_mvpred_var( +static inline int get_obmc_mvpred_var( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { const aom_variance_fn_ptr_t *vfp = ms_params->vfp; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; @@ -2455,11 +2455,11 @@ int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, */ // Returns the subpel offset used by various subpel variance functions [m]sv[a]f -static INLINE int get_subpel_part(int x) { return x & 7; } +static inline int get_subpel_part(int x) { return x & 7; } // Gets the address of the ref buffer at subpel location (r, c), rounded to the // nearest fullpel precision toward - \infty -static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, +static inline const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV mv) { const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3); return &buf->buf[offset]; @@ -2467,7 +2467,7 @@ static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, // Estimates the variance of prediction residue using a bilinear filter for fast // search. -static INLINE int estimated_pref_error( +static inline int estimated_pref_error( const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, unsigned int *sse) { const aom_variance_fn_ptr_t *vfp = var_params->vfp; @@ -2592,7 +2592,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, // Estimates whether this_mv is better than best_mv. This function takes // both prediction error and residue into account. It is suffixed "fast" because // it uses a bilinear filter to estimate the prediction. 
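// Worked example (illustrative, not part of this patch): the MVs here carry
// three fractional (1/8-pel) bits, so get_subpel_part() and get_buf_from_mv()
// above split each component into a fullpel offset and a subpel phase:
//   row = 21: fullpel 21 >> 3 == 2,  phase 21 & 7 == 5
//   col = -9: fullpel -9 >> 3 == -2 (toward -infinity), phase -9 & 7 == 7
// The buffer pointer therefore advances by 2 * stride - 2, and the remaining
// 5/8- and 7/8-pel phases are handled by the subpel variance functions.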
-static INLINE unsigned int check_better_fast( +static inline unsigned int check_better_fast( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, @@ -2649,7 +2649,7 @@ static AOM_FORCE_INLINE unsigned int check_better( return cost; } -static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost, +static inline MV get_best_diag_step(int step_size, unsigned int left_cost, unsigned int right_cost, unsigned int up_cost, unsigned int down_cost) { @@ -2949,11 +2949,11 @@ static unsigned int upsampled_setup_center_error( return besterr; } -static INLINE int divide_and_round(int n, int d) { +static inline int divide_and_round(int n, int d) { return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); } -static INLINE int is_cost_list_wellbehaved(const int *cost_list) { +static inline int is_cost_list_wellbehaved(const int *cost_list) { return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] && cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4]; } @@ -2977,7 +2977,7 @@ static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic, // Checks the list of mvs searched in the last iteration and sees if we are // repeating it. If so, return 1. Otherwise we update the last_mv_search_list // with current_mv and return 0. -static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list, +static inline int check_repeated_mv_and_update(int_mv *last_mv_search_list, const MV current_mv, int iter) { if (last_mv_search_list) { if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) { @@ -3388,7 +3388,7 @@ int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, // during motion_mode_rd. We are going through the whole // av1_enc_build_inter_predictor because we might have changed the interpolation // filter, etc before motion_mode_rd is called. -static INLINE unsigned int compute_motion_cost( +static inline unsigned int compute_motion_cost( MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize, const MV *this_mv) { @@ -3595,7 +3595,7 @@ unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, // Subpixel Motion Search: OBMC // ============================================================================= // Estimates the variance of prediction residue -static INLINE int estimate_obmc_pref_error( +static inline int estimate_obmc_pref_error( const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, unsigned int *sse) { const aom_variance_fn_ptr_t *vfp = var_params->vfp; @@ -3689,7 +3689,7 @@ static unsigned int upsampled_setup_obmc_center_error( // Estimates the variance of prediction residue // TODO(chiyotsai@google.com): the cost does not match the cost in // mv_cost_. Investigate this later. -static INLINE int estimate_obmc_mvcost(const MV *this_mv, +static inline int estimate_obmc_mvcost(const MV *this_mv, const MV_COST_PARAMS *mv_cost_params) { const MV *ref_mv = mv_cost_params->ref_mv; const int *mvjcost = mv_cost_params->mvjcost; @@ -3715,7 +3715,7 @@ static INLINE int estimate_obmc_mvcost(const MV *this_mv, // Estimates whether this_mv is better than best_mv. This function takes // both prediction error and residue into account. 
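// Worked example (illustrative, not part of this patch): divide_and_round()
// above biases the numerator by half the denominator before truncating, i.e.
// it rounds to nearest, with halves away from zero:
//   divide_and_round(7, 2)  -> (7 + 1) / 2  == 4
//   divide_and_round(-7, 2) -> (-7 - 1) / 2 == -4  (plain C division gives -3)
//   divide_and_round(5, 4)  -> (5 + 2) / 4  == 1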
-static INLINE unsigned int obmc_check_better_fast( +static inline unsigned int obmc_check_better_fast( const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, @@ -3743,7 +3743,7 @@ static INLINE unsigned int obmc_check_better_fast( // Estimates whether this_mv is better than best_mv. This function takes // both prediction error and residue into account. -static INLINE unsigned int obmc_check_better( +static inline unsigned int obmc_check_better( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, @@ -3952,7 +3952,7 @@ int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, return sse + mv_err_cost_(&mv, mv_cost_params); } -static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params, +static inline int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, const uint8_t *second_pred, const aom_variance_fn_ptr_t *vfp, @@ -3966,7 +3966,7 @@ static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params, mv_err_cost_(&mv, mv_cost_params); } -static INLINE int get_mvpred_mask_var( +static inline int get_mvpred_mask_var( const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src, diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h index 7dd32e0a71..e91e15c0b6 100644 --- a/av1/encoder/mcomp.h +++ b/av1/encoder/mcomp.h @@ -83,7 +83,7 @@ typedef struct { const int32_t *obmc_mask; } MSBuffers; -static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers, +static inline void av1_set_ms_compound_refs(MSBuffers *ms_buffers, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int invert_mask) { @@ -214,7 +214,7 @@ static AOM_INLINE void av1_refresh_search_site_config( } // MVs beyond the range do not produce a new/different prediction block. -static INLINE void av1_set_mv_search_method( +static inline void av1_set_mv_search_method( FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS], SEARCH_METHODS search_method) { @@ -225,7 +225,7 @@ static INLINE void av1_set_mv_search_method( // Set up limit values for MV components. // MVs beyond the range do not produce a new/different prediction block. 
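// Sketch (illustrative, not part of this patch): the row/col limits set up by
// the helpers below form a box around the allowed fullpel search area, and a
// candidate MV is accepted only when both components fall inside it, mirroring
// av1_is_fullmv_in_range() later in this header. A hypothetical stand-alone
// version of that check:
typedef struct { int col_min, col_max, row_min, row_max; } MvLimitsSketch;
static int fullmv_in_range_sketch(const MvLimitsSketch *lim, int row, int col) {
  return (col >= lim->col_min) && (col <= lim->col_max) &&
         (row >= lim->row_min) && (row <= lim->row_max);
}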
-static INLINE void av1_set_mv_row_limits( +static inline void av1_set_mv_row_limits( const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits, int mi_row, int mi_height, int border) { const int min1 = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND); @@ -238,7 +238,7 @@ static INLINE void av1_set_mv_row_limits( mv_limits->row_max = AOMMIN(max1, max2); } -static INLINE void av1_set_mv_col_limits( +static inline void av1_set_mv_col_limits( const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits, int mi_col, int mi_width, int border) { const int min1 = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND); @@ -251,7 +251,7 @@ static INLINE void av1_set_mv_col_limits( mv_limits->col_max = AOMMIN(max1, max2); } -static INLINE void av1_set_mv_limits( +static inline void av1_set_mv_limits( const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits, int mi_row, int mi_col, int mi_height, int mi_width, int border) { av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border); @@ -288,7 +288,7 @@ int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int step_param, FULLPEL_MV *best_mv); -static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits, +static inline int av1_is_fullmv_in_range(const FullMvLimits *mv_limits, FULLPEL_MV mv) { return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); @@ -355,13 +355,13 @@ unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, WARP_SEARCH_METHOD search_method, int num_iterations); -static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) { +static inline void av1_set_fractional_mv(int_mv *fractional_best_mv) { for (int z = 0; z < 3; z++) { fractional_best_mv[z].as_int = INVALID_MV; } } -static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits, +static inline void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits, const FullMvLimits *mv_limits, const MV *ref_mv) { const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL); @@ -379,17 +379,17 @@ static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits, subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr); } -static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits, +static inline int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits, MV mv) { return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); } -static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) { +static inline int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) { return mv->row * stride + mv->col; } -static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf, +static inline const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf, const FULLPEL_MV *mv) { return &buf->buf[get_offset_from_fullmv(mv, buf->stride)]; } diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c index e36f7e8e9c..b03ed34c32 100644 --- a/av1/encoder/motion_search_facade.c +++ b/av1/encoder/motion_search_facade.c @@ -45,7 +45,7 @@ static int use_fine_search_interval(const AV1_COMP *const cpi) { } // Iterate through the tpl and collect the mvs to be used as candidates -static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi, +static inline void get_mv_candidate_from_tpl(const AV1_COMP *const cpi, const 
MACROBLOCK *x, BLOCK_SIZE bsize, int ref, cand_mv_t *cand, int *cand_count, diff --git a/av1/encoder/nonrd_opt.c b/av1/encoder/nonrd_opt.c index bcda2f0799..909c67ca38 100644 --- a/av1/encoder/nonrd_opt.c +++ b/av1/encoder/nonrd_opt.c @@ -57,7 +57,7 @@ static AOM_FORCE_INLINE void update_yrd_loop_vars( this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2; } -static INLINE void aom_process_hadamard_lp_8x16(MACROBLOCK *x, +static inline void aom_process_hadamard_lp_8x16(MACROBLOCK *x, int max_blocks_high, int max_blocks_wide, int num_4x4_w, int step, @@ -562,7 +562,7 @@ static void compute_intra_yprediction(const AV1_COMMON *cm, // Checks whether Intra mode needs to be pruned based on // 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_blksad' // speed features. -static INLINE bool is_prune_intra_mode( +static inline bool is_prune_intra_mode( AV1_COMP *cpi, int mode_index, int force_intra_check, BLOCK_SIZE bsize, uint8_t segment_id, SOURCE_SAD source_sad_nonrd, uint8_t color_sensitivity[MAX_MB_PLANE - 1]) { diff --git a/av1/encoder/nonrd_opt.h b/av1/encoder/nonrd_opt.h index eae0be059b..37661ed480 100644 --- a/av1/encoder/nonrd_opt.h +++ b/av1/encoder/nonrd_opt.h @@ -392,7 +392,7 @@ DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_16x16[256]) = { }; // Indicates the blocks for which RD model should be based on special logic -static INLINE int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd, +static inline int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const int large_block = bsize >= BLOCK_32X32; @@ -426,7 +426,7 @@ static INLINE int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd, * \remark Nothing is returned. Instead, predicted MVs are placed into * \c frame_mv array, and use_scaled_ref_frame is set. 
*/ -static INLINE void find_predictors( +static inline void find_predictors( AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize, @@ -477,7 +477,7 @@ static INLINE void find_predictors( *use_scaled_ref_frame = ref_is_scaled && scaled_ref; } -static INLINE void init_mbmi_nonrd(MB_MODE_INFO *mbmi, +static inline void init_mbmi_nonrd(MB_MODE_INFO *mbmi, PREDICTION_MODE pred_mode, MV_REFERENCE_FRAME ref_frame0, MV_REFERENCE_FRAME ref_frame1, @@ -498,7 +498,7 @@ static INLINE void init_mbmi_nonrd(MB_MODE_INFO *mbmi, set_default_interp_filters(mbmi, cm->features.interp_filter); } -static INLINE void init_estimate_block_intra_args( +static inline void init_estimate_block_intra_args( struct estimate_block_intra_args *args, AV1_COMP *cpi, MACROBLOCK *x) { args->cpi = cpi; args->x = x; @@ -509,7 +509,7 @@ static INLINE void init_estimate_block_intra_args( args->prune_mode_based_on_sad = false; } -static INLINE int get_pred_buffer(PRED_BUFFER *p, int len) { +static inline int get_pred_buffer(PRED_BUFFER *p, int len) { for (int buf_idx = 0; buf_idx < len; buf_idx++) { if (!p[buf_idx].in_use) { p[buf_idx].in_use = 1; @@ -519,16 +519,16 @@ static INLINE int get_pred_buffer(PRED_BUFFER *p, int len) { return -1; } -static INLINE void free_pred_buffer(PRED_BUFFER *p) { +static inline void free_pred_buffer(PRED_BUFFER *p) { if (p != NULL) p->in_use = 0; } #if CONFIG_INTERNAL_STATS -static INLINE void store_coding_context_nonrd(MACROBLOCK *x, +static inline void store_coding_context_nonrd(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index) { #else -static INLINE void store_coding_context_nonrd(MACROBLOCK *x, +static inline void store_coding_context_nonrd(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { #endif // CONFIG_INTERNAL_STATS MACROBLOCKD *const xd = &x->e_mbd; diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 7fdc546b2c..a856c95941 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -27,7 +27,7 @@ #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/var_based_part.h" -static INLINE int early_term_inter_search_with_sse(int early_term_idx, +static inline int early_term_inter_search_with_sse(int early_term_idx, BLOCK_SIZE bsize, int64_t this_sse, int64_t best_sse, @@ -57,7 +57,7 @@ static INLINE int early_term_inter_search_with_sse(int early_term_idx, return 0; } -static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { +static inline void init_best_pickmode(BEST_PICKMODE *bp) { bp->best_sse = INT64_MAX; bp->best_mode = NEARESTMV; bp->best_ref_frame = LAST_FRAME; @@ -75,7 +75,7 @@ static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { } // Copy best inter mode parameters to best_pickmode -static INLINE void update_search_state_nonrd( +static inline void update_search_state_nonrd( InterModeSearchStateNonrd *search_state, MB_MODE_INFO *const mi, TxfmSearchInfo *txfm_info, RD_STATS *nonskip_rdc, PICK_MODE_CONTEXT *ctx, PREDICTION_MODE this_best_mode, const int64_t sse_y) { @@ -99,7 +99,7 @@ static INLINE void update_search_state_nonrd( } } -static INLINE int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, +static inline int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *mv, MV ref_mv, FULLPEL_MV start_mv, bool fullpel_performed_well) { const int frame_lowmotion = cpi->rc.avg_frame_low_motion; @@ -402,7 +402,7 @@ static void estimate_single_ref_frame_costs(const AV1_COMMON *cm, } } -static INLINE 
void set_force_skip_flag(const AV1_COMP *const cpi, +static inline void set_force_skip_flag(const AV1_COMP *const cpi, MACROBLOCK *const x, unsigned int sse, int *force_skip) { if (x->txfm_search_params.tx_mode_search_type == TX_MODE_SELECT && @@ -570,7 +570,7 @@ static int ac_thr_factor(int speed, int width, int height, int norm_sum) { } // Sets early_term flag based on chroma planes prediction -static INLINE void set_early_term_based_on_uv_plane( +static inline void set_early_term_based_on_uv_plane( AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MACROBLOCKD *xd, int mi_row, int mi_col, int *early_term, int num_blk, const unsigned int *sse_tx, const unsigned int *var_tx, int sum, unsigned int var, unsigned int sse) { @@ -656,7 +656,7 @@ static INLINE void set_early_term_based_on_uv_plane( } } -static INLINE void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x, +static inline void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, int calculate_rd, int *early_term, BLOCK_SIZE bsize, @@ -899,7 +899,7 @@ static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize, rd_stats->dist = dist; } -static INLINE int get_drl_cost(PREDICTION_MODE this_mode, int ref_mv_idx, +static inline int get_drl_cost(PREDICTION_MODE this_mode, int ref_mv_idx, const MB_MODE_INFO_EXT *mbmi_ext, const int (*const drl_mode_cost0)[2], int8_t ref_frame_type) { @@ -1017,7 +1017,7 @@ static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode, } } -static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x, +static inline void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, diff --git a/av1/encoder/optical_flow.c b/av1/encoder/optical_flow.c index 015d07d614..9703e6a65c 100644 --- a/av1/encoder/optical_flow.c +++ b/av1/encoder/optical_flow.c @@ -35,12 +35,12 @@ void av1_init_lk_params(LK_PARAMS *lk_params) { } // Helper function to determine whether a frame is encoded with high bit-depth. -static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) { +static inline int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) { return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; } // Helper function to determine whether optical flow method is sparse. -static INLINE int is_sparse(const OPFL_PARAMS *opfl_params) { +static inline int is_sparse(const OPFL_PARAMS *opfl_params) { return (opfl_params->flags & OPFL_FLAG_SPARSE) ? 1 : 0; } diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h index b4a59ff24f..a0c428b5a2 100644 --- a/av1/encoder/palette.h +++ b/av1/encoder/palette.h @@ -51,7 +51,7 @@ void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int16_t *data, int16_t *centroids, * * \remark Returns nothing, but saves each data's cluster index in \a indices. */ -static INLINE void av1_calc_indices(const int16_t *data, +static inline void av1_calc_indices(const int16_t *data, const int16_t *centroids, uint8_t *indices, int n, int k, int dim) { assert(n > 0); @@ -85,7 +85,7 @@ static INLINE void av1_calc_indices(const int16_t *data, * * \attention The output centroids are rounded off to nearest integers. 
*/ -static INLINE void av1_k_means(const int16_t *data, int16_t *centroids, +static inline void av1_k_means(const int16_t *data, int16_t *centroids, uint8_t *indices, int n, int k, int dim, int max_itr) { assert(n > 0); diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c index d31780fde2..5033e5da4d 100644 --- a/av1/encoder/partition_search.c +++ b/av1/encoder/partition_search.c @@ -3206,7 +3206,7 @@ static void init_partition_block_timing_stats( av1_zero(*part_timing_stats); } -static INLINE void start_partition_block_timer( +static inline void start_partition_block_timer( PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type) { assert(!part_timing_stats->timer_is_on); part_timing_stats->partition_attempts[partition_type] += 1; @@ -3214,7 +3214,7 @@ static INLINE void start_partition_block_timer( part_timing_stats->timer_is_on = 1; } -static INLINE void end_partition_block_timer( +static inline void end_partition_block_timer( PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type, int64_t rdcost) { if (part_timing_stats->timer_is_on) { @@ -3225,7 +3225,7 @@ static INLINE void end_partition_block_timer( part_timing_stats->timer_is_on = 0; } } -static INLINE void print_partition_timing_stats_with_rdcost( +static inline void print_partition_timing_stats_with_rdcost( const PartitionTimingStats *part_timing_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, FRAME_UPDATE_TYPE frame_update_type, int frame_number, const RD_STATS *best_rdc, const char *filename) { @@ -3253,7 +3253,7 @@ static INLINE void print_partition_timing_stats_with_rdcost( fclose(f); } -static INLINE void print_partition_timing_stats( +static inline void print_partition_timing_stats( const PartitionTimingStats *part_timing_stats, int intra_only, int show_frame, const BLOCK_SIZE bsize, const char *filename) { FILE *f = fopen(filename, "a"); @@ -3271,7 +3271,7 @@ static INLINE void print_partition_timing_stats( fclose(f); } -static INLINE void accumulate_partition_timing_stats( +static inline void accumulate_partition_timing_stats( FramePartitionTimingStats *fr_part_timing_stats, const PartitionTimingStats *part_timing_stats, BLOCK_SIZE bsize) { const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize); @@ -3956,7 +3956,7 @@ static void rd_pick_4partition( } // Do not evaluate extended partitions if NONE partition is skippable. 
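// Usage sketch (illustrative, not part of this patch) for the palette helper
// av1_calc_indices() documented above; the sample values are hypothetical:
#include <stdint.h>
static void calc_indices_demo(void) {
  // Four 2-D samples followed by two 2-D centroids: each sample should be
  // assigned to its nearest centroid, so indices is expected to become
  // { 0, 0, 1, 1 }.
  const int16_t data[8] = { 10, 10, 12, 9, 250, 255, 248, 251 };
  const int16_t centroids[4] = { 11, 10, 251, 253 };
  uint8_t indices[4];
  av1_calc_indices(data, centroids, indices, /*n=*/4, /*k=*/2, /*dim=*/2);
}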
-static INLINE int prune_ext_part_none_skippable( +static inline int prune_ext_part_none_skippable( PICK_MODE_CONTEXT *part_none, int must_find_valid_partition, int skip_non_sq_part_based_on_none, BLOCK_SIZE bsize) { if ((skip_non_sq_part_based_on_none >= 1) && (part_none != NULL)) { @@ -6069,7 +6069,7 @@ static void duplicate_mode_info_in_sb(AV1_COMMON *cm, MACROBLOCKD *xd, for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi; } -static INLINE void copy_mbmi_ext_frame_to_mbmi_ext( +static inline void copy_mbmi_ext_frame_to_mbmi_ext( MB_MODE_INFO_EXT *const mbmi_ext, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, uint8_t ref_frame_type) { memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c index c6ae4fa473..1474270b65 100644 --- a/av1/encoder/partition_strategy.c +++ b/av1/encoder/partition_strategy.c @@ -78,7 +78,7 @@ static bool ext_ml_model_decision_after_part_ab( int *const partition_vert4_allowed, unsigned int pb_source_variance, int mi_row, int mi_col); -static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) { +static inline int convert_bsize_to_idx(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_128X128: return 0; case BLOCK_64X64: return 1; @@ -341,7 +341,7 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x, } } -static INLINE int get_simple_motion_search_prune_agg(int qindex, +static inline int get_simple_motion_search_prune_agg(int qindex, int prune_level, int is_rect_part) { assert(prune_level < TOTAL_AGG_LVLS); @@ -994,7 +994,7 @@ static AOM_INLINE void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree, } } -static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features, +static inline void add_rd_feature(int64_t rd, int64_t best_rd, float *features, int *feature_idx) { const int rd_valid = rd > 0 && rd < INT64_MAX; const float rd_ratio = rd_valid ? (float)rd / best_rd : 1.0f; @@ -2522,7 +2522,7 @@ void av1_prepare_motion_search_features_block( } #endif // !CONFIG_REALTIME_ONLY -static INLINE void init_simple_motion_search_mvs( +static inline void init_simple_motion_search_mvs( SIMPLE_MOTION_DATA_TREE *sms_tree, const FULLPEL_MV *start_mvs) { memcpy(sms_tree->start_mvs, start_mvs, sizeof(sms_tree->start_mvs)); av1_zero(sms_tree->sms_none_feat); diff --git a/av1/encoder/partition_strategy.h b/av1/encoder/partition_strategy.h index a7b1465b4e..288c1c3735 100644 --- a/av1/encoder/partition_strategy.h +++ b/av1/encoder/partition_strategy.h @@ -144,7 +144,7 @@ void av1_prepare_motion_search_features_block( // A simplified version of set_offsets meant to be used for // simple_motion_search. -static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi, +static inline void set_offsets_for_motion_search(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize) { @@ -191,7 +191,7 @@ void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi, SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, int mi_col); -static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params, +static inline int is_full_sb(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col, BLOCK_SIZE sb_size) { const int sb_mi_wide = mi_size_wide[sb_size]; const int sb_mi_high = mi_size_high[sb_size]; @@ -204,7 +204,7 @@ static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params, // Do not use this criterion for screen content videos. 
// Since screen content videos could often find good predictors and the largest // block size is likely to be used. -static INLINE int use_auto_max_partition(const AV1_COMP *const cpi, +static inline int use_auto_max_partition(const AV1_COMP *const cpi, BLOCK_SIZE sb_size, int mi_row, int mi_col) { assert(IMPLIES(cpi->ppi->gf_group.size > 0, diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c index a048901711..1a259c3e4d 100644 --- a/av1/encoder/pass2_strategy.c +++ b/av1/encoder/pass2_strategy.c @@ -967,7 +967,7 @@ static void allocate_gf_group_bits(GF_GROUP *gf_group, } // Returns true if KF group and GF group both are almost completely static. -static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion, +static inline int is_almost_static(double gf_zero_motion, int kf_zero_motion, int is_lap_enabled) { if (is_lap_enabled) { /* @@ -982,7 +982,7 @@ static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion, } #define ARF_ABS_ZOOM_THRESH 4.4 -static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start, +static inline int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start, int flash_detected, int active_max_gf_interval, int active_min_gf_interval, GF_GROUP_STATS *gf_stats) { @@ -2219,7 +2219,7 @@ static void define_gf_group_pass0(AV1_COMP *cpi) { } } -static INLINE void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc, +static inline void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc, int arf_position) { p_rc->baseline_gf_interval = arf_position; } @@ -3418,7 +3418,7 @@ static int get_section_target_bandwidth(AV1_COMP *cpi) { return (int)section_target_bandwidth; } -static INLINE void set_twopass_params_based_on_fp_stats( +static inline void set_twopass_params_based_on_fp_stats( AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) { if (this_frame_ptr == NULL) return; diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c index 5355ee8661..3b726243df 100644 --- a/av1/encoder/pickcdef.c +++ b/av1/encoder/pickcdef.c @@ -26,7 +26,7 @@ // Get primary and secondary filter strength for the given strength index and // search method -static INLINE void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method, +static inline void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method, int *pri_strength, int *sec_strength, int strength_idx) { @@ -223,7 +223,7 @@ static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1, return best_tot_mse; } -static INLINE void init_src_params(int *src_stride, int *width, int *height, +static inline void init_src_params(int *src_stride, int *width, int *height, int *width_log2, int *height_log2, BLOCK_SIZE bsize) { *src_stride = block_size_wide[bsize]; @@ -260,7 +260,7 @@ static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src, // Checks whether dual and quad block processing is applicable for block widths 8 and 4 // respectively. -static INLINE int is_dual_or_quad_applicable(cdef_list *dlist, int width, +static inline int is_dual_or_quad_applicable(cdef_list *dlist, int width, int cdef_count, int bi, int iter) { assert(width == 8 || width == 4); const int blk_offset = (width == 8) ? 
1 : 3; @@ -314,7 +314,7 @@ static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src, // Fill the boundary regions of the block with CDEF_VERY_LARGE, only if the // region is outside frame boundary -static INLINE void fill_borders_for_fbs_on_frame_boundary( +static inline void fill_borders_for_fbs_on_frame_boundary( uint16_t *inbuf, int hfilt_size, int vfilt_size, bool is_fb_on_frm_left_boundary, bool is_fb_on_frm_right_boundary, bool is_fb_on_frm_top_boundary, bool is_fb_on_frm_bottom_boundary) { @@ -398,7 +398,7 @@ static AOM_FORCE_INLINE int get_error_calc_width_in_filt_units( } // Returns the block error after CDEF filtering for a given strength -static INLINE uint64_t get_filt_error( +static inline uint64_t get_filt_error( const CdefSearchCtx *cdef_search_ctx, const struct macroblockd_plane *pd, cdef_list *dlist, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], uint16_t *in, uint8_t *ref_buffer, diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h index 5a1ec2157a..7d79bd5f41 100644 --- a/av1/encoder/pickcdef.h +++ b/av1/encoder/pickcdef.h @@ -171,7 +171,7 @@ typedef struct { bool use_highbitdepth; } CdefSearchCtx; -static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params, +static inline int sb_all_skip(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col) { const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64); const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64); @@ -194,7 +194,7 @@ static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params, // Returns: // 1/0 will be returned to indicate skip/don't skip cdef processing of sb // respectively. -static INLINE int cdef_sb_skip(const CommonModeInfoParams *const mi_params, +static inline int cdef_sb_skip(const CommonModeInfoParams *const mi_params, int fbr, int fbc) { const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c index 3bb6e6ba6a..63756340dc 100644 --- a/av1/encoder/pickrst.c +++ b/av1/encoder/pickrst.c @@ -1109,14 +1109,14 @@ void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, } #endif // CONFIG_AV1_HIGHBITDEPTH -static INLINE int wrap_index(int i, int wiener_win) { +static inline int wrap_index(int i, int wiener_win) { const int wiener_halfwin1 = (wiener_win >> 1) + 1; return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i); } // Splits each w[i] into smaller components w1[i] and w2[i] such that // w[i] = w1[i] * WIENER_TAP_SCALE_FACTOR + w2[i]. -static INLINE void split_wiener_filter_coefficients(int wiener_win, +static inline void split_wiener_filter_coefficients(int wiener_win, const int32_t *w, int32_t *w1, int32_t *w2) { for (int i = 0; i < wiener_win; i++) { @@ -1131,7 +1131,7 @@ static INLINE void split_wiener_filter_coefficients(int wiener_win, // // The multiplication x * w may overflow, so we multiply x by the components of // w (w1 and w2) and combine the multiplication with the division. 
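// Worked example (illustrative, not part of this patch; SCALE stands in for
// WIENER_TAP_SCALE_FACTOR): with SCALE = 100, split w = 517 into w1 = 5 and
// w2 = 17 so that w == w1 * SCALE + w2. Then, for x = 1000,
//   x * w / SCALE           = 517000 / 100       = 5170
//   x * w1 + x * w2 / SCALE = 5000 + 17000 / 100 = 5170
// The intermediate products stay much smaller than x * w, which is how
// multiply_and_scale() below sidesteps overflow for large x and w.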
-static INLINE int64_t multiply_and_scale(int64_t x, int32_t w1, int32_t w2) { +static inline int64_t multiply_and_scale(int64_t x, int32_t w1, int32_t w2) { // Let y = x * w / WIENER_TAP_SCALE_FACTOR // = x * (w1 * WIENER_TAP_SCALE_FACTOR + w2) / WIENER_TAP_SCALE_FACTOR const int64_t y = x * w1 + x * w2 / WIENER_TAP_SCALE_FACTOR; @@ -1949,7 +1949,7 @@ static void restoration_search(AV1_COMMON *cm, int plane, RestSearchCtxt *rsc, } } -static INLINE void av1_derive_flags_for_lr_processing( +static inline void av1_derive_flags_for_lr_processing( const LOOP_FILTER_SPEED_FEATURES *lpf_sf, bool *disable_lr_filter) { const bool is_wiener_disabled = lpf_sf->disable_wiener_filter; const bool is_sgr_disabled = lpf_sf->disable_sgr_filter; diff --git a/av1/encoder/pickrst.h b/av1/encoder/pickrst.h index c4cad30882..22bb000c11 100644 --- a/av1/encoder/pickrst.h +++ b/av1/encoder/pickrst.h @@ -57,7 +57,7 @@ static const uint8_t g_shuffle_stats_highbd_data[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, }; -static INLINE uint8_t find_average(const uint8_t *src, int h_start, int h_end, +static inline uint8_t find_average(const uint8_t *src, int h_start, int h_end, int v_start, int v_end, int stride) { uint64_t sum = 0; for (int i = v_start; i < v_end; i++) { @@ -70,7 +70,7 @@ static INLINE uint8_t find_average(const uint8_t *src, int h_start, int h_end, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start, +static inline uint16_t find_average_highbd(const uint16_t *src, int h_start, int h_end, int v_start, int v_end, int stride) { uint64_t sum = 0; diff --git a/av1/encoder/random.h b/av1/encoder/random.h index 9a34f1f7a7..f4b51ba8d8 100644 --- a/av1/encoder/random.h +++ b/av1/encoder/random.h @@ -22,13 +22,13 @@ extern "C" { // Note that the low bits of this output are comparatively low-quality, so users // of this function should ensure that the high bits factor through to their // outputs. -static INLINE uint32_t lcg_next(uint32_t *state) { +static inline uint32_t lcg_next(uint32_t *state) { *state = (uint32_t)(*state * 1103515245ULL + 12345); return *state; } // Generate a random number in the range [0, 32768). -static INLINE uint32_t lcg_rand16(uint32_t *state) { +static inline uint32_t lcg_rand16(uint32_t *state) { return (lcg_next(state) / 65536) % 32768; } @@ -37,13 +37,13 @@ static INLINE uint32_t lcg_rand16(uint32_t *state) { // rand() % n, for a few reasons: This implementation is faster and less biased, // and if n is a power of 2, this uses the higher-quality top bits from the RNG // output rather than the lower-quality bottom bits. -static INLINE uint32_t lcg_randint(uint32_t *state, uint32_t n) { +static inline uint32_t lcg_randint(uint32_t *state, uint32_t n) { uint64_t v = ((uint64_t)lcg_next(state) * n) >> 32; return (uint32_t)v; } // Generate a random number in the range [lo, hi) -static INLINE uint32_t lcg_randrange(uint32_t *state, uint32_t lo, +static inline uint32_t lcg_randrange(uint32_t *state, uint32_t lo, uint32_t hi) { assert(lo < hi); return lo + lcg_randint(state, hi - lo); @@ -56,7 +56,7 @@ static INLINE uint32_t lcg_randrange(uint32_t *state, uint32_t lo, // Note: The algorithm used here uses resampling to avoid choosing repeated // values. This works well as long as n >> k, but can potentially lead to many // resampling attempts if n is equal to or only slightly larger than k. 
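// Usage sketch (illustrative, not part of this patch) for the RNG helpers
// above; the seed values are arbitrary:
#include <stdint.h>
static void lcg_demo(void) {
  uint32_t state = 0xdeadbeef;
  const uint32_t r = lcg_randrange(&state, 5, 15);  // uniform-ish in [5, 15)
  int picks[3];
  unsigned int seed = 1234;
  lcg_pick(/*n=*/10, /*k=*/3, picks, &seed);  // 3 distinct values in [0, 10)
  (void)r;
}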
-static INLINE void lcg_pick(int n, int k, int *out, unsigned int *seed) { +static inline void lcg_pick(int n, int k, int *out, unsigned int *seed) { assert(0 <= k && k <= n); for (int i = 0; i < k; i++) { int v; diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 09129425d0..7e2b029d4c 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -3622,7 +3622,7 @@ static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) { return; } -static INLINE int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) { +static inline int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) { RATE_CONTROL *const rc = &cpi->rc; AV1_COMMON *const cm = &cpi->common; SVC *const svc = &cpi->svc; diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c index 8a719160fb..e209693bb7 100644 --- a/av1/encoder/rd.c +++ b/av1/encoder/rd.c @@ -688,7 +688,7 @@ void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) { // WARNING: Population of unified cost update frequency needs to be taken care // accordingly, in case of any modifications/additions to the enum // COST_UPDATE_TYPE/INTERNAL_COST_UPDATE_TYPE. -static INLINE void populate_unified_cost_update_freq( +static inline void populate_unified_cost_update_freq( const CostUpdateFreq cost_upd_freq, SPEED_FEATURES *const sf) { INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf; // Mapping of entropy cost update frequency from the encoder's codec control @@ -715,7 +715,7 @@ static INLINE void populate_unified_cost_update_freq( } // Checks if entropy costs should be initialized/updated at frame level or not. -static INLINE int is_frame_level_cost_upd_freq_set( +static inline int is_frame_level_cost_upd_freq_set( const AV1_COMMON *const cm, const INTERNAL_COST_UPDATE_TYPE cost_upd_level, const int use_nonrd_pick_mode, const int frames_since_key) { const int fill_costs = @@ -1202,7 +1202,7 @@ void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, // In the worst case, this requires a border of // max_block_width + 2*AOM_INTERP_EXTEND = 128 + 2*4 = 136 pixels // around the frame edges. 
-static INLINE void enc_clamp_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, +static inline void enc_clamp_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, MV *mv) { int bw = xd->width << MI_SIZE_LOG2; int bh = xd->height << MI_SIZE_LOG2; @@ -1523,7 +1523,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_D45_PRED] = 2500; } -static INLINE void update_thr_fact(int (*factor_buf)[MAX_MODES], +static inline void update_thr_fact(int (*factor_buf)[MAX_MODES], THR_MODES best_mode_index, THR_MODES mode_start, THR_MODES mode_end, BLOCK_SIZE min_size, BLOCK_SIZE max_size, diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h index 004f65353d..a616e7eceb 100644 --- a/av1/encoder/rd.h +++ b/av1/encoder/rd.h @@ -74,7 +74,7 @@ static const MV_REFERENCE_FRAME real_time_ref_combos[RTC_REFS][2] = { { INTRA_FRAME, NONE_FRAME } }; -static INLINE int mode_offset(const PREDICTION_MODE mode) { +static inline int mode_offset(const PREDICTION_MODE mode) { if (mode >= NEARESTMV) { return INTER_OFFSET(mode); } else { @@ -114,7 +114,7 @@ typedef struct RD_OPT { double r0; } RD_OPT; -static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { +static inline void av1_init_rd_stats(RD_STATS *rd_stats) { #if CONFIG_RD_DEBUG int plane; #endif @@ -133,7 +133,7 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { #endif } -static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { +static inline void av1_invalid_rd_stats(RD_STATS *rd_stats) { #if CONFIG_RD_DEBUG int plane; #endif @@ -152,7 +152,7 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { #endif } -static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, +static inline void av1_merge_rd_stats(RD_STATS *rd_stats_dst, const RD_STATS *rd_stats_src) { if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) { // If rd_stats_dst or rd_stats_src has invalid rate, we will make @@ -178,7 +178,7 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, #endif } -static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist, +static inline void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist, int rate, int skip_txfm, int64_t sse, int zero_rate) { assert(rd_stats->rate != INT_MAX && rate != INT_MAX); @@ -189,7 +189,7 @@ static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist, rd_stats->sse += sse; } -static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) { +static inline int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) { assert(mult >= 0); if (rate >= 0) { return RDCOST(mult, rate, dist); @@ -197,7 +197,7 @@ static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) { return RDCOST_NEG_R(mult, -rate, dist); } -static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) { +static inline void av1_rd_cost_update(int mult, RD_STATS *rd_cost) { if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX && rd_cost->rdcost < INT64_MAX) { rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist); @@ -206,7 +206,7 @@ static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) { } } -static INLINE void av1_rd_stats_subtraction(int mult, +static inline void av1_rd_stats_subtraction(int mult, const RD_STATS *const left, const RD_STATS *const right, RD_STATS *result) { @@ -284,7 +284,7 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, THR_MODES intra_mode_start, THR_MODES intra_mode_end); -static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) { +static inline void 
reset_thresh_freq_fact(MACROBLOCK *const x) { for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { for (int j = 0; j < MAX_MODES; ++j) { x->thresh_freq_fact[i][j] = RD_THRESH_FAC_FRAC_VAL; @@ -292,7 +292,7 @@ static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) { } } -static INLINE int rd_less_than_thresh(int64_t best_rd, int64_t thresh, +static inline int rd_less_than_thresh(int64_t best_rd, int64_t thresh, int thresh_fact) { return best_rd < (thresh * thresh_fact >> 5) || thresh == INT_MAX; } @@ -302,13 +302,13 @@ void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE block_size); // Sets the multiplier to convert mv cost to l2 error during motion search. -static INLINE void av1_set_error_per_bit(int *errorperbit, int rdmult) { +static inline void av1_set_error_per_bit(int *errorperbit, int rdmult) { *errorperbit = AOMMAX(rdmult >> RD_EPB_SHIFT, 1); } // Get the threshold for R-D optimization of coefficients depending upon mode // decision/winner mode processing -static INLINE void get_rd_opt_coeff_thresh( +static inline void get_rd_opt_coeff_thresh( const uint32_t (*const coeff_opt_threshold)[2], TxfmSearchParams *txfm_params, int enable_winner_mode_for_coeff_opt, int is_winner_mode) { @@ -338,7 +338,7 @@ static INLINE void get_rd_opt_coeff_thresh( } // Used to reset the state of mb rd hash information -static INLINE void reset_mb_rd_record(MB_RD_RECORD *const mb_rd_record) { +static inline void reset_mb_rd_record(MB_RD_RECORD *const mb_rd_record) { if (!mb_rd_record) return; // Reset the state for use_mb_rd_hash diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c index 5954bdb8e3..90a13602b9 100644 --- a/av1/encoder/rdopt.c +++ b/av1/encoder/rdopt.c @@ -730,7 +730,7 @@ static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode, } } -static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, +static inline PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, int ref_idx) { return ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode); @@ -968,7 +968,7 @@ static AOM_INLINE void setup_buffer_ref_mvs_inter( #define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) // TODO(jingning): this mv clamping function should be block size dependent. -static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { +static inline void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - LEFT_TOP_MARGIN, xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, xd->mb_to_top_edge - LEFT_TOP_MARGIN, @@ -1035,7 +1035,7 @@ static int skip_repeated_mv(const AV1_COMMON *const cm, return 0; } -static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, +static inline int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, const AV1_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *const xd = &x->e_mbd; @@ -1050,7 +1050,7 @@ static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, // To use single newmv directly for compound modes, need to clamp the mv to the // valid mv range. Without this, encoder would generate out of range mv, and // this is seen in 8k encoding. 
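// Worked example (illustrative, not part of this patch): thresh_fact is a
// fixed-point multiplier with five fractional bits (matching the >> 5 in
// rd_less_than_thresh() above; presumably RD_THRESH_FAC_FRAC_VAL is the 1.0
// point). The check therefore compares best_rd against thresh scaled by
// thresh_fact / 32:
//   thresh = 1000, thresh_fact = 40: 1000 * 40 >> 5 == 1250,
// so best_rd = 1200 passes while best_rd = 1300 does not, and
// thresh == INT_MAX makes the check pass unconditionally.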
-static INLINE void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv, +static inline void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv, int ref_idx) { const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); SubpelMvLimits mv_limits; @@ -1164,7 +1164,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, return 0; } -static INLINE void update_mode_start_end_index( +static inline void update_mode_start_end_index( const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi, int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed, int interintra_allowed, int eval_motion_mode) { @@ -1687,7 +1687,7 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, // with global motion. The issue is that, when global motion is used, GLOBALMV // produces a different prediction to NEARESTMV/NEARMV even if the motion // vectors are the same. Thus GLOBALMV should not be pruned in this case. -static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext, +static inline int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext, int ref_idx, const MV_REFERENCE_FRAME *ref_frame, PREDICTION_MODE single_mode) { @@ -1724,7 +1724,7 @@ static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext, return 0; } -static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, +static inline int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, int ref_idx, int ref_mv_idx, int skip_repeated_ref_mv, const MV_REFERENCE_FRAME *ref_frame, @@ -1763,7 +1763,7 @@ static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, // Skip NEARESTMV and NEARMV modes based on refmv weight computed in ref mv list // population -static INLINE int skip_nearest_near_mv_using_refmv_weight( +static inline int skip_nearest_near_mv_using_refmv_weight( const MACROBLOCK *const x, const PREDICTION_MODE this_mode, const int8_t ref_frame_type, PREDICTION_MODE best_mode) { if (this_mode != NEARESTMV && this_mode != NEARMV) return 0; @@ -1804,7 +1804,7 @@ static INLINE int skip_nearest_near_mv_using_refmv_weight( } // This function update the non-new mv for the current prediction mode -static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode, +static inline int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode, const AV1_COMMON *cm, const MACROBLOCK *x, int skip_repeated_ref_mv) { const MACROBLOCKD *xd = &x->e_mbd; @@ -1833,7 +1833,7 @@ static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode, return ret; } -static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, +static inline int get_drl_cost(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, const int (*const drl_mode_cost0)[2], int8_t ref_frame_type) { @@ -1862,7 +1862,7 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, return cost; } -static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args, +static inline int is_single_newmv_valid(const HandleInterModeArgs *const args, const MB_MODE_INFO *const mbmi, PREDICTION_MODE this_mode) { for (int ref_idx = 0; ref_idx < 2; ++ref_idx) { @@ -2041,9 +2041,9 @@ static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x, // Represents a set of integers, from 0 to sizeof(int) * 8, as bits in // an integer. 0 for the i-th bit means that integer is excluded, 1 means // it is included. 
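As a quick illustration of that convention, the two one-line helpers renamed just below behave like standard bit-set operations; their bodies are copied verbatim from the diff into this small usage sketch:

    #include <assert.h>
    #include <stdbool.h>

    static void mask_set_bit(int *mask, int index) { *mask |= (1 << index); }
    static bool mask_check_bit(int mask, int index) {
      return (mask >> index) & 0x1;
    }

    // Build the set {1, 3}, then test membership.
    static void bit_set_demo(void) {
      int mask = 0;
      mask_set_bit(&mask, 1);
      mask_set_bit(&mask, 3);
      assert(mask_check_bit(mask, 3) && !mask_check_bit(mask, 2));
    }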
-static INLINE void mask_set_bit(int *mask, int index) { *mask |= (1 << index); } +static inline void mask_set_bit(int *mask, int index) { *mask |= (1 << index); } -static INLINE bool mask_check_bit(int mask, int index) { +static inline bool mask_check_bit(int mask, int index) { return (mask >> index) & 0x1; } @@ -4316,7 +4316,7 @@ static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x, // Check if reference frame pair of the current block matches with the given // block. -static INLINE int match_ref_frame_pair(const MB_MODE_INFO *mbmi, +static inline int match_ref_frame_pair(const MB_MODE_INFO *mbmi, const MV_REFERENCE_FRAME *ref_frames) { return ((ref_frames[0] == mbmi->ref_frame[0]) && (ref_frames[1] == mbmi->ref_frame[1])); @@ -4477,7 +4477,7 @@ static int inter_mode_search_order_independent_skip( return 0; } -static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode, +static inline void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode, const MV_REFERENCE_FRAME *ref_frames, const AV1_COMMON *cm) { PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; @@ -4730,7 +4730,7 @@ static int compound_skip_by_single_states( } // Check if ref frames of current block matches with given block. -static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi, +static inline void match_ref_frame(const MB_MODE_INFO *const mbmi, const MV_REFERENCE_FRAME *ref_frames, int *const is_ref_match) { if (is_inter_block(mbmi)) { @@ -4744,7 +4744,7 @@ static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi, } // Prune compound mode using ref frames of neighbor blocks. -static INLINE int compound_skip_using_neighbor_refs( +static inline int compound_skip_using_neighbor_refs( MACROBLOCKD *const xd, const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) { // Exclude non-extended compound modes from pruning @@ -4773,7 +4773,7 @@ static INLINE int compound_skip_using_neighbor_refs( } // Update best single mode for the given reference frame based on simple rd. -static INLINE void update_best_single_mode(InterModeSearchState *search_state, +static inline void update_best_single_mode(InterModeSearchState *search_state, const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame, int64_t this_rd) { @@ -4784,7 +4784,7 @@ static INLINE void update_best_single_mode(InterModeSearchState *search_state, } // Prune compound mode using best single mode for the same reference. -static INLINE int skip_compound_using_best_single_mode_ref( +static inline int skip_compound_using_best_single_mode_ref( const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames, const PREDICTION_MODE *best_single_mode, int prune_comp_using_best_single_mode_ref) { @@ -4828,7 +4828,7 @@ static int compare_int64(const void *a, const void *b) { } } -static INLINE void update_search_state( +static inline void update_search_state( InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst, PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats, const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv, @@ -4882,7 +4882,7 @@ static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) { } // Check if either frame is within the cutoff. 
-static INLINE bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES], +static inline bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES], MV_REFERENCE_FRAME frame1, MV_REFERENCE_FRAME frame2) { assert(frame2 > 0); @@ -6390,7 +6390,7 @@ struct calc_target_weighted_pred_ctxt { }; /*!\endcond */ -static INLINE void calc_target_weighted_pred_above( +static inline void calc_target_weighted_pred_above( MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { (void)nb_mi; @@ -6438,7 +6438,7 @@ static INLINE void calc_target_weighted_pred_above( } } -static INLINE void calc_target_weighted_pred_left( +static inline void calc_target_weighted_pred_left( MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { (void)nb_mi; diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h index a1fa7075ef..dcdf3d29aa 100644 --- a/av1/encoder/rdopt.h +++ b/av1/encoder/rdopt.h @@ -166,17 +166,17 @@ void av1_rd_pick_inter_mode_sb_seg_skip( void av1_inter_mode_data_init(struct TileDataEnc *tile_data); void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult); -static INLINE int coded_to_superres_mi(int mi_col, int denom) { +static inline int coded_to_superres_mi(int mi_col, int denom) { return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR; } -static INLINE int av1_encoder_get_relative_dist(int a, int b) { +static inline int av1_encoder_get_relative_dist(int a, int b) { assert(a >= 0 && b >= 0); return (a - b); } // This function will return number of mi's in a superblock. -static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) { +static inline int av1_get_sb_mi_size(const AV1_COMMON *const cm) { const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize]; int sb_mi_rows = (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) / @@ -190,7 +190,7 @@ static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) { // This function prunes the mode if either of the reference frame falls in the // pruning list -static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame, +static inline int prune_ref(const MV_REFERENCE_FRAME *const ref_frame, const unsigned int *const ref_display_order_hint, const unsigned int frame_display_order_hint, const int *ref_frame_list) { @@ -208,7 +208,7 @@ static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame, return 0; } -static INLINE int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame, +static inline int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame, int8_t closest_past_ref, int8_t closest_future_ref) { int has_closest_past_ref = @@ -218,7 +218,7 @@ static INLINE int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame, return (has_closest_past_ref && has_closest_future_ref); } -static INLINE int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame, +static inline int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame, const MACROBLOCK *const x) { int has_best_past_pred_mv_sad = 0; int has_best_future_pred_mv_sad = 0; @@ -233,7 +233,7 @@ static INLINE int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame, return (has_best_past_pred_mv_sad && has_best_future_pred_mv_sad); } -static INLINE int prune_ref_by_selective_ref_frame( +static inline int prune_ref_by_selective_ref_frame( const AV1_COMP *const cpi, const MACROBLOCK *const x, const MV_REFERENCE_FRAME *const ref_frame, const unsigned int *const 
ref_display_order_hint) { @@ -307,7 +307,7 @@ static INLINE int prune_ref_by_selective_ref_frame( // This function will copy the best reference mode information from // MB_MODE_INFO_EXT to MB_MODE_INFO_EXT_FRAME. -static INLINE void av1_copy_mbmi_ext_to_mbmi_ext_frame( +static inline void av1_copy_mbmi_ext_to_mbmi_ext_frame( MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) { memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type], diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h index 349a41af6c..2376bafffe 100644 --- a/av1/encoder/rdopt_utils.h +++ b/av1/encoder/rdopt_utils.h @@ -346,7 +346,7 @@ static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { return num_blk; } -static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize, +static inline int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize, int64_t best_skip_rd, int64_t skip_rd, int level, int is_luma_only) { int eval_txfm = 1; @@ -402,7 +402,7 @@ static TX_MODE select_tx_mode( } // Checks the conditions to disable winner mode processing -static INLINE int bypass_winner_mode_processing(const MACROBLOCK *const x, +static inline int bypass_winner_mode_processing(const MACROBLOCK *const x, const SPEED_FEATURES *sf, int use_txfm_skip, int actual_txfm_skip, @@ -443,7 +443,7 @@ static INLINE int bypass_winner_mode_processing(const MACROBLOCK *const x, } // Checks the conditions to enable winner mode processing -static INLINE int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi, +static inline int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi, const MACROBLOCK *const x, MB_MODE_INFO *const mbmi, int actual_txfm_skip) { @@ -477,7 +477,7 @@ static INLINE int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi, return 0; } -static INLINE void set_tx_size_search_method( +static inline void set_tx_size_search_method( const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_size_srch, int is_winner_mode) { @@ -496,7 +496,7 @@ static INLINE void set_tx_size_search_method( select_tx_mode(cm, txfm_params->tx_size_search_method); } -static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, +static inline void set_tx_type_prune(const SPEED_FEATURES *sf, TxfmSearchParams *txfm_params, int winner_mode_tx_type_pruning, int is_winner_mode) { @@ -512,7 +512,7 @@ static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode]; } -static INLINE void set_tx_domain_dist_params( +static inline void set_tx_domain_dist_params( const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) { if (txfm_params->use_qm_dist_metric) { @@ -545,7 +545,7 @@ static INLINE void set_tx_domain_dist_params( } // This function sets mode parameters for different mode evaluation stages -static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi, +static inline void set_mode_eval_params(const struct AV1_COMP *cpi, MACROBLOCK *x, MODE_EVAL_TYPE mode_eval_type) { const AV1_COMMON *cm = &cpi->common; @@ -648,7 +648,7 @@ static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi, // Similar to store_cfl_required(), but for use during the RDO process, // where we haven't yet determined whether this block uses CfL. 
-static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, +static inline CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *xd = &x->e_mbd; @@ -674,7 +674,7 @@ static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) { } // Store best mode stats for winner mode processing -static INLINE void store_winner_mode_stats( +static inline void store_winner_mode_stats( const AV1_COMMON *const cm, MACROBLOCK *x, const MB_MODE_INFO *mbmi, RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv, THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd, @@ -756,14 +756,14 @@ unsigned int av1_get_perpixel_variance_facade(const struct AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bsize, int plane); -static INLINE int is_mode_intra(PREDICTION_MODE mode) { +static inline int is_mode_intra(PREDICTION_MODE mode) { return mode < INTRA_MODE_END; } // This function will copy usable ref_mv_stack[ref_frame][4] and // weight[ref_frame][4] information from ref_mv_stack[ref_frame][8] and // weight[ref_frame][8]. -static INLINE void av1_copy_usable_ref_mv_stack_and_weight( +static inline void av1_copy_usable_ref_mv_stack_and_weight( const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext, MV_REFERENCE_FRAME ref_frame) { memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame], @@ -773,7 +773,7 @@ static INLINE void av1_copy_usable_ref_mv_stack_and_weight( } // Get transform rd gate level for the given transform search case. -static INLINE int get_txfm_rd_gate_level( +static inline int get_txfm_rd_gate_level( const int is_masked_compound_enabled, const int txfm_rd_gate_level[TX_SEARCH_CASES], BLOCK_SIZE bsize, TX_SEARCH_CASE tx_search_case, int eval_motion_mode) { diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c index 1cbe0f9c11..afb59f59ed 100644 --- a/av1/encoder/reconinter_enc.c +++ b/av1/encoder/reconinter_enc.c @@ -165,7 +165,7 @@ static void setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, num_planes); } -static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row, +static inline void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { diff --git a/av1/encoder/saliency_map.c b/av1/encoder/saliency_map.c index 90672ba035..285161c3a4 100644 --- a/av1/encoder/saliency_map.c +++ b/av1/encoder/saliency_map.c @@ -176,7 +176,7 @@ static void get_color_intensity(const YV12_BUFFER_CONFIG *src, } } -static INLINE double convolve_map(const double *filter, const double *map, +static inline double convolve_map(const double *filter, const double *map, const int size) { double result = 0; for (int i = 0; i < size; ++i) { @@ -187,7 +187,7 @@ static INLINE double convolve_map(const double *filter, const double *map, // This function is to decimate the map by half, and apply Gaussian filter on // top of the downsampled map. -static INLINE void decimate_map(const double *map, int height, int width, +static inline void decimate_map(const double *map, int height, int width, int stride, double *downsampled_map) { const int new_width = width / 2; const int window_size = 5; @@ -217,7 +217,7 @@ static INLINE void decimate_map(const double *map, int height, int width, // This function is to upscale the map from in_level size to out_level size. // Note that the map at "level-1" will upscale the map at "level" by x2. 
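Each step down the level hierarchy therefore doubles both dimensions. A hedged sketch of a single x2 step, shown here as nearest-neighbor duplication for simplicity (the actual helper may interpolate differently):

    // Upscale an (h x w) map to (2h x 2w) by duplicating each sample.
    static void upscale_x2_sketch(const double *in, int h, int w, double *out) {
      for (int r = 0; r < 2 * h; ++r) {
        for (int c = 0; c < 2 * w; ++c) {
          out[r * (2 * w) + c] = in[(r / 2) * w + (c / 2)];
        }
      }
    }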
-static INLINE int upscale_map(const double *input, int in_level, int out_level, +static inline int upscale_map(const double *input, int in_level, int out_level, int height[9], int width[9], double *output) { for (int level = in_level; level > out_level; level--) { const int cur_width = width[level]; @@ -503,7 +503,7 @@ static int get_feature_map_rgb(double *cr, double *cg, double *cb, int width[9], return 1; } -static INLINE void filter2d(const double *input, const double kernel[9][9], +static inline void filter2d(const double *input, const double kernel[9][9], int width, int height, double *output) { const int window_size = 9; double map_section[81]; @@ -617,7 +617,7 @@ static int get_feature_map_orientation(const double *intensity, int width[9], return 1; } -static INLINE void find_min_max(const saliency_feature_map *input, +static inline void find_min_max(const saliency_feature_map *input, double *max_value, double *min_value) { assert(input && input->buf); *min_value = DBL_MAX; @@ -632,7 +632,7 @@ static INLINE void find_min_max(const saliency_feature_map *input, } } -static INLINE double average_local_max(const saliency_feature_map *input, +static inline double average_local_max(const saliency_feature_map *input, int stepsize) { int numlocal = 0; double lmaxmean = 0, lmax = 0, dummy = 0; diff --git a/av1/encoder/sparse_linear_solver.c b/av1/encoder/sparse_linear_solver.c index 90d8e08ecf..d2edb26020 100644 --- a/av1/encoder/sparse_linear_solver.c +++ b/av1/encoder/sparse_linear_solver.c @@ -187,7 +187,7 @@ void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) { } } -static INLINE void free_solver_local_buf(double *buf1, double *buf2, +static inline void free_solver_local_buf(double *buf1, double *buf2, double *buf3, double *buf4, double *buf5, double *buf6, double *buf7) { diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c index 2458f25843..0fdbe647a1 100644 --- a/av1/encoder/temporal_filter.c +++ b/av1/encoder/temporal_filter.c @@ -53,7 +53,7 @@ static void tf_determine_block_partition(const MV block_mv, const int block_mse, // This function returns the minimum and maximum log variances for 4x4 sub // blocks in the current block. -static INLINE void get_log_var_4x4sub_blk( +static inline void get_log_var_4x4sub_blk( AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const frame_to_filter, int mb_row, int mb_col, BLOCK_SIZE block_size, double *blk_4x4_var_min, double *blk_4x4_var_max, int is_hbd) { @@ -352,7 +352,7 @@ static void tf_determine_block_partition(const MV block_mv, const int block_mse, } // Helper function to determine whether a frame is encoded with high bit-depth. -static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) { +static inline int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) { return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; } @@ -520,7 +520,7 @@ static void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame, // Returns: // Nothing will be returned. But the content to which `square_diff` points // will be modified. 
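The contract amounts to an element-wise squared difference over a rectangle. A minimal scalar sketch under that reading, reusing the parameter names from the signature in the diff (the row-major output layout is an assumption):

    #include <stdint.h>

    // For each pixel in a height x width rectangle, store (ref - tgt)^2.
    static void compute_square_diff_sketch(const uint8_t *ref, int ref_offset,
                                           int ref_stride, const uint8_t *tgt,
                                           int tgt_offset, int tgt_stride,
                                           int height, int width,
                                           uint32_t *square_diff) {
      for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
          const int d = (int)ref[ref_offset + r * ref_stride + c] -
                        (int)tgt[tgt_offset + r * tgt_stride + c];
          square_diff[r * width + c] = (uint32_t)(d * d);
        }
      }
    }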
-static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset, +static inline void compute_square_diff(const uint8_t *ref, const int ref_offset, const int ref_stride, const uint8_t *tgt, const int tgt_offset, const int tgt_stride, const int height, diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h index 2040045ded..39b13fa1ee 100644 --- a/av1/encoder/temporal_filter.h +++ b/av1/encoder/temporal_filter.h @@ -428,7 +428,7 @@ static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data, // Returns: // Nothing will be returned. Contents of input_mbmi and input_buffer will be // modified. -static INLINE void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi, +static inline void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi, uint8_t **input_buffer, int num_planes) { for (int i = 0; i < num_planes; i++) { input_buffer[i] = mbd->plane[i].pre[0].buf; @@ -444,7 +444,7 @@ static INLINE void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi, // num_planes: Number of planes. // Returns: // Nothing will be returned. Contents of mbd will be modified. -static INLINE void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi, +static inline void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi, uint8_t **input_buffer, int num_planes) { for (int i = 0; i < num_planes; i++) { mbd->plane[i].pre[0].buf = input_buffer[i]; diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h index d3795c63bc..3679293319 100644 --- a/av1/encoder/tokenize.h +++ b/av1/encoder/tokenize.h @@ -95,14 +95,14 @@ void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, COLOR_MAP_TYPE type, int allow_update_cdf, struct FRAME_COUNTS *counts); -static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id, +static inline int av1_get_tx_eob(const struct segmentation *seg, int segment_id, TX_SIZE tx_size) { const int eob_max = av1_get_max_eob(tx_size); return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; } // Token buffer is only used for palette tokens. -static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols, +static inline unsigned int get_token_alloc(int mb_rows, int mb_cols, int sb_size_log2, const int num_planes) { // Calculate the maximum number of max superblocks in the image. diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c index 9faf2eaddf..339e8a5d51 100644 --- a/av1/encoder/tpl_model.c +++ b/av1/encoder/tpl_model.c @@ -37,7 +37,7 @@ #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tpl_model.h" -static INLINE double exp_bounded(double v) { +static inline double exp_bounded(double v) { // When v > 700 or <-700, the exp function will be close to overflow // For details, see the "Notes" in the following link. 
// https://en.cppreference.com/w/c/numeric/math/exp diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h index 8f08702eb2..6a1299cba9 100644 --- a/av1/encoder/tpl_model.h +++ b/av1/encoder/tpl_model.h @@ -38,7 +38,7 @@ struct TPL_INFO; #include "av1/encoder/lookahead.h" #include "av1/encoder/ratectrl.h" -static INLINE BLOCK_SIZE convert_length_to_bsize(int length) { +static inline BLOCK_SIZE convert_length_to_bsize(int length) { switch (length) { case 64: return BLOCK_64X64; case 32: return BLOCK_32X32; @@ -294,7 +294,7 @@ typedef struct { #endif // CONFIG_THREE_PASS } VBR_RATECTRL_INFO; -static INLINE void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) { +static inline void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) { vbr_rc_info->q_index_list_ready = 0; av1_zero(vbr_rc_info->q_index_list); } @@ -731,14 +731,14 @@ typedef struct { double act_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES]; } RATECTRL_LOG; -static INLINE void rc_log_init(RATECTRL_LOG *rc_log) { av1_zero(*rc_log); } +static inline void rc_log_init(RATECTRL_LOG *rc_log) { av1_zero(*rc_log); } -static INLINE void rc_log_frame_stats(RATECTRL_LOG *rc_log, int coding_index, +static inline void rc_log_frame_stats(RATECTRL_LOG *rc_log, int coding_index, const TplTxfmStats *txfm_stats) { rc_log->txfm_stats_list[coding_index] = *txfm_stats; } -static INLINE void rc_log_frame_encode_param(RATECTRL_LOG *rc_log, +static inline void rc_log_frame_encode_param(RATECTRL_LOG *rc_log, int coding_index, double qstep_ratio, int q_index, FRAME_UPDATE_TYPE update_type) { @@ -754,21 +754,21 @@ static INLINE void rc_log_frame_encode_param(RATECTRL_LOG *rc_log, } } -static INLINE void rc_log_frame_entropy(RATECTRL_LOG *rc_log, int coding_index, +static inline void rc_log_frame_entropy(RATECTRL_LOG *rc_log, int coding_index, double act_rate, double act_coeff_rate) { rc_log->act_rate_list[coding_index] = act_rate; rc_log->act_coeff_rate_list[coding_index] = act_coeff_rate; } -static INLINE void rc_log_record_chunk_info(RATECTRL_LOG *rc_log, +static inline void rc_log_record_chunk_info(RATECTRL_LOG *rc_log, int base_q_index, int coding_frame_count) { rc_log->base_q_index = base_q_index; rc_log->coding_frame_count = coding_frame_count; } -static INLINE void rc_log_show(const RATECTRL_LOG *rc_log) { +static inline void rc_log_show(const RATECTRL_LOG *rc_log) { printf("= chunk 1\n"); printf("coding_frame_count %d base_q_index %d\n", rc_log->coding_frame_count, rc_log->base_q_index); diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c index f4aab493ea..8225fe35e7 100644 --- a/av1/encoder/tx_search.c +++ b/av1/encoder/tx_search.c @@ -77,7 +77,7 @@ static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4, 8, 16, 32, 32, 6, 6, 12, 12, 23, 23, 32, 32, 8, 8, 16, 16, 23, 23 }; -static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { +static inline uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; const int16_t *diff = x->plane[0].src_diff; @@ -87,7 +87,7 @@ static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { return (hash << 5) + bsize; } -static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, +static inline int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, const int64_t ref_best_rd, const uint32_t hash) { int32_t match_index = -1; @@ -145,7 +145,7 @@ int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, // Computes the residual 
block's SSE and mean on all visible 4x4s in the // transform block -static INLINE int64_t pixel_diff_stats( +static inline int64_t pixel_diff_stats( MACROBLOCK *x, int plane, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize, unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) { @@ -863,7 +863,7 @@ static AOM_INLINE void inverse_transform_block_facade(MACROBLOCK *const x, dst_stride, eob, reduced_tx_set); } -static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane, +static inline void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const TXB_CTX *const txb_ctx, int skip_trellis, @@ -963,7 +963,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, return sse; } -static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, +static inline int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size) { @@ -1018,7 +1018,7 @@ static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000 static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100 // R-D costs are sorted in ascending order. -static INLINE void sort_rd(int64_t rds[], int txk[], int len) { +static inline void sort_rd(int64_t rds[], int txk[], int len) { int i, j, k; for (i = 1; i <= len - 1; ++i) { @@ -1043,7 +1043,7 @@ static INLINE void sort_rd(int64_t rds[], int txk[], int len) { } } -static INLINE int64_t av1_block_error_qm(const tran_low_t *coeff, +static inline int64_t av1_block_error_qm(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, const qm_val_t *qmatrix, @@ -1071,7 +1071,7 @@ static INLINE int64_t av1_block_error_qm(const tran_low_t *coeff, return error; } -static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, +static inline void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, const qm_val_t *qmatrix, const int16_t *scan, int64_t *out_dist, @@ -1384,7 +1384,7 @@ static const float *prune_2D_adaptive_thresholds[] = { NULL, }; -static INLINE float get_adaptive_thresholds( +static inline float get_adaptive_thresholds( TX_SIZE tx_size, TxSetType tx_set_type, TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) { const int prune_aggr_table[5][2] = { @@ -1712,11 +1712,11 @@ static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, return clamp(int_score, -80000, 80000); } -static INLINE uint16_t -get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, - int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode, - int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) { +static inline uint16_t get_tx_mask( + const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode, + int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; @@ -1887,13 +1887,13 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, } #if CONFIG_RD_DEBUG -static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane, +static inline void update_txb_coeff_cost(RD_STATS *rd_stats, int plane, int txb_coeff_cost) { 
rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; } #endif -static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block, +static inline int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, const TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { @@ -1947,7 +1947,7 @@ static int skip_trellis_opt_based_on_satd(MACROBLOCK *x, // Predict DC only blocks if the residual variance is below a qstep based // threshold.For such blocks, transform type search is bypassed. -static INLINE void predict_dc_only_block( +static inline void predict_dc_only_block( MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int block, int blk_row, int blk_col, RD_STATS *best_rd_stats, int64_t *block_sse, unsigned int *block_mse_q8, int64_t *per_px_mean, diff --git a/av1/encoder/txb_rdopt.c b/av1/encoder/txb_rdopt.c index 801da94ca7..b6f633e0a4 100644 --- a/av1/encoder/txb_rdopt.c +++ b/av1/encoder/txb_rdopt.c @@ -14,7 +14,7 @@ #include "av1/common/idct.h" -static INLINE void update_coeff_general( +static inline void update_coeff_general( int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width, int64_t rdmult, int shift, int dc_sign_ctx, const int16_t *dequant, const int16_t *scan, @@ -239,7 +239,7 @@ static AOM_FORCE_INLINE void update_coeff_eob( } } -static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob, +static inline void update_skip(int *accu_rate, int64_t accu_dist, int *eob, int nz_num, int *nz_ci, int64_t rdmult, int skip_cost, int non_skip_cost, tran_low_t *qcoeff, tran_low_t *dqcoeff) { diff --git a/av1/encoder/txb_rdopt_utils.h b/av1/encoder/txb_rdopt_utils.h index 56245f503f..b7cdd922d7 100644 --- a/av1/encoder/txb_rdopt_utils.h +++ b/av1/encoder/txb_rdopt_utils.h @@ -36,7 +36,7 @@ static const int const_term = (1 << AV1_PROB_COST_SHIFT); static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000; -static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, +static inline int get_dqv(const int16_t *dequant, int coeff_idx, const qm_val_t *iqmatrix) { int dqv = dequant[!!coeff_idx]; if (iqmatrix != NULL) @@ -45,7 +45,7 @@ static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, return dqv; } -static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, +static inline int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, int shift, const qm_val_t *qmatrix, int coeff_idx) { int64_t diff = (tcoeff - dqcoeff) * (1 << shift); @@ -82,7 +82,7 @@ static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs, return eob_cost; } -static INLINE int get_golomb_cost(int abs_qc) { +static inline int get_golomb_cost(int abs_qc) { if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; const int length = get_msb(r) + 1; @@ -91,12 +91,12 @@ static INLINE int get_golomb_cost(int abs_qc) { return 0; } -static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) { +static inline int get_br_cost(tran_low_t level, const int *coeff_lps) { const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); return coeff_lps[base_range] + get_golomb_cost(level); } -static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps, +static inline int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps, int *diff) { const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); int golomb_bits = 0; @@ -142,7 +142,7 @@ static 
AOM_FORCE_INLINE int get_two_coeff_cost_simple( return cost; } -static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign, +static inline int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign, int coeff_ctx, int dc_sign_ctx, const LV_MAP_COEFF_COST *txb_costs, int bhl, TX_CLASS tx_class) { @@ -163,7 +163,7 @@ static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign, return cost; } -static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, +static inline int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, int sign, int coeff_ctx, int dc_sign_ctx, const LV_MAP_COEFF_COST *txb_costs, @@ -193,7 +193,7 @@ static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, return cost; } -static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, +static inline void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, int shift, tran_low_t *qc_low, tran_low_t *dqc_low) { tran_low_t abs_qc_low = abs_qc - 1; @@ -204,7 +204,7 @@ static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low); } -static INLINE void update_coeff_eob_fast(int *eob, int shift, +static inline void update_coeff_eob_fast(int *eob, int shift, const int16_t *dequant_ptr, const int16_t *scan, const tran_low_t *coeff_ptr, diff --git a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c index b217a85447..9b25ae96e8 100644 --- a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c +++ b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c @@ -20,7 +20,7 @@ #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" #include "aom_dsp/x86/txfm_common_avx2.h" -static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output, +static inline void fdct16x16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); @@ -113,7 +113,7 @@ static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output, output[15] = x1[15]; } -static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output, +static inline void fdct16x32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); @@ -304,7 +304,7 @@ static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output, output[31] = x1[31]; } -static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output, +static inline void fdct16x64_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); @@ -711,7 +711,7 @@ static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output, output[63] = x1[63]; } -static INLINE void fdct32_avx2(const __m256i *input, __m256i *output, +static inline void fdct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { __m256i x1[32]; const int32_t *cospi = cospi_arr(cos_bit); @@ -865,7 +865,7 @@ static INLINE void fdct32_avx2(const __m256i *input, __m256i *output, output[31] = x1[31]; } -static INLINE void fdct64_new_avx2(const __m256i *input, __m256i *output, +static inline void fdct64_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); @@ -1278,7 +1278,7 @@ static INLINE void fdct64_new_avx2(const __m256i *input, __m256i 
*output, output[63] = x1[63]; } -static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output, +static inline void fadst16x16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __zero = _mm256_setzero_si256(); @@ -1408,7 +1408,7 @@ static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output, output[15] = x1[0]; } -static INLINE void fidentity16x16_new_avx2(const __m256i *input, +static inline void fidentity16x16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const __m256i one = _mm256_set1_epi16(1); @@ -1422,7 +1422,7 @@ static INLINE void fidentity16x16_new_avx2(const __m256i *input, } } -static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output, +static inline void fidentity16x32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; for (int i = 0; i < 32; ++i) { @@ -1430,7 +1430,7 @@ static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output, } } -static INLINE void store_output_32bit_w16(int32_t *const out, +static inline void store_output_32bit_w16(int32_t *const out, const __m256i *const in1, const __m256i *const in2, const int stride, @@ -1442,7 +1442,7 @@ static INLINE void store_output_32bit_w16(int32_t *const out, } // Store 8 16 bit values. Sign extend the values. -static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, +static inline void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, int32_t *out, const int stride, const int out_size) { @@ -1456,7 +1456,7 @@ static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, } } -static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a, +static inline void store_rect_16bit_to_32bit_avx2(const __m256i a, int32_t *const b) { const __m256i one = _mm256_set1_epi16(1); const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8); @@ -1468,7 +1468,7 @@ static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a, _mm256_store_si256((__m256i *)(b + 8), b_hi); } -static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2( +static inline void store_rect_buffer_16bit_to_32bit_w16_avx2( const __m256i *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { @@ -1593,7 +1593,7 @@ static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { fadst8x8_new_sse2 // H_FLIPADST }; -static INLINE void load_buffer_and_round_shift(const int16_t *in, int stride, +static inline void load_buffer_and_round_shift(const int16_t *in, int stride, __m128i *out, int bit) { out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride)); out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride)); @@ -1613,7 +1613,7 @@ static INLINE void load_buffer_and_round_shift(const int16_t *in, int stride, out[7] = _mm_slli_epi16(out[7], bit); } -static INLINE void load_buffer_and_flip_round_shift(const int16_t *in, +static inline void load_buffer_and_flip_round_shift(const int16_t *in, int stride, __m128i *out, int bit) { out[7] = load_16bit_to_16bit(in + 0 * stride); @@ -1663,7 +1663,7 @@ static INLINE void load_buffer_and_flip_round_shift(const int16_t *in, c3 = _mm256_permute4x64_epi64(bb3, 0xd8); \ } -static INLINE void transpose_round_shift_flip_8x8(__m128i *const in, +static inline void transpose_round_shift_flip_8x8(__m128i *const in, __m128i *const out, int bit) { __m256i c0, c1, c2, c3; bit = -bit; @@ -1712,7 +1712,7 @@ static INLINE void 
transpose_round_shift_flip_8x8(__m128i *const in, out[0] = _mm256_extractf128_si256(c3, 1); } -static INLINE void transpose_round_shift_8x8(__m128i *const in, +static inline void transpose_round_shift_8x8(__m128i *const in, __m128i *const out, int bit) { __m256i c0, c1, c2, c3; bit = -bit; @@ -1760,7 +1760,7 @@ static INLINE void transpose_round_shift_8x8(__m128i *const in, out[7] = _mm256_extractf128_si256(c3, 1); } -static INLINE void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in, +static inline void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in, int32_t *const out, const int stride, const int out_size) { @@ -2192,7 +2192,7 @@ static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output, memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); } -static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0, +static inline void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0, __m256i *in1, __m128i *out0, __m128i *out1, __m128i *out2, __m128i *out3, const __m256i *__rounding, int8_t *cos_bit) { @@ -2222,7 +2222,7 @@ static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0, *out3 = _mm256_extracti128_si256(temp1, 0x01); } -static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output, +static inline void fdct8x8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); @@ -2301,7 +2301,7 @@ static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output, output[7] = x4[7]; } -static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output, +static inline void fadst8x8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __zero = _mm256_setzero_si256(); @@ -2414,7 +2414,7 @@ static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output, output[7] = x6[0]; } -static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output, +static inline void fidentity8x8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; @@ -2428,7 +2428,7 @@ static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output, output[7] = _mm256_adds_epi16(input[7], input[7]); } -static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output, +static inline void fdct8x16_new_avx2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); @@ -2577,7 +2577,7 @@ static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output, &output[11], &output[13], &output[3], &__rounding_256, &cos_bit); } -static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output, +static inline void fadst8x16_new_avx2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __zero = _mm256_setzero_si256(); @@ -2794,7 +2794,7 @@ static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output, &output[12], &output[1], &output[14], &__rounding_256, &cos_bit); } -static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output, +static inline void fidentity8x16_new_avx2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const __m256i one = _mm256_set1_epi16(1); diff --git a/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/av1/encoder/x86/av1_fwd_txfm2d_sse4.c index 
cae2fc7316..335503c2be 100644 --- a/av1/encoder/x86/av1_fwd_txfm2d_sse4.c +++ b/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -19,7 +19,7 @@ #include "av1/encoder/x86/av1_txfm1d_sse4.h" #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" -static INLINE void int16_array_with_stride_to_int32_array_without_stride( +static inline void int16_array_with_stride_to_int32_array_without_stride( const int16_t *input, int stride, int32_t *output, int txfm1d_size) { int r, c; for (r = 0; r < txfm1d_size; r++) { @@ -29,7 +29,7 @@ static INLINE void int16_array_with_stride_to_int32_array_without_stride( } } -static INLINE void store_output_32bit_w8(int32_t *const out, +static inline void store_output_32bit_w8(int32_t *const out, const __m128i *const in1, const __m128i *const in2, const int stride, const int out_size) { @@ -73,7 +73,7 @@ static void idtx32x32_sse4_1(__m128i *input, __m128i *output, } } -static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { +static inline TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT32: return fdct32_sse4_1; case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; @@ -83,7 +83,7 @@ static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { return NULL; } -static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, +static inline void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_FLIP_CFG *cfg, int32_t *txfm_buf) { @@ -117,7 +117,7 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]); } -static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input, +static inline void fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_FLIP_CFG *cfg, int32_t *txfm_buf) { diff --git a/av1/encoder/x86/av1_fwd_txfm_avx2.h b/av1/encoder/x86/av1_fwd_txfm_avx2.h index 56647a090e..81721a836a 100644 --- a/av1/encoder/x86/av1_fwd_txfm_avx2.h +++ b/av1/encoder/x86/av1_fwd_txfm_avx2.h @@ -15,7 +15,7 @@ // out0 = in0*w0 + in1*w1 // out1 = -in1*w0 + in0*w1 -static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1, +static inline void btf_32_avx2_type0(const int32_t w0, const int32_t w1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { __m256i _in0 = *in0; @@ -34,7 +34,7 @@ static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1, *in1 = _mm256_srai_epi32(temp1, cos_bit); } -static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1, +static inline void btf_32_avx2_type1(const int32_t w0, const int32_t w1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { __m256i _in0 = *in0; @@ -55,7 +55,7 @@ static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1, // out0 = in0*w0 + in1*w1 // out1 = -in1*w0 + in0*w1 -static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1, +static inline void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { @@ -75,7 +75,7 @@ static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1, // out0 = in0*w0 + in1*w1 // out1 = in1*w0 - in0*w1 -static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, +static inline void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { diff --git 
a/av1/encoder/x86/av1_fwd_txfm_sse2.h b/av1/encoder/x86/av1_fwd_txfm_sse2.h index 68a6a09029..5420dc5ef3 100644 --- a/av1/encoder/x86/av1_fwd_txfm_sse2.h +++ b/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -29,7 +29,7 @@ void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output, void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit); -static INLINE void fidentity4x4_new_sse2(const __m128i *const input, +static inline void fidentity4x4_new_sse2(const __m128i *const input, __m128i *const output, const int8_t cos_bit) { (void)cos_bit; @@ -42,7 +42,7 @@ static INLINE void fidentity4x4_new_sse2(const __m128i *const input, } } -static INLINE void fidentity8x4_new_sse2(const __m128i *const input, +static inline void fidentity8x4_new_sse2(const __m128i *const input, __m128i *const output, const int8_t cos_bit) { (void)cos_bit; @@ -57,7 +57,7 @@ static INLINE void fidentity8x4_new_sse2(const __m128i *const input, } } -static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output, +static inline void fidentity8x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; @@ -71,7 +71,7 @@ static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output, output[7] = _mm_adds_epi16(input[7], input[7]); } -static INLINE void fdct8x8_new_sse2(const __m128i *input, __m128i *output, +static inline void fdct8x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); @@ -125,7 +125,7 @@ static INLINE void fdct8x8_new_sse2(const __m128i *input, __m128i *output, btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], output[5], output[3]); } -static INLINE void fadst8x8_new_sse2(const __m128i *input, __m128i *output, +static inline void fadst8x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __zero = _mm_setzero_si128(); @@ -205,7 +205,7 @@ static INLINE void fadst8x8_new_sse2(const __m128i *input, __m128i *output, output[6]); } -static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output, +static inline void fidentity8x16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const __m128i one = _mm_set1_epi16(1); @@ -219,7 +219,7 @@ static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output, } } -static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output, +static inline void fidentity8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; for (int i = 0; i < 32; ++i) { diff --git a/av1/encoder/x86/av1_highbd_quantize_avx2.c b/av1/encoder/x86/av1_highbd_quantize_avx2.c index d43e4a7242..fb3d3c2e31 100644 --- a/av1/encoder/x86/av1_highbd_quantize_avx2.c +++ b/av1/encoder/x86/av1_highbd_quantize_avx2.c @@ -16,20 +16,20 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" -static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { +static inline void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i zero = _mm_setzero_si128(); const __m128i dc = _mm_unpacklo_epi16(*p, zero); const __m128i ac = _mm_unpackhi_epi16(*p, zero); *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); } -static INLINE void update_qp(__m256i *qp) { +static inline void update_qp(__m256i *qp) { qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); qp[2] = 
_mm256_permute2x128_si256(qp[2], qp[2], 0x11); } -static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, +static inline void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *dequant_ptr, int log_scale, __m256i *qp) { __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); @@ -45,7 +45,7 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, init_one_qp(&dequant, &qp[2]); } -static INLINE void quantize(const __m256i *qp, __m256i *c, +static inline void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { diff --git a/av1/encoder/x86/av1_highbd_quantize_sse4.c b/av1/encoder/x86/av1_highbd_quantize_sse4.c index 7873a8f64a..80e0933fb6 100644 --- a/av1/encoder/x86/av1_highbd_quantize_sse4.c +++ b/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -19,7 +19,7 @@ // Coefficient quantization phase 1 // param[0-2] : rounding/quan/dequan constants -static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, +static inline void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, const int shift, const int scale, __m128i *qcoeff, __m128i *dquan, __m128i *sign) { @@ -43,7 +43,7 @@ static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, } // Coefficient quantization phase 2 -static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, +static inline void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, const __m128i *sign, const __m128i *param, const int shift, const int scale, tran_low_t *qAddr, @@ -80,7 +80,7 @@ static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); } -static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan, +static inline void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan, __m128i *eob) { const __m128i zero = _mm_setzero_si128(); __m128i mask, iscanIdx; @@ -99,7 +99,7 @@ static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan, *eob = _mm_max_epi16(*eob, iscanIdx); } -static INLINE uint16_t get_accumulated_eob(__m128i *eob) { +static inline uint16_t get_accumulated_eob(__m128i *eob) { __m128i eob_shuffled; uint16_t eobValue; eob_shuffled = _mm_shuffle_epi32(*eob, 0xe); diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c index 19e8694ab7..d890a3541f 100644 --- a/av1/encoder/x86/av1_quantize_avx2.c +++ b/av1/encoder/x86/av1_quantize_avx2.c @@ -16,18 +16,18 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" -static INLINE void write_zero(tran_low_t *qcoeff) { +static inline void write_zero(tran_low_t *qcoeff) { const __m256i zero = _mm256_setzero_si256(); _mm256_storeu_si256((__m256i *)qcoeff, zero); _mm256_storeu_si256((__m256i *)qcoeff + 1, zero); } -static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { +static inline void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i ac = _mm_unpackhi_epi64(*p, *p); *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1); } -static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, +static inline void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *dequant_ptr, int log_scale, __m256i *thr, __m256i *qp) { __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); @@ -54,20 +54,20 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, *thr = 
_mm256_sub_epi16(*thr, _mm256_set1_epi16(1)); } -static INLINE void update_qp(__m256i *thr, __m256i *qp) { +static inline void update_qp(__m256i *thr, __m256i *qp) { qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11); *thr = _mm256_permute2x128_si256(*thr, *thr, 0x11); } -static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { +static inline __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr); const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); return _mm256_packs_epi32(coeff1, coeff2); } -static INLINE void store_coefficients_avx2(__m256i coeff_vals, +static inline void store_coefficients_avx2(__m256i coeff_vals, tran_low_t *coeff_ptr) { __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); @@ -76,7 +76,7 @@ static INLINE void store_coefficients_avx2(__m256i coeff_vals, _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); } -static INLINE uint16_t quant_gather_eob(__m256i eob) { +static inline uint16_t quant_gather_eob(__m256i eob) { const __m128i eob_lo = _mm256_castsi256_si128(eob); const __m128i eob_hi = _mm256_extractf128_si256(eob, 1); __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi); @@ -85,7 +85,7 @@ static INLINE uint16_t quant_gather_eob(__m256i eob) { return INT16_MAX - _mm_extract_epi16(eob_s, 0); } -static INLINE int16_t accumulate_eob256(__m256i eob256) { +static inline int16_t accumulate_eob256(__m256i eob256) { const __m128i eob_lo = _mm256_castsi256_si128(eob256); const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); __m128i eob = _mm_max_epi16(eob_lo, eob_hi); @@ -331,7 +331,7 @@ void av1_quantize_fp_32x32_avx2( *eob_ptr = quant_gather_eob(eob); } -static INLINE void quantize_fp_64x64(const __m256i *thr, const __m256i *qp, +static inline void quantize_fp_64x64(const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c index a933db9270..09f083069d 100644 --- a/av1/encoder/x86/av1_quantize_sse2.c +++ b/av1/encoder/x86/av1_quantize_sse2.c @@ -17,7 +17,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, +static inline void read_coeff(const tran_low_t *coeff, intptr_t offset, __m128i *c0, __m128i *c1) { const tran_low_t *addr = coeff + offset; if (sizeof(tran_low_t) == 4) { @@ -33,7 +33,7 @@ static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, } } -static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1, +static inline void write_qcoeff(const __m128i *qc0, const __m128i *qc1, tran_low_t *qcoeff, intptr_t offset) { tran_low_t *addr = qcoeff + offset; if (sizeof(tran_low_t) == 4) { @@ -55,7 +55,7 @@ static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1, } } -static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { +static inline void write_zero(tran_low_t *qcoeff, intptr_t offset) { const __m128i zero = _mm_setzero_si128(); tran_low_t *addr = qcoeff + offset; if (sizeof(tran_low_t) == 4) { @@ -69,7 +69,7 @@ static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { } } -static INLINE void quantize(const int16_t *iscan_ptr, +static inline void quantize(const int16_t 
*iscan_ptr, const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const __m128i *round0, const __m128i *round1, @@ -189,7 +189,7 @@ void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } } -static INLINE void quantize_lp(const int16_t *iscan_ptr, +static inline void quantize_lp(const int16_t *iscan_ptr, const int16_t *coeff_ptr, intptr_t n_coeffs, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const __m128i *round0, const __m128i *round1, diff --git a/av1/encoder/x86/av1_temporal_denoiser_sse2.c b/av1/encoder/x86/av1_temporal_denoiser_sse2.c index daf63a126e..23facf07b3 100644 --- a/av1/encoder/x86/av1_temporal_denoiser_sse2.c +++ b/av1/encoder/x86/av1_temporal_denoiser_sse2.c @@ -20,7 +20,7 @@ #include "av1/encoder/av1_temporal_denoiser.h" // Compute the sum of all pixel differences of this MB. -static INLINE int sum_diff_16x1(__m128i acc_diff) { +static inline int sum_diff_16x1(__m128i acc_diff) { const __m128i k_1 = _mm_set1_epi16(1); const __m128i acc_diff_lo = _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8); @@ -36,7 +36,7 @@ static INLINE int sum_diff_16x1(__m128i acc_diff) { } // Denoise a 16x1 vector. -static INLINE __m128i av1_denoiser_16x1_sse2( +static inline __m128i av1_denoiser_16x1_sse2( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const __m128i *k_0, const __m128i *k_4, const __m128i *k_8, const __m128i *k_16, const __m128i *l3, const __m128i *l32, @@ -87,7 +87,7 @@ static INLINE __m128i av1_denoiser_16x1_sse2( } // Denoise a 16x1 vector with a weaker filter. -static INLINE __m128i av1_denoiser_adj_16x1_sse2( +static inline __m128i av1_denoiser_adj_16x1_sse2( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const __m128i k_0, const __m128i k_delta, __m128i acc_diff) { __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); diff --git a/av1/encoder/x86/av1_txfm1d_sse4.h b/av1/encoder/x86/av1_txfm1d_sse4.h index 22638cd067..e3f714ede8 100644 --- a/av1/encoder/x86/av1_txfm1d_sse4.h +++ b/av1/encoder/x86/av1_txfm1d_sse4.h @@ -58,7 +58,7 @@ void av1_iadst16_sse4_1(const __m128i *input, __m128i *output, void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, const int col_num); -static INLINE void transpose_32_4x4(int stride, const __m128i *input, +static inline void transpose_32_4x4(int stride, const __m128i *input, __m128i *output) { __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); @@ -75,7 +75,7 @@ static INLINE void transpose_32_4x4(int stride, const __m128i *input, // each 4x4 blocks can be represent by 4 vertical __m128i // we first transpose each 4x4 block internally // then transpose the grid -static INLINE void transpose_32(int txfm_size, const __m128i *input, +static inline void transpose_32(int txfm_size, const __m128i *input, __m128i *output) { const int num_per_128 = 4; const int row_size = txfm_size; diff --git a/av1/encoder/x86/cnn_avx2.c b/av1/encoder/x86/cnn_avx2.c index 59f6116e0f..01f04c31ec 100644 --- a/av1/encoder/x86/cnn_avx2.c +++ b/av1/encoder/x86/cnn_avx2.c @@ -63,7 +63,7 @@ DECLARE_ALIGNED(32, static const uint32_t, // Load weights needed for layer 0 (for 5x5 block processing), // and fill the registers appropriately to match source pixel mapping. 
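The comment above covers the layer-0 weight setup: each 5x5 filter tap is replicated across a __m256 register so that one vector multiply serves eight output positions at once. As a rough illustration of that idea only: the real prepare_weights_for_5x5_convolve() maps taps through the weight_mask_0/weight_mask_1 shuffles and a cstep channel stride, none of which is modeled in this hypothetical sketch.

#include <immintrin.h>

/* Hypothetical helper: broadcast every tap of a 5x5 kernel into its own
 * 8-lane register, so eight outputs share a single multiply per tap. */
static inline void broadcast_5x5_weights_sketch(const float weights[5][5],
                                                __m256 wtap[5][5]) {
  for (int i = 0; i < 5; ++i) {
    for (int j = 0; j < 5; ++j) {
      wtap[i][j] = _mm256_set1_ps(weights[i][j]);
    }
  }
}
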
-static INLINE void prepare_weights_for_5x5_convolve( +static inline void prepare_weights_for_5x5_convolve( const float *layer_config_weights, int off, float weight[5][8], const int cstep, __m256 *shuffle_weight, const __m256i weight_mask_0, const __m256i weight_mask_1) { @@ -119,7 +119,7 @@ static INLINE void prepare_weights_for_5x5_convolve( } while (0) // Load masks needed for shuffling of output and weights. -static INLINE void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask, +static inline void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask, __m256i *weight_mask) { // Load shuffle buffer needed to sort the output. *output_mask = @@ -134,7 +134,7 @@ static INLINE void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask, // Load weights needed for layer 1 and 2 (for 2x2 block processing), // and fill the registers appropriately to match source pixel mapping. -static INLINE void prepare_weights_for_2x2_convolve( +static inline void prepare_weights_for_2x2_convolve( const float *layer_config_weights, int off, const int cstep, __m256 *shuffle_weight, __m256i *weight_mask) { // Weights needed for 2x2 block. @@ -182,7 +182,7 @@ static INLINE void prepare_weights_for_2x2_convolve( } while (0) // Do convolution on 8 horizontal 2x2 blocks. -static INLINE void perform_convolve_for_8h_2x2_blocks( +static inline void perform_convolve_for_8h_2x2_blocks( const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum, __m256i shuffle_output_mask) { __m256 load_src[4]; @@ -209,7 +209,7 @@ static INLINE void perform_convolve_for_8h_2x2_blocks( } // Do convolution on 8 (4 horizontal x 2 vertical) 2x2 blocks. -static INLINE void perform_convolve_for_4hx2v_2x2_blocks( +static inline void perform_convolve_for_4hx2v_2x2_blocks( const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum, __m256i shuffle_output_mask) { __m256 load_src[4]; @@ -363,7 +363,7 @@ static void cnn_convolve_no_maxpool_padding_valid_5x5_avx2( } // AVX2 implementation for layer 1. -static INLINE void cnn_convolve_no_maxpool_padding_valid_layer1_avx2( +static inline void cnn_convolve_no_maxpool_padding_valid_layer1_avx2( const float **input, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int channel_step) { @@ -410,7 +410,7 @@ static INLINE void cnn_convolve_no_maxpool_padding_valid_layer1_avx2( } // AVX2 implementation for layer 2. 
-static INLINE void cnn_convolve_no_maxpool_padding_valid_layer2_avx2( +static inline void cnn_convolve_no_maxpool_padding_valid_layer2_avx2( const float **input, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int channel_step) { diff --git a/av1/encoder/x86/encodetxb_sse2.c b/av1/encoder/x86/encodetxb_sse2.c index 607aaa66d2..6ef4bc435e 100644 --- a/av1/encoder/x86/encodetxb_sse2.c +++ b/av1/encoder/x86/encodetxb_sse2.c @@ -17,7 +17,7 @@ #include "av1/common/av1_common_int.h" #include "av1/common/txb_common.h" -static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src, +static inline void load_levels_4x4x5_sse2(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, __m128i *const level) { @@ -28,7 +28,7 @@ static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src, level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride); } -static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src, +static inline void load_levels_8x2x5_sse2(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, __m128i *const level) { @@ -39,7 +39,7 @@ static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src, level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride); } -static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src, +static inline void load_levels_16x1x5_sse2(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, __m128i *const level) { @@ -50,7 +50,7 @@ static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src, level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2])); } -static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) { +static inline __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) { const __m128i const_3 = _mm_set1_epi8(3); const __m128i const_4 = _mm_set1_epi8(4); __m128i count; @@ -69,7 +69,7 @@ static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) { return count; } -static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, +static inline void get_4_nz_map_contexts_2d(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *const coeff_contexts) { @@ -101,7 +101,7 @@ static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, coeff_contexts[0] = 0; } -static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, +static inline void get_4_nz_map_contexts_ver(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { @@ -132,7 +132,7 @@ static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, } while (col); } -static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, +static inline void get_4_nz_map_contexts_hor(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { @@ -165,7 +165,7 @@ static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, } while (col); } -static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, +static inline void get_8_coeff_contexts_2d(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { @@ -211,7 +211,7 @@ static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, coeff_contexts[0] = 0; } -static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, +static inline void get_8_coeff_contexts_ver(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { 
@@ -242,7 +242,7 @@ static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, } while (col); } -static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, +static inline void get_8_coeff_contexts_hor(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { @@ -275,7 +275,7 @@ static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, } while (col); } -static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, +static inline void get_16n_coeff_contexts_2d(const uint8_t *levels, const int real_width, const int real_height, const int width, const int height, @@ -348,7 +348,7 @@ static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, coeff_contexts[0] = 0; } -static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, +static inline void get_16n_coeff_contexts_ver(const uint8_t *levels, const int width, const int height, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { @@ -395,7 +395,7 @@ static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, } while (--col); } -static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, +static inline void get_16n_coeff_contexts_hor(const uint8_t *levels, const int width, const int height, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c index 7bc4ee1bae..a389fb87f2 100644 --- a/av1/encoder/x86/error_intrin_avx2.c +++ b/av1/encoder/x86/error_intrin_avx2.c @@ -15,7 +15,7 @@ #include "aom/aom_integer.h" -static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, +static inline void read_coeff(const tran_low_t *coeff, intptr_t offset, __m256i *c) { const tran_low_t *addr = coeff + offset; @@ -29,7 +29,7 @@ static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, } } -static INLINE void av1_block_error_block_size16_avx2(const int16_t *coeff, +static inline void av1_block_error_block_size16_avx2(const int16_t *coeff, const int16_t *dqcoeff, __m256i *sse_256) { const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff); @@ -44,7 +44,7 @@ static INLINE void av1_block_error_block_size16_avx2(const int16_t *coeff, *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256()); } -static INLINE void av1_block_error_block_size32_avx2(const int16_t *coeff, +static inline void av1_block_error_block_size32_avx2(const int16_t *coeff, const int16_t *dqcoeff, __m256i *sse_256) { const __m256i zero = _mm256_setzero_si256(); @@ -71,7 +71,7 @@ static INLINE void av1_block_error_block_size32_avx2(const int16_t *coeff, *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0); } -static INLINE void av1_block_error_block_size64_avx2(const int16_t *coeff, +static inline void av1_block_error_block_size64_avx2(const int16_t *coeff, const int16_t *dqcoeff, __m256i *sse_256, intptr_t block_size) { diff --git a/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/av1/encoder/x86/highbd_fwd_txfm_avx2.c index 2fd6d1d289..aa35723cf1 100644 --- a/av1/encoder/x86/highbd_fwd_txfm_avx2.c +++ b/av1/encoder/x86/highbd_fwd_txfm_avx2.c @@ -20,7 +20,7 @@ #include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_dsp/x86/txfm_common_avx2.h" -static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out, +static inline void load_buffer_8x8_avx2(const int16_t *input, __m256i *out, int stride, int flipud, int fliplr, int shift) { __m128i out1[8]; @@ -73,7 +73,7 @@ static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out, out[6] = 
_mm256_slli_epi32(out[6], shift); out[7] = _mm256_slli_epi32(out[7], shift); } -static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) { +static inline void col_txfm_8x8_rounding(__m256i *in, int shift) { const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); in[0] = _mm256_add_epi32(in[0], rounding); @@ -94,7 +94,7 @@ static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) { in[6] = _mm256_srai_epi32(in[6], shift); in[7] = _mm256_srai_epi32(in[7], shift); } -static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out, +static inline void load_buffer_8x16_avx2(const int16_t *input, __m256i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; @@ -110,7 +110,7 @@ static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out, load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift); load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift); } -static INLINE void load_buffer_16xn_avx2(const int16_t *input, __m256i *out, +static inline void load_buffer_16xn_avx2(const int16_t *input, __m256i *out, int stride, int height, int outstride, int flipud, int fliplr) { __m256i out1[64]; @@ -179,7 +179,7 @@ static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out, out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); } -static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit, +static inline void round_shift_32_8xn_avx2(__m256i *in, int size, int bit, int stride) { if (bit < 0) { bit = -bit; @@ -194,14 +194,14 @@ static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit, } } } -static INLINE void store_buffer_avx2(const __m256i *const in, int32_t *out, +static inline void store_buffer_avx2(const __m256i *const in, int32_t *out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { _mm256_store_si256((__m256i *)(out), in[i]); out += stride; } } -static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in, +static inline void fwd_txfm_transpose_16x16_avx2(const __m256i *in, __m256i *out) { fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2); fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2); @@ -209,7 +209,7 @@ static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in, fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2); } -static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0, +static inline __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0, const __m256i *w1, const __m256i *n1, const __m256i *rounding, int bit) { __m256i x, y; @@ -1581,7 +1581,7 @@ void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride, } (void)bd; } -static INLINE void fdct32_avx2(__m256i *input, __m256i *output, +static inline void fdct32_avx2(__m256i *input, __m256i *output, const int8_t cos_bit, const int instride, const int outstride) { __m256i buf0[32]; @@ -1965,7 +1965,7 @@ static INLINE void fdct32_avx2(__m256i *input, __m256i *output, output[startidx] = buf0[30]; output[endidx] = buf0[1]; } -static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output, +static inline void idtx32x32_avx2(__m256i *input, __m256i *output, const int8_t cos_bit, int instride, int outstride) { (void)cos_bit; @@ -2073,7 +2073,7 @@ void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, store_buffer_avx2(buf1, output, 8, 128); } -static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2, +static inline void 
fdct64_stage2_avx2(__m256i *x1, __m256i *x2, __m256i *cospi_m32, __m256i *cospi_p32, const __m256i *__rounding, int8_t cos_bit) { @@ -2142,7 +2142,7 @@ static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2, x2[62] = x1[62]; x2[63] = x1[63]; } -static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3, +static inline void fdct64_stage3_avx2(__m256i *x2, __m256i *x3, __m256i *cospi_m32, __m256i *cospi_p32, const __m256i *__rounding, int8_t cos_bit) { @@ -2211,7 +2211,7 @@ static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3, x3[55] = _mm256_sub_epi32(x2[56], x2[55]); x3[56] = _mm256_add_epi32(x2[56], x2[55]); } -static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4, +static inline void fdct64_stage4_avx2(__m256i *x3, __m256i *x4, __m256i *cospi_m32, __m256i *cospi_p32, __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, @@ -2282,7 +2282,7 @@ static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4, x4[62] = x3[62]; x4[63] = x3[63]; } -static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5, +static inline void fdct64_stage5_avx2(__m256i *x4, __m256i *x5, __m256i *cospi_m32, __m256i *cospi_p32, __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, @@ -2353,7 +2353,7 @@ static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5, x5[59] = _mm256_sub_epi32(x4[60], x4[59]); x5[60] = _mm256_add_epi32(x4[60], x4[59]); } -static INLINE void fdct64_stage6_avx2( +static inline void fdct64_stage6_avx2( __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32, __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56, @@ -2424,7 +2424,7 @@ static INLINE void fdct64_stage6_avx2( x6[62] = x5[62]; x6[63] = x5[63]; } -static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7, +static inline void fdct64_stage7_avx2(__m256i *x6, __m256i *x7, __m256i *cospi_p08, __m256i *cospi_p56, __m256i *cospi_p40, __m256i *cospi_p24, __m256i *cospi_m08, __m256i *cospi_m56, @@ -2496,7 +2496,7 @@ static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7, x7[61] = _mm256_sub_epi32(x6[62], x6[61]); x7[62] = _mm256_add_epi32(x6[62], x6[61]); } -static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8, +static inline void fdct64_stage8_avx2(__m256i *x7, __m256i *x8, const int32_t *cospi, const __m256i *__rounding, int8_t cos_bit) { @@ -2583,7 +2583,7 @@ static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8, x8[60] = x7[60]; x8[63] = x7[63]; } -static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9, +static inline void fdct64_stage9_avx2(__m256i *x8, __m256i *x9, const int32_t *cospi, const __m256i *__rounding, int8_t cos_bit) { @@ -2669,7 +2669,7 @@ static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9, x9[62] = _mm256_sub_epi32(x8[63], x8[62]); x9[63] = _mm256_add_epi32(x8[63], x8[62]); } -static INLINE void fdct64_stage10_avx2(__m256i *x9, __m256i *x10, +static inline void fdct64_stage10_avx2(__m256i *x9, __m256i *x10, const int32_t *cospi, const __m256i *__rounding, int8_t cos_bit) { diff --git a/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/av1/encoder/x86/highbd_fwd_txfm_sse4.c index 0e5d6923e2..55668e9d07 100644 --- a/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -22,7 +22,7 @@ #include "config/aom_config.h" #include "config/av1_rtcd.h" -static INLINE void store_output_w4(int32_t *const out, const __m128i *const in, +static inline void store_output_w4(int32_t *const out, const __m128i *const in, const int stride, 
const int out_size) { for (int i = 0; i < out_size; ++i) { _mm_store_si128((__m128i *)(out + i * stride), in[i]); @@ -80,7 +80,7 @@ void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) { _mm_storeu_si128((__m128i *)(output + 12), op[3]); } -static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, +static inline void load_buffer_4x4(const int16_t *input, __m128i *in, int stride, int flipud, int fliplr, int shift) { if (!flipud) { @@ -170,7 +170,7 @@ static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit, out[3] = u3; } -static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) { +static inline void write_buffer_4x4(__m128i *res, int32_t *output) { _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); @@ -371,7 +371,7 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, (void)bd; } -static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, +static inline void load_buffer_8x8(const int16_t *input, __m128i *in, int stride, int flipud, int fliplr, int shift) { __m128i u; @@ -457,7 +457,7 @@ static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, in[15] = _mm_slli_epi32(in[15], shift); } -static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) { +static inline void col_txfm_8x8_rounding(__m128i *in, int shift) { const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); in[0] = _mm_add_epi32(in[0], rounding); @@ -495,7 +495,7 @@ static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) { in[15] = _mm_srai_epi32(in[15], shift); } -static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) { +static inline void col_txfm_4x8_rounding(__m128i *in, int shift) { const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); in[0] = _mm_add_epi32(in[0], rounding); @@ -517,7 +517,7 @@ static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) { in[7] = _mm_srai_epi32(in[7], shift); } -static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) { +static inline void write_buffer_8x8(const __m128i *res, int32_t *output) { _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); @@ -539,7 +539,7 @@ static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) { _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); } -static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output, +static inline void write_buffer_16x8(const __m128i *res, int32_t *output, const int stride) { _mm_storeu_si128((__m128i *)(output), res[0]); _mm_storeu_si128((__m128i *)(output + 4), res[1]); @@ -1036,7 +1036,7 @@ void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, // Hybrid Transform 16x16 -static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { +static inline void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { int row_index = 0; int dst_index = 0; int src_index = 0; @@ -1065,7 +1065,7 @@ static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { } while (row_index < 16); } -static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, +static inline void load_buffer_16x16(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { __m128i in[64]; @@ -1110,7 +1110,7 @@ static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, 
convert_8x8_to_16x16(in, out); } -static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out, +static inline void load_buffer_8x16(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; @@ -1128,7 +1128,7 @@ static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out, load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift); } -static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out, +static inline void load_buffer_8x4(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; @@ -1146,7 +1146,7 @@ static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out, load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift); } -static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out, +static inline void load_buffer_16x4(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; @@ -1164,7 +1164,7 @@ static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out, load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift); } -static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out, +static inline void load_buffer_4x8(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; @@ -1183,7 +1183,7 @@ static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out, } #if !CONFIG_REALTIME_ONLY -static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out, +static inline void load_buffer_4x16(const int16_t *input, __m128i *out, const int stride, const int flipud, const int fliplr, const int shift) { const int16_t *topL = input; @@ -1201,7 +1201,7 @@ static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out, } #endif -static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out, +static inline void load_buffer_32x8n(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift, const int height) { const int16_t *in = input; @@ -1942,7 +1942,7 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, (void)bd; } -static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { +static inline void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { for (int i = 0; i < size; i += 2) in[30 - i] = out[i]; for (int i = 1; i < size; i += 2) in[size - i] = out[i]; } diff --git a/av1/encoder/x86/ml_avx2.c b/av1/encoder/x86/ml_avx2.c index bed8f6d1ad..cb0a079986 100644 --- a/av1/encoder/x86/ml_avx2.c +++ b/av1/encoder/x86/ml_avx2.c @@ -25,7 +25,7 @@ const __m256 mul1 = _mm256_mul_ps(inputs256, weight1); \ hadd[i] = _mm256_hadd_ps(mul0, mul1); -static INLINE void nn_propagate_8to1( +static inline void nn_propagate_8to1( const float *const inputs, const float *const weights, const float *const bias, int num_inputs_to_process, int tot_num_inputs, int num_outputs, float *const output_nodes, int is_clip_required) { @@ -53,7 +53,7 @@ static INLINE void nn_propagate_8to1( } } -static INLINE void nn_propagate_8to4( +static inline void nn_propagate_8to4( const float *const inputs, const float *const weights, const float *const bias, int num_inputs_to_process, int tot_num_inputs, int num_outputs, float *const output_nodes, int is_clip_required) { @@ -83,7 +83,7 @@ static INLINE void nn_propagate_8to4( } } -static INLINE void nn_propagate_8to8( +static inline void nn_propagate_8to8( const float *const inputs, const float *const 
weights, const float *const bias, int num_inputs_to_process, int tot_num_inputs, int num_outputs, float *const output_nodes, int is_clip_required) { @@ -114,7 +114,7 @@ static INLINE void nn_propagate_8to8( } } -static INLINE void nn_propagate_input_multiple_of_8( +static inline void nn_propagate_input_multiple_of_8( const float *const inputs, const float *const weights, const float *const bias, int num_inputs_to_process, int tot_num_inputs, bool is_output_layer, int num_outputs, float *const output_nodes) { diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c index 1c5439fac0..56cbfe44ec 100644 --- a/av1/encoder/x86/pickrst_avx2.c +++ b/av1/encoder/x86/pickrst_avx2.c @@ -20,7 +20,7 @@ #include "av1/encoder/pickrst.h" #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd, +static inline void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd, const __m256i *shuffle, const __m256i *dgd_ijkl) { // Load two 128-bit chunks from dgd @@ -55,7 +55,7 @@ static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd, yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1)); } -static INLINE void acc_stat_highbd_win7_one_line_avx2( +static inline void acc_stat_highbd_win7_one_line_avx2( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m256i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], @@ -145,7 +145,7 @@ static INLINE void acc_stat_highbd_win7_one_line_avx2( } } -static INLINE void compute_stats_highbd_win7_opt_avx2( +static inline void compute_stats_highbd_win7_opt_avx2( const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { @@ -202,7 +202,7 @@ static INLINE void compute_stats_highbd_win7_opt_avx2( } } -static INLINE void acc_stat_highbd_win5_one_line_avx2( +static inline void acc_stat_highbd_win5_one_line_avx2( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m256i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], @@ -285,7 +285,7 @@ static INLINE void acc_stat_highbd_win5_one_line_avx2( } } -static INLINE void compute_stats_highbd_win5_opt_avx2( +static inline void compute_stats_highbd_win5_opt_avx2( const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { @@ -370,17 +370,17 @@ void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, } #endif // CONFIG_AV1_HIGHBITDEPTH -static INLINE void madd_and_accum_avx2(__m256i src, __m256i dgd, __m256i *sum) { +static inline void madd_and_accum_avx2(__m256i src, __m256i dgd, __m256i *sum) { *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(src, dgd)); } -static INLINE __m256i convert_and_add_avx2(__m256i src) { +static inline __m256i convert_and_add_avx2(__m256i src) { const __m256i s0 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src)); const __m256i s1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src, 1)); return _mm256_add_epi64(s0, s1); } -static INLINE __m256i hadd_four_32_to_64_avx2(__m256i src0, __m256i src1, +static inline __m256i hadd_four_32_to_64_avx2(__m256i src0, __m256i src1, __m256i *src2, __m256i *src3) { // 00 01 10 11 02 03 12 13 const __m256i s_0 = _mm256_hadd_epi32(src0, src1); @@ -391,7 +391,7 @@ static INLINE __m256i 
hadd_four_32_to_64_avx2(__m256i src0, __m256i src1, return convert_and_add_avx2(s_2); } -static INLINE __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) { +static inline __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) { // 00 10 02 12 const __m256i t0 = _mm256_unpacklo_epi64(src0, src1); // 01 11 03 13 @@ -406,7 +406,7 @@ static INLINE __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) { return _mm_add_epi64(sum0, sum1); } -static INLINE __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) { +static inline __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) { // 00 01 02 03 const __m256i s0 = convert_and_add_avx2(src0); // 10 11 12 13 @@ -414,7 +414,7 @@ static INLINE __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) { return add_64bit_lvl_avx2(s0, s1); } -static INLINE int32_t calc_sum_of_register(__m256i src) { +static inline int32_t calc_sum_of_register(__m256i src) { const __m128i src_l = _mm256_castsi256_si128(src); const __m128i src_h = _mm256_extracti128_si256(src, 1); const __m128i sum = _mm_add_epi32(src_l, src_h); @@ -423,7 +423,7 @@ static INLINE int32_t calc_sum_of_register(__m256i src) { return _mm_cvtsi128_si32(dst1); } -static INLINE void transpose_64bit_4x4_avx2(const __m256i *const src, +static inline void transpose_64bit_4x4_avx2(const __m256i *const src, __m256i *const dst) { // Unpack 64 bit elements. Goes from: // src[0]: 00 01 02 03 @@ -465,7 +465,7 @@ static const int16_t mask_16bit[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 bytes }; -static INLINE uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start, +static inline uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start, int32_t h_end, int32_t v_start, int32_t v_end, int32_t stride) { const uint8_t *src_temp = src + v_start * stride + h_start; @@ -518,7 +518,7 @@ static INLINE uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start, // Fill (src-avg) or (dgd-avg) buffers. Note that when n = (width % 16) is not // 0, it writes (16 - n) more data than required. 
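The over-write noted above falls out of processing each row in full 16-lane chunks: the final partial chunk is still computed and stored with a full-width store, so the buffers must be padded out to a multiple of 16. A minimal sketch of the pattern (hypothetical name; the real sub_avg_block_avx2() also handles strides and downsampling):

#include <immintrin.h>
#include <stdint.h>

/* Assumes src is readable and dst writable up to (width + 15) & ~15;
 * when width % 16 != 0 the last store spills 16 - (width % 16) values. */
static void sub_avg_row_sketch(const uint8_t *src, uint8_t avg, int width,
                               int16_t *dst) {
  const __m256i avg_vec = _mm256_set1_epi16(avg);
  for (int j = 0; j < width; j += 16) {
    const __m128i s8 = _mm_loadu_si128((const __m128i *)(src + j));
    const __m256i s16 = _mm256_cvtepu8_epi16(s8);
    /* Full 16-lane store, even for the final partial chunk. */
    _mm256_storeu_si256((__m256i *)(dst + j), _mm256_sub_epi16(s16, avg_vec));
  }
}
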
-static INLINE void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride, +static inline void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride, uint8_t avg, int32_t width, int32_t height, int16_t *dst, int32_t dst_stride, @@ -551,7 +551,7 @@ static INLINE void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride, // Fills lower-triangular elements of H buffer from upper triangular elements of // the same -static INLINE void fill_lower_triag_elements_avx2(const int32_t wiener_win2, +static inline void fill_lower_triag_elements_avx2(const int32_t wiener_win2, int64_t *const H) { for (int32_t i = 0; i < wiener_win2 - 1; i += 4) { __m256i in[4], out[4]; @@ -1540,7 +1540,7 @@ void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd, } } -static INLINE __m256i pair_set_epi16(int a, int b) { +static inline __m256i pair_set_epi16(int a, int b) { return _mm256_set1_epi32( (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))); } diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c index f52d803358..0e155afcc7 100644 --- a/av1/encoder/x86/pickrst_sse4.c +++ b/av1/encoder/x86/pickrst_sse4.c @@ -18,7 +18,7 @@ #include "av1/common/restoration.h" #include "av1/encoder/pickrst.h" -static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src, +static inline void acc_stat_sse41(int32_t *dst, const uint8_t *src, const __m128i *shuffle, const __m128i *kl) { const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle); const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s)); @@ -32,7 +32,7 @@ static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src, xx_storeu_128(dst + 4, r1); } -static INLINE void acc_stat_win7_one_line_sse4_1( +static inline void acc_stat_win7_one_line_sse4_1( const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int dgd_stride, const __m128i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN], @@ -105,7 +105,7 @@ static INLINE void acc_stat_win7_one_line_sse4_1( } } -static INLINE void compute_stats_win7_opt_sse4_1( +static inline void compute_stats_win7_opt_sse4_1( const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { @@ -194,7 +194,7 @@ static INLINE void compute_stats_win7_opt_sse4_1( } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd, +static inline void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd, const __m128i *shuffle, const __m128i *dgd_ijkl) { // Load 256 bits from dgd in two chunks @@ -234,7 +234,7 @@ static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd, xx_storeu_128(dst + 6, rhh); } -static INLINE void acc_stat_highbd_win7_one_line_sse4_1( +static inline void acc_stat_highbd_win7_one_line_sse4_1( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m128i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], @@ -324,7 +324,7 @@ static INLINE void acc_stat_highbd_win7_one_line_sse4_1( } } -static INLINE void compute_stats_highbd_win7_opt_sse4_1( +static inline void compute_stats_highbd_win7_opt_sse4_1( const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { @@ -382,7 +382,7 @@ static INLINE void 
compute_stats_highbd_win7_opt_sse4_1( } } -static INLINE void acc_stat_highbd_win5_one_line_sse4_1( +static inline void acc_stat_highbd_win5_one_line_sse4_1( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m128i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], @@ -465,7 +465,7 @@ static INLINE void acc_stat_highbd_win5_one_line_sse4_1( } } -static INLINE void compute_stats_highbd_win5_opt_sse4_1( +static inline void compute_stats_highbd_win5_opt_sse4_1( const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { @@ -549,7 +549,7 @@ void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, } #endif // CONFIG_AV1_HIGHBITDEPTH -static INLINE void acc_stat_win5_one_line_sse4_1( +static inline void acc_stat_win5_one_line_sse4_1( const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int dgd_stride, const __m128i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], @@ -619,7 +619,7 @@ static INLINE void acc_stat_win5_one_line_sse4_1( } } -static INLINE void compute_stats_win5_opt_sse4_1( +static inline void compute_stats_win5_opt_sse4_1( const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { @@ -730,7 +730,7 @@ void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd, } } -static INLINE __m128i pair_set_epi16(int a, int b) { +static inline __m128i pair_set_epi16(int a, int b) { return _mm_set1_epi32( (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))); } diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c index 96798576ef..d604fde009 100644 --- a/av1/encoder/x86/rdopt_avx2.c +++ b/av1/encoder/x86/rdopt_avx2.c @@ -21,7 +21,7 @@ // We actually use the 4x4 pixels to calculate correlations corresponding to // the top-left 3x3 pixels, so this function must be called with 1x1 overlap, // moving the window along/down by 3 pixels at a time. -INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, +inline static void horver_correlation_4x4(const int16_t *diff, int stride, __m256i *xy_sum_32, __m256i *xz_sum_32, __m256i *x_sum_32, __m256i *x2_sum_32) { diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c index 6258fb0f7d..20877b5649 100644 --- a/av1/encoder/x86/rdopt_sse4.c +++ b/av1/encoder/x86/rdopt_sse4.c @@ -20,7 +20,7 @@ // We actually use the 4x4 pixels to calculate correlations corresponding to // the top-left 3x3 pixels, so this function must be called with 1x1 overlap, // moving the window along/down by 3 pixels at a time. 
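The 1x1-overlap requirement above comes from the extra row and column the correlations reach into: pairing each of the top-left 3x3 pixels with its right and lower neighbour touches the full 4x4 window. A scalar rendering of the accumulated sums, assuming y denotes the horizontal neighbour and z the vertical one (an inference from the parameter names):

#include <stdint.h>

/* Scalar analogue of horver_correlation_4x4: accumulate x, x^2, x*y and
 * x*z over the top-left 3x3 of a 4x4 window. Sketch only. */
static void horver_correlation_3x3_sketch(const int16_t *diff, int stride,
                                          int64_t *x_sum, int64_t *x2_sum,
                                          int64_t *xy_sum, int64_t *xz_sum) {
  for (int i = 0; i < 3; ++i) {
    for (int j = 0; j < 3; ++j) {
      const int x = diff[i * stride + j];
      *x_sum += x;
      *x2_sum += x * x;
      *xy_sum += x * diff[i * stride + j + 1];   /* uses column 3 */
      *xz_sum += x * diff[(i + 1) * stride + j]; /* uses row 3 */
    }
  }
}
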
-INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, +inline static void horver_correlation_4x4(const int16_t *diff, int stride, __m128i *xy_sum_32, __m128i *xz_sum_32, __m128i *x_sum_32, __m128i *x2_sum_32) { diff --git a/av1/encoder/x86/reconinter_enc_sse2.c b/av1/encoder/x86/reconinter_enc_sse2.c index b8cbe0ca08..6251606e6c 100644 --- a/av1/encoder/x86/reconinter_enc_sse2.c +++ b/av1/encoder/x86/reconinter_enc_sse2.c @@ -148,7 +148,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, } #if CONFIG_AV1_HIGHBITDEPTH -static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, +static inline void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, const __m128i *w0, const __m128i *w1, const __m128i *r, diff --git a/av1/encoder/x86/reconinter_enc_ssse3.c b/av1/encoder/x86/reconinter_enc_ssse3.c index f34efb6db5..f31c0eaa7e 100644 --- a/av1/encoder/x86/reconinter_enc_ssse3.c +++ b/av1/encoder/x86/reconinter_enc_ssse3.c @@ -19,7 +19,7 @@ #include "aom_dsp/x86/synonyms.h" -static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, +static inline void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r, void *const result) { __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); diff --git a/av1/encoder/x86/wedge_utils_sse2.c b/av1/encoder/x86/wedge_utils_sse2.c index 3e2e3835df..d70793cec9 100644 --- a/av1/encoder/x86/wedge_utils_sse2.c +++ b/av1/encoder/x86/wedge_utils_sse2.c @@ -184,7 +184,7 @@ int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m, } // Negate under mask -static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) { +static inline __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) { return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w); } diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index 02a5f9f1c4..2f11828ff1 100644 --- a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake @@ -20,8 +20,6 @@ include("${AOM_ROOT}/build/cmake/util.cmake") # in this file. # -set_aom_detect_var(INLINE "inline" "Sets INLINE value for current target.") - # CPUs. 
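Removing the INLINE detect var above is what permits every static INLINE in this patch to become plain static inline: with C99 a hard requirement, the generated config header no longer needs to supply the keyword. Illustratively (the exact generated header contents are an assumption here):

/* Before: the build wrote a configured macro into config/aom_config.h,
 * roughly as follows, and sources spelled the keyword through it: */
#define INLINE inline
static INLINE int clamp_sketch(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* After: sources use the C99 keyword directly, so the detect var and its
 * special-casing in the asm config template (next hunk) both go away: */
static inline int clamp_sketch_c99(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}
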
set_aom_detect_var(AOM_ARCH_AARCH64 0 "Enables AArch64 architecture.") set_aom_detect_var(AOM_ARCH_ARM 0 "Enables ARM architecture.") diff --git a/build/cmake/generate_aom_config_templates.cmake b/build/cmake/generate_aom_config_templates.cmake index c27e2f27aa..743d007ddf 100644 --- a/build/cmake/generate_aom_config_templates.cmake +++ b/build/cmake/generate_aom_config_templates.cmake @@ -85,8 +85,7 @@ file(APPEND "${aom_config_h_template}" "\#endif // AOM_CONFIG_H_") set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake") file(WRITE "${aom_asm_config_template}" ${asm_file_header_block}) foreach(aom_var ${aom_build_vars}) - if(NOT "${aom_var}" STREQUAL "INLINE" - AND NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") + if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") file(APPEND "${aom_asm_config_template}" "${aom_var} equ \${${aom_var}}\n") endif() endforeach() diff --git a/test/av1_inv_txfm1d_test.cc b/test/av1_inv_txfm1d_test.cc index 13317f9ad8..156fb4096f 100644 --- a/test/av1_inv_txfm1d_test.cc +++ b/test/av1_inv_txfm1d_test.cc @@ -102,7 +102,7 @@ TEST(av1_inv_txfm1d, InvAccuracyCheck) { } } -static INLINE int get_max_bit(int x) { +static inline int get_max_bit(int x) { int max_bit = -1; while (x) { x = x >> 1; diff --git a/test/av1_txfm_test.h b/test/av1_txfm_test.h index 0a78ca34aa..a7f4a2b720 100644 --- a/test/av1_txfm_test.h +++ b/test/av1_txfm_test.h @@ -86,7 +86,7 @@ typedef void (*LbdInvTxfm2dFunc)(const int32_t *, uint8_t *, int, TX_TYPE, static const int bd = 10; static const int input_base = (1 << bd); -static INLINE bool IsTxSizeTypeValid(TX_SIZE tx_size, TX_TYPE tx_type) { +static inline bool IsTxSizeTypeValid(TX_SIZE tx_size, TX_TYPE tx_type) { const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; TxSetType tx_set_type; if (tx_size_sqr_up > TX_32X32) { diff --git a/test/tile_config_test.cc b/test/tile_config_test.cc index 7604f237df..ed3216f497 100644 --- a/test/tile_config_test.cc +++ b/test/tile_config_test.cc @@ -62,7 +62,7 @@ const nonUniformTileConfigParam nonUniformTileConfigParams[] = { }; // Find smallest k>=0 such that (blk_size << k) >= target -static INLINE int tile_log2(int blk_size, int target) { +static inline int tile_log2(int blk_size, int target) { int k; for (k = 0; (blk_size << k) < target; k++) { } diff --git a/test/util.h b/test/util.h index c60e961404..5ca5ed8930 100644 --- a/test/util.h +++ b/test/util.h @@ -52,7 +52,7 @@ inline double compute_psnr(const aom_image_t *img1, const aom_image_t *img2) { return psnr; } -static INLINE double get_time_mark(aom_usec_timer *t) { +static inline double get_time_mark(aom_usec_timer *t) { aom_usec_timer_mark(t); return static_cast<double>(aom_usec_timer_elapsed(t)); } diff --git a/third_party/SVT-AV1/EbMemory_AVX2.h b/third_party/SVT-AV1/EbMemory_AVX2.h index 0d0ea10abc..bd0c4fb0fd 100644 --- a/third_party/SVT-AV1/EbMemory_AVX2.h +++ b/third_party/SVT-AV1/EbMemory_AVX2.h @@ -29,7 +29,7 @@ _mm256_set_m128i((hi), (lo)) #endif -static INLINE __m256i load_u8_4x2_avx2(const uint8_t *const src, +static inline __m256i load_u8_4x2_avx2(const uint8_t *const src, const ptrdiff_t stride) { __m128i src01; src01 = _mm_cvtsi32_si128(*(int32_t *)(src + 0 * stride)); @@ -37,7 +37,7 @@ static INLINE __m256i load_u8_4x2_avx2(const uint8_t *const src, return _mm256_setr_m128i(src01, _mm_setzero_si128()); } -static INLINE __m256i load_u8_4x4_avx2(const uint8_t *const src, +static inline __m256i load_u8_4x4_avx2(const uint8_t *const src, const ptrdiff_t stride) { __m128i src01, src23; src01 = 
_mm_cvtsi32_si128(*(int32_t *)(src + 0 * stride)); @@ -47,14 +47,14 @@ static INLINE __m256i load_u8_4x4_avx2(const uint8_t *const src, return _mm256_setr_m128i(src01, src23); } -static INLINE __m256i load_u8_8x2_avx2(const uint8_t *const src, +static inline __m256i load_u8_8x2_avx2(const uint8_t *const src, const ptrdiff_t stride) { const __m128i src0 = _mm_loadl_epi64((__m128i *)(src + 0 * stride)); const __m128i src1 = _mm_loadl_epi64((__m128i *)(src + 1 * stride)); return _mm256_setr_m128i(src0, src1); } -static INLINE __m256i load_u8_8x4_avx2(const uint8_t *const src, +static inline __m256i load_u8_8x4_avx2(const uint8_t *const src, const ptrdiff_t stride) { __m128i src01, src23; src01 = _mm_loadl_epi64((__m128i *)(src + 0 * stride)); @@ -66,7 +66,7 @@ static INLINE __m256i load_u8_8x4_avx2(const uint8_t *const src, return _mm256_setr_m128i(src01, src23); } -static INLINE __m256i loadu_8bit_16x2_avx2(const void *const src, +static inline __m256i loadu_8bit_16x2_avx2(const void *const src, const ptrdiff_t strideInByte) { const __m128i src0 = _mm_loadu_si128((__m128i *)src); const __m128i src1 = @@ -74,17 +74,17 @@ static INLINE __m256i loadu_8bit_16x2_avx2(const void *const src, return _mm256_setr_m128i(src0, src1); } -static INLINE __m256i loadu_u8_16x2_avx2(const uint8_t *const src, +static inline __m256i loadu_u8_16x2_avx2(const uint8_t *const src, const ptrdiff_t stride) { return loadu_8bit_16x2_avx2(src, sizeof(*src) * stride); } -static INLINE __m256i loadu_u16_8x2_avx2(const uint16_t *const src, +static inline __m256i loadu_u16_8x2_avx2(const uint16_t *const src, const ptrdiff_t stride) { return loadu_8bit_16x2_avx2(src, sizeof(*src) * stride); } -static INLINE void storeu_8bit_16x2_avx2(const __m256i src, void *const dst, +static inline void storeu_8bit_16x2_avx2(const __m256i src, void *const dst, const ptrdiff_t strideInByte) { const __m128i d0 = _mm256_castsi256_si128(src); const __m128i d1 = _mm256_extracti128_si256(src, 1); @@ -92,17 +92,17 @@ static INLINE void storeu_8bit_16x2_avx2(const __m256i src, void *const dst, _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1); } -static INLINE void storeu_u8_16x2_avx2(const __m256i src, uint8_t *const dst, +static inline void storeu_u8_16x2_avx2(const __m256i src, uint8_t *const dst, const ptrdiff_t stride) { storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride); } -static INLINE void storeu_s16_8x2_avx2(const __m256i src, int16_t *const dst, +static inline void storeu_s16_8x2_avx2(const __m256i src, int16_t *const dst, const ptrdiff_t stride) { storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride); } -static INLINE void storeu_u16_8x2_avx2(const __m256i src, uint16_t *const dst, +static inline void storeu_u16_8x2_avx2(const __m256i src, uint16_t *const dst, const ptrdiff_t stride) { storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride); } diff --git a/third_party/SVT-AV1/EbMemory_SSE4_1.h b/third_party/SVT-AV1/EbMemory_SSE4_1.h index 8c51673f12..a1ab531525 100644 --- a/third_party/SVT-AV1/EbMemory_SSE4_1.h +++ b/third_party/SVT-AV1/EbMemory_SSE4_1.h @@ -20,18 +20,18 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/mem_sse2.h" -static INLINE __m128i load8bit_4x2_sse4_1(const void *const src, +static inline __m128i load8bit_4x2_sse4_1(const void *const src, const ptrdiff_t strideInByte) { const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + strideInByte), 1); } -static INLINE __m128i load_u8_4x2_sse4_1(const uint8_t *const src, +static inline __m128i 
load_u8_4x2_sse4_1(const uint8_t *const src, const ptrdiff_t stride) { return load8bit_4x2_sse4_1(src, sizeof(*src) * stride); } -static INLINE __m128i load_u16_2x2_sse4_1(const uint16_t *const src, +static inline __m128i load_u16_2x2_sse4_1(const uint16_t *const src, const ptrdiff_t stride) { return load8bit_4x2_sse4_1(src, sizeof(*src) * stride); } diff --git a/third_party/SVT-AV1/convolve_avx2.h b/third_party/SVT-AV1/convolve_avx2.h index da7e7c091f..0e7e436ec2 100644 --- a/third_party/SVT-AV1/convolve_avx2.h +++ b/third_party/SVT-AV1/convolve_avx2.h @@ -20,7 +20,7 @@ #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/mem_sse2.h" -static INLINE void populate_coeffs_4tap_avx2(const __m128i coeffs_128, +static inline void populate_coeffs_4tap_avx2(const __m128i coeffs_128, __m256i coeffs[2]) { const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128); @@ -30,7 +30,7 @@ static INLINE void populate_coeffs_4tap_avx2(const __m128i coeffs_128, coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u)); } -static INLINE void populate_coeffs_6tap_avx2(const __m128i coeffs_128, +static inline void populate_coeffs_6tap_avx2(const __m128i coeffs_128, __m256i coeffs[3]) { const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128); @@ -42,7 +42,7 @@ static INLINE void populate_coeffs_6tap_avx2(const __m128i coeffs_128, coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au)); } -static INLINE void populate_coeffs_8tap_avx2(const __m128i coeffs_128, +static inline void populate_coeffs_8tap_avx2(const __m128i coeffs_128, __m256i coeffs[4]) { const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128); @@ -56,7 +56,7 @@ static INLINE void populate_coeffs_8tap_avx2(const __m128i coeffs_128, coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu)); } -static INLINE void prepare_half_coeffs_2tap_ssse3( +static inline void prepare_half_coeffs_2tap_ssse3( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m128i *const coeffs /* [1] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -77,7 +77,7 @@ static INLINE void prepare_half_coeffs_2tap_ssse3( *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u)); } -static INLINE void prepare_half_coeffs_4tap_ssse3( +static inline void prepare_half_coeffs_4tap_ssse3( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m128i *const coeffs /* [2] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -100,7 +100,7 @@ static INLINE void prepare_half_coeffs_4tap_ssse3( coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); } -static INLINE void prepare_half_coeffs_6tap_ssse3( +static inline void prepare_half_coeffs_6tap_ssse3( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m128i *const coeffs /* [3] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -125,7 +125,7 @@ static INLINE void prepare_half_coeffs_6tap_ssse3( coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au)); } -static INLINE void prepare_half_coeffs_8tap_ssse3( +static inline void prepare_half_coeffs_8tap_ssse3( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -152,7 +152,7 @@ static INLINE void prepare_half_coeffs_8tap_ssse3( coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu)); } -static INLINE void prepare_half_coeffs_2tap_avx2( 
+static inline void prepare_half_coeffs_2tap_avx2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m256i *const coeffs /* [1] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -174,7 +174,7 @@ static INLINE void prepare_half_coeffs_2tap_avx2( *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); } -static INLINE void prepare_half_coeffs_4tap_avx2( +static inline void prepare_half_coeffs_4tap_avx2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m256i *const coeffs /* [2] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -192,7 +192,7 @@ static INLINE void prepare_half_coeffs_4tap_avx2( populate_coeffs_4tap_avx2(coeffs_1, coeffs); } -static INLINE void prepare_half_coeffs_6tap_avx2( +static inline void prepare_half_coeffs_6tap_avx2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m256i *const coeffs /* [3] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -210,7 +210,7 @@ static INLINE void prepare_half_coeffs_6tap_avx2( populate_coeffs_6tap_avx2(coeffs_1, coeffs); } -static INLINE void prepare_half_coeffs_8tap_avx2( +static inline void prepare_half_coeffs_8tap_avx2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -228,7 +228,7 @@ static INLINE void prepare_half_coeffs_8tap_avx2( populate_coeffs_8tap_avx2(coeffs_1, coeffs); } -static INLINE void prepare_coeffs_2tap_sse2( +static inline void prepare_coeffs_2tap_sse2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m128i *const coeffs /* [1] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -240,7 +240,7 @@ static INLINE void prepare_coeffs_2tap_sse2( coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); } -static INLINE void prepare_coeffs_4tap_sse2( +static inline void prepare_coeffs_4tap_sse2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m128i *const coeffs /* [2] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -254,7 +254,7 @@ static INLINE void prepare_coeffs_4tap_sse2( coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa); } -static INLINE void prepare_coeffs_6tap_ssse3( +static inline void prepare_coeffs_6tap_ssse3( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m128i *const coeffs /* [3] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -269,7 +269,7 @@ static INLINE void prepare_coeffs_6tap_ssse3( coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au)); } -static INLINE void prepare_coeffs_8tap_sse2( +static inline void prepare_coeffs_8tap_sse2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -287,7 +287,7 @@ static INLINE void prepare_coeffs_8tap_sse2( coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); } -static INLINE void prepare_coeffs_2tap_avx2( +static inline void prepare_coeffs_2tap_avx2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m256i *const coeffs /* [1] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -300,7 +300,7 @@ static INLINE void prepare_coeffs_2tap_avx2( coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); } -static INLINE void prepare_coeffs_4tap_avx2( +static inline void prepare_coeffs_4tap_avx2( const InterpFilterParams 
*const filter_params, const int32_t subpel_q4, __m256i *const coeffs /* [2] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -315,7 +315,7 @@ static INLINE void prepare_coeffs_4tap_avx2( coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa); } -static INLINE void prepare_coeffs_6tap_avx2( +static inline void prepare_coeffs_6tap_avx2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m256i *const coeffs /* [3]*/) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( @@ -331,7 +331,7 @@ static INLINE void prepare_coeffs_6tap_avx2( coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au)); } -static INLINE void prepare_coeffs_8tap_avx2( +static inline void prepare_coeffs_8tap_avx2( const InterpFilterParams *const filter_params, const int32_t subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( @@ -350,7 +350,7 @@ static INLINE void prepare_coeffs_8tap_avx2( coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); } -static INLINE void load_16bit_5rows_avx2(const int16_t *const src, +static inline void load_16bit_5rows_avx2(const int16_t *const src, const ptrdiff_t stride, __m256i dst[5]) { dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride)); @@ -360,7 +360,7 @@ static INLINE void load_16bit_5rows_avx2(const int16_t *const src, dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride)); } -static INLINE void load_16bit_7rows_avx2(const int16_t *const src, +static inline void load_16bit_7rows_avx2(const int16_t *const src, const ptrdiff_t stride, __m256i dst[7]) { dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride)); @@ -419,7 +419,7 @@ static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2( tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]); } -static INLINE void convolve_8tap_unpack_avx2(const __m256i s[6], +static inline void convolve_8tap_unpack_avx2(const __m256i s[6], __m256i ss[7]) { ss[0] = _mm256_unpacklo_epi16(s[0], s[1]); ss[1] = _mm256_unpacklo_epi16(s[2], s[3]); @@ -429,19 +429,19 @@ static INLINE void convolve_8tap_unpack_avx2(const __m256i s[6], ss[6] = _mm256_unpackhi_epi16(s[4], s[5]); } -static INLINE __m128i convolve_2tap_ssse3(const __m128i ss[1], +static inline __m128i convolve_2tap_ssse3(const __m128i ss[1], const __m128i coeffs[1]) { return _mm_maddubs_epi16(ss[0], coeffs[0]); } -static INLINE __m128i convolve_4tap_ssse3(const __m128i ss[2], +static inline __m128i convolve_4tap_ssse3(const __m128i ss[2], const __m128i coeffs[2]) { const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]); const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]); return _mm_add_epi16(res_23, res_45); } -static INLINE __m128i convolve_6tap_ssse3(const __m128i ss[3], +static inline __m128i convolve_6tap_ssse3(const __m128i ss[3], const __m128i coeffs[3]) { const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]); const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]); @@ -450,7 +450,7 @@ static INLINE __m128i convolve_6tap_ssse3(const __m128i ss[3], return _mm_add_epi16(res_1256, res_34); } -static INLINE __m128i convolve_8tap_ssse3(const __m128i ss[4], +static inline __m128i convolve_8tap_ssse3(const __m128i ss[4], const __m128i coeffs[4]) { const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]); const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]); @@ -461,19 +461,19 @@ static INLINE __m128i convolve_8tap_ssse3(const __m128i ss[4], return _mm_add_epi16(res_0145, res_2367); } -static INLINE __m256i convolve_2tap_avx2(const __m256i ss[1], +static 
inline __m256i convolve_2tap_avx2(const __m256i ss[1], const __m256i coeffs[1]) { return _mm256_maddubs_epi16(ss[0], coeffs[0]); } -static INLINE __m256i convolve_4tap_avx2(const __m256i ss[2], +static inline __m256i convolve_4tap_avx2(const __m256i ss[2], const __m256i coeffs[2]) { const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]); const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]); return _mm256_add_epi16(res_23, res_45); } -static INLINE __m256i convolve_6tap_avx2(const __m256i ss[3], +static inline __m256i convolve_6tap_avx2(const __m256i ss[3], const __m256i coeffs[3]) { const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]); const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]); @@ -482,7 +482,7 @@ static INLINE __m256i convolve_6tap_avx2(const __m256i ss[3], return _mm256_add_epi16(res_0145, res_23); } -static INLINE __m256i convolve_8tap_avx2(const __m256i ss[4], +static inline __m256i convolve_8tap_avx2(const __m256i ss[4], const __m256i coeffs[4]) { const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]); const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]); @@ -493,19 +493,19 @@ static INLINE __m256i convolve_8tap_avx2(const __m256i ss[4], return _mm256_add_epi16(res_0145, res_2367); } -static INLINE __m128i convolve16_2tap_sse2(const __m128i ss[1], +static inline __m128i convolve16_2tap_sse2(const __m128i ss[1], const __m128i coeffs[1]) { return _mm_madd_epi16(ss[0], coeffs[0]); } -static INLINE __m128i convolve16_4tap_sse2(const __m128i ss[2], +static inline __m128i convolve16_4tap_sse2(const __m128i ss[2], const __m128i coeffs[2]) { const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]); const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]); return _mm_add_epi32(res_01, res_23); } -static INLINE __m128i convolve16_6tap_sse2(const __m128i ss[3], +static inline __m128i convolve16_6tap_sse2(const __m128i ss[3], const __m128i coeffs[3]) { const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]); const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]); @@ -514,7 +514,7 @@ static INLINE __m128i convolve16_6tap_sse2(const __m128i ss[3], return _mm_add_epi32(res_0123, res_45); } -static INLINE __m128i convolve16_8tap_sse2(const __m128i ss[4], +static inline __m128i convolve16_8tap_sse2(const __m128i ss[4], const __m128i coeffs[4]) { const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]); const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]); @@ -525,19 +525,19 @@ static INLINE __m128i convolve16_8tap_sse2(const __m128i ss[4], return _mm_add_epi32(res_0123, res_4567); } -static INLINE __m256i convolve16_2tap_avx2(const __m256i ss[1], +static inline __m256i convolve16_2tap_avx2(const __m256i ss[1], const __m256i coeffs[1]) { return _mm256_madd_epi16(ss[0], coeffs[0]); } -static INLINE __m256i convolve16_4tap_avx2(const __m256i ss[2], +static inline __m256i convolve16_4tap_avx2(const __m256i ss[2], const __m256i coeffs[2]) { const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]); const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]); return _mm256_add_epi32(res_1, res_2); } -static INLINE __m256i convolve16_6tap_avx2(const __m256i ss[3], +static inline __m256i convolve16_6tap_avx2(const __m256i ss[3], const __m256i coeffs[3]) { const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]); const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]); @@ -546,7 +546,7 @@ static INLINE __m256i convolve16_6tap_avx2(const __m256i ss[3], return _mm256_add_epi32(res_0123, res_45); } -static INLINE __m256i convolve16_8tap_avx2(const 
__m256i ss[4], +static inline __m256i convolve16_8tap_avx2(const __m256i ss[4], const __m256i coeffs[4]) { const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]); const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]); @@ -557,7 +557,7 @@ static INLINE __m256i convolve16_8tap_avx2(const __m256i ss[4], return _mm256_add_epi32(res_0123, res_4567); } -static INLINE __m256i x_convolve_4tap_avx2(const __m256i data, +static inline __m256i x_convolve_4tap_avx2(const __m256i data, const __m256i coeffs[2], const __m256i filt[2]) { __m256i ss[2]; @@ -568,7 +568,7 @@ static INLINE __m256i x_convolve_4tap_avx2(const __m256i data, return convolve_4tap_avx2(ss, coeffs); } -static INLINE __m256i x_convolve_6tap_avx2(const __m256i data, +static inline __m256i x_convolve_6tap_avx2(const __m256i data, const __m256i coeffs[3], const __m256i filt[3]) { __m256i ss[3]; @@ -580,7 +580,7 @@ static INLINE __m256i x_convolve_6tap_avx2(const __m256i data, return convolve_6tap_avx2(ss, coeffs); } -static INLINE __m256i x_convolve_8tap_avx2(const __m256i data, +static inline __m256i x_convolve_8tap_avx2(const __m256i data, const __m256i coeffs[4], const __m256i filt[4]) { __m256i ss[4]; @@ -593,37 +593,37 @@ static INLINE __m256i x_convolve_8tap_avx2(const __m256i data, return convolve_8tap_avx2(ss, coeffs); } -static INLINE __m256i sr_y_round_avx2(const __m256i src) { +static inline __m256i sr_y_round_avx2(const __m256i src) { const __m256i round = _mm256_set1_epi16(32); const __m256i dst = _mm256_add_epi16(src, round); return _mm256_srai_epi16(dst, FILTER_BITS - 1); } -static INLINE __m128i xy_x_round_sse2(const __m128i src) { +static inline __m128i xy_x_round_sse2(const __m128i src) { const __m128i round = _mm_set1_epi16(2); const __m128i dst = _mm_add_epi16(src, round); return _mm_srai_epi16(dst, 2); } -static INLINE __m256i xy_x_round_avx2(const __m256i src) { +static inline __m256i xy_x_round_avx2(const __m256i src) { const __m256i round = _mm256_set1_epi16(2); const __m256i dst = _mm256_add_epi16(src, round); return _mm256_srai_epi16(dst, 2); } -static INLINE void xy_x_round_store_2x2_sse2(const __m128i res, +static inline void xy_x_round_store_2x2_sse2(const __m128i res, int16_t *const dst) { const __m128i d = xy_x_round_sse2(res); _mm_storel_epi64((__m128i *)dst, d); } -static INLINE void xy_x_round_store_4x2_sse2(const __m128i res, +static inline void xy_x_round_store_4x2_sse2(const __m128i res, int16_t *const dst) { const __m128i d = xy_x_round_sse2(res); _mm_storeu_si128((__m128i *)dst, d); } -static INLINE void xy_x_round_store_8x2_sse2(const __m128i res[2], +static inline void xy_x_round_store_8x2_sse2(const __m128i res[2], int16_t *const dst) { __m128i r[2]; @@ -633,13 +633,13 @@ static INLINE void xy_x_round_store_8x2_sse2(const __m128i res[2], _mm_storeu_si128((__m128i *)(dst + 8), r[1]); } -static INLINE void xy_x_round_store_8x2_avx2(const __m256i res, +static inline void xy_x_round_store_8x2_avx2(const __m256i res, int16_t *const dst) { const __m256i d = xy_x_round_avx2(res); _mm256_storeu_si256((__m256i *)dst, d); } -static INLINE void xy_x_round_store_32_avx2(const __m256i res[2], +static inline void xy_x_round_store_32_avx2(const __m256i res[2], int16_t *const dst) { __m256i r[2]; @@ -653,50 +653,50 @@ static INLINE void xy_x_round_store_32_avx2(const __m256i res[2], _mm256_storeu_si256((__m256i *)(dst + 16), d1); } -static INLINE __m128i xy_y_round_sse2(const __m128i src) { +static inline __m128i xy_y_round_sse2(const __m128i src) { const __m128i round = _mm_set1_epi32(1024); const 
__m128i dst = _mm_add_epi32(src, round); return _mm_srai_epi32(dst, 11); } -static INLINE __m128i xy_y_round_half_pel_sse2(const __m128i src) { +static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) { const __m128i round = _mm_set1_epi16(16); const __m128i dst = _mm_add_epi16(src, round); return _mm_srai_epi16(dst, 5); } -static INLINE __m256i xy_y_round_avx2(const __m256i src) { +static inline __m256i xy_y_round_avx2(const __m256i src) { const __m256i round = _mm256_set1_epi32(1024); const __m256i dst = _mm256_add_epi32(src, round); return _mm256_srai_epi32(dst, 11); } -static INLINE __m256i xy_y_round_16_avx2(const __m256i r[2]) { +static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) { const __m256i r0 = xy_y_round_avx2(r[0]); const __m256i r1 = xy_y_round_avx2(r[1]); return _mm256_packs_epi32(r0, r1); } -static INLINE __m256i xy_y_round_half_pel_avx2(const __m256i src) { +static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) { const __m256i round = _mm256_set1_epi16(16); const __m256i dst = _mm256_add_epi16(src, round); return _mm256_srai_epi16(dst, 5); } -static INLINE void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst, +static inline void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst, const ptrdiff_t stride) { const __m128i d = _mm_packus_epi16(res, res); *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d); *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1); } -static INLINE void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst, +static inline void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst, const ptrdiff_t stride) { const __m128i d = _mm_packus_epi16(res, res); store_u8_4x2_sse2(d, dst, stride); } -static INLINE void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst, +static inline void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst, const ptrdiff_t stride) { const __m256i d = _mm256_packus_epi16(res, res); const __m128i d0 = _mm256_castsi256_si128(d); @@ -706,7 +706,7 @@ static INLINE void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst, xx_storel_32(dst + stride, d1); } -static INLINE void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst, +static inline void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst, const ptrdiff_t stride) { const __m256i d = _mm256_packus_epi16(res, res); const __m128i d0 = _mm256_castsi256_si128(d); @@ -715,14 +715,14 @@ static INLINE void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst, _mm_storel_epi64((__m128i *)(dst + stride), d1); } -static INLINE void pack_store_16x2_avx2(const __m256i res0, const __m256i res1, +static inline void pack_store_16x2_avx2(const __m256i res0, const __m256i res1, uint8_t *const dst, const ptrdiff_t stride) { const __m256i d = _mm256_packus_epi16(res0, res1); storeu_u8_16x2_avx2(d, dst, stride); } -static INLINE void xy_y_pack_store_16x2_avx2(const __m256i res0, +static inline void xy_y_pack_store_16x2_avx2(const __m256i res0, const __m256i res1, uint8_t *const dst, const ptrdiff_t stride) { @@ -731,14 +731,14 @@ static INLINE void xy_y_pack_store_16x2_avx2(const __m256i res0, storeu_u8_16x2_avx2(d, dst, stride); } -static INLINE void pack_store_32_avx2(const __m256i res0, const __m256i res1, +static inline void pack_store_32_avx2(const __m256i res0, const __m256i res1, uint8_t *const dst) { const __m256i t = _mm256_packus_epi16(res0, res1); const __m256i d = _mm256_permute4x64_epi64(t, 0xD8); _mm256_storeu_si256((__m256i *)dst, d); } -static INLINE void 
xy_y_round_store_2x2_sse2(const __m128i res, +static inline void xy_y_round_store_2x2_sse2(const __m128i res, uint8_t *const dst, const ptrdiff_t stride) { const __m128i r = xy_y_round_sse2(res); @@ -746,7 +746,7 @@ static INLINE void xy_y_round_store_2x2_sse2(const __m128i res, pack_store_2x2_sse2(rr, dst, stride); } -static INLINE void xy_y_round_store_4x2_avx2(const __m256i res, +static inline void xy_y_round_store_4x2_avx2(const __m256i res, uint8_t *const dst, const ptrdiff_t stride) { const __m256i r = xy_y_round_avx2(res); @@ -754,7 +754,7 @@ static INLINE void xy_y_round_store_4x2_avx2(const __m256i res, pack_store_4x2_avx2(rr, dst, stride); } -static INLINE void xy_y_pack_store_32_avx2(const __m256i res0, +static inline void xy_y_pack_store_32_avx2(const __m256i res0, const __m256i res1, uint8_t *const dst) { const __m256i d = _mm256_packus_epi16(res0, res1); @@ -762,7 +762,7 @@ static INLINE void xy_y_pack_store_32_avx2(const __m256i res0, _mm256_storeu_si256((__m256i *)dst, d); } -static INLINE void xy_y_round_store_32_avx2(const __m256i r0[2], +static inline void xy_y_round_store_32_avx2(const __m256i r0[2], const __m256i r1[2], uint8_t *const dst) { const __m256i ra = xy_y_round_16_avx2(r0); @@ -770,39 +770,39 @@ static INLINE void xy_y_round_store_32_avx2(const __m256i r0[2], xy_y_pack_store_32_avx2(ra, rb, dst); } -static INLINE void convolve_store_32_avx2(const __m256i res0, +static inline void convolve_store_32_avx2(const __m256i res0, const __m256i res1, uint8_t *const dst) { const __m256i d = _mm256_packus_epi16(res0, res1); _mm256_storeu_si256((__m256i *)dst, d); } -static INLINE __m128i sr_x_round_sse2(const __m128i src) { +static inline __m128i sr_x_round_sse2(const __m128i src) { const __m128i round = _mm_set1_epi16(34); const __m128i dst = _mm_add_epi16(src, round); return _mm_srai_epi16(dst, 6); } -static INLINE __m256i sr_x_round_avx2(const __m256i src) { +static inline __m256i sr_x_round_avx2(const __m256i src) { const __m256i round = _mm256_set1_epi16(34); const __m256i dst = _mm256_add_epi16(src, round); return _mm256_srai_epi16(dst, 6); } -static INLINE __m128i sr_y_round_sse2(const __m128i src) { +static inline __m128i sr_y_round_sse2(const __m128i src) { const __m128i round = _mm_set1_epi16(32); const __m128i dst = _mm_add_epi16(src, round); return _mm_srai_epi16(dst, FILTER_BITS - 1); } -static INLINE void sr_x_round_store_8x2_avx2(const __m256i res, +static inline void sr_x_round_store_8x2_avx2(const __m256i res, uint8_t *const dst, const ptrdiff_t dst_stride) { const __m256i r = sr_x_round_avx2(res); pack_store_8x2_avx2(r, dst, dst_stride); } -static INLINE void sr_x_round_store_16x2_avx2(const __m256i res[2], +static inline void sr_x_round_store_16x2_avx2(const __m256i res[2], uint8_t *const dst, const ptrdiff_t dst_stride) { __m256i r[2]; @@ -812,7 +812,7 @@ static INLINE void sr_x_round_store_16x2_avx2(const __m256i res[2], pack_store_16x2_avx2(r[0], r[1], dst, dst_stride); } -static INLINE void sr_x_round_store_32_avx2(const __m256i res[2], +static inline void sr_x_round_store_32_avx2(const __m256i res[2], uint8_t *const dst) { __m256i r[2]; @@ -821,14 +821,14 @@ static INLINE void sr_x_round_store_32_avx2(const __m256i res[2], convolve_store_32_avx2(r[0], r[1], dst); } -static INLINE void sr_y_round_store_8x2_avx2(const __m256i res, +static inline void sr_y_round_store_8x2_avx2(const __m256i res, uint8_t *const dst, const ptrdiff_t dst_stride) { const __m256i r = sr_y_round_avx2(res); pack_store_8x2_avx2(r, dst, dst_stride); } -static INLINE void 
sr_y_round_store_16x2_avx2(const __m256i res[2], +static inline void sr_y_round_store_16x2_avx2(const __m256i res[2], uint8_t *const dst, const ptrdiff_t dst_stride) { __m256i r[2]; @@ -838,7 +838,7 @@ static INLINE void sr_y_round_store_16x2_avx2(const __m256i res[2], pack_store_16x2_avx2(r[0], r[1], dst, dst_stride); } -static INLINE void sr_y_2tap_32_avg_avx2(const uint8_t *const src, +static inline void sr_y_2tap_32_avg_avx2(const uint8_t *const src, const __m256i s0, __m256i *const s1, uint8_t *const dst) { *s1 = _mm256_loadu_si256((__m256i *)src); @@ -846,7 +846,7 @@ static INLINE void sr_y_2tap_32_avg_avx2(const uint8_t *const src, _mm256_storeu_si256((__m256i *)dst, d); } -static INLINE void sr_x_2tap_32_avg_avx2(const uint8_t *const src, +static inline void sr_x_2tap_32_avg_avx2(const uint8_t *const src, uint8_t *const dst) { const __m256i s0 = _mm256_loadu_si256((__m256i *)src); const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1)); @@ -854,7 +854,7 @@ static INLINE void sr_x_2tap_32_avg_avx2(const uint8_t *const src, _mm256_storeu_si256((__m256i *)dst, d); } -static INLINE __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src, +static inline __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[1]) { const __m128i sfl = @@ -864,7 +864,7 @@ static INLINE __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src, return convolve_2tap_ssse3(&ss, coeffs); } -static INLINE __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src, +static inline __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[1]) { const __m128i sfl = @@ -874,7 +874,7 @@ static INLINE __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src, return convolve_2tap_ssse3(&ss, coeffs); } -static INLINE void x_convolve_2tap_8x2_ssse3(const uint8_t *const src, +static inline void x_convolve_2tap_8x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[1], __m128i r[2]) { @@ -890,7 +890,7 @@ static INLINE void x_convolve_2tap_8x2_ssse3(const uint8_t *const src, r[1] = convolve_2tap_ssse3(&ss[1], coeffs); } -static INLINE __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src, +static inline __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[1]) { __m128i s_128[2][2]; @@ -906,7 +906,7 @@ static INLINE __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src, return convolve_2tap_avx2(&ss, coeffs); } -static INLINE void x_convolve_2tap_16x2_avx2(const uint8_t *const src, +static inline void x_convolve_2tap_16x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[1], __m256i r[2]) { @@ -918,7 +918,7 @@ static INLINE void x_convolve_2tap_16x2_avx2(const uint8_t *const src, r[1] = convolve_2tap_avx2(&s1, coeffs); } -static INLINE void x_convolve_2tap_32_avx2(const uint8_t *const src, +static inline void x_convolve_2tap_32_avx2(const uint8_t *const src, const __m256i coeffs[1], __m256i r[2]) { const __m256i s0 = _mm256_loadu_si256((__m256i *)src); @@ -930,7 +930,7 @@ static INLINE void x_convolve_2tap_32_avx2(const uint8_t *const src, r[1] = convolve_2tap_avx2(&ss1, coeffs); } -static INLINE __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src, +static inline __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[2]) { const __m128i sfl0 = @@ -945,7 +945,7 @@ static INLINE __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src, return 
convolve_4tap_ssse3(ss, coeffs); } -static INLINE __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src, +static inline __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[2]) { const __m128i s = load_u8_8x2_sse2(src, stride); @@ -960,7 +960,7 @@ static INLINE __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src, return convolve_4tap_ssse3(ss, coeffs); } -static INLINE __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src, +static inline __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2], const __m256i filt[2]) { @@ -968,7 +968,7 @@ static INLINE __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src, return x_convolve_4tap_avx2(s_256, coeffs, filt); } -static INLINE void x_convolve_4tap_16x2_avx2(const uint8_t *const src, +static inline void x_convolve_4tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[2], const __m256i filt[2], @@ -977,7 +977,7 @@ static INLINE void x_convolve_4tap_16x2_avx2(const uint8_t *const src, r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt); } -static INLINE void x_convolve_4tap_32_avx2(const uint8_t *const src, +static inline void x_convolve_4tap_32_avx2(const uint8_t *const src, const __m256i coeffs[2], const __m256i filt[2], __m256i r[2]) { @@ -988,7 +988,7 @@ static INLINE void x_convolve_4tap_32_avx2(const uint8_t *const src, r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt); } -static INLINE __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src, +static inline __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[3]) { const __m128i sfl0 = @@ -1007,7 +1007,7 @@ static INLINE __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src, return convolve_6tap_ssse3(ss, coeffs); } -static INLINE __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src, +static inline __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[3]) { const __m128i s = load_u8_8x2_sse2(src, stride); @@ -1025,7 +1025,7 @@ static INLINE __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src, return convolve_6tap_ssse3(ss, coeffs); } -static INLINE __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src, +static inline __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3], const __m256i filt[3]) { @@ -1033,7 +1033,7 @@ static INLINE __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src, return x_convolve_6tap_avx2(s_256, coeffs, filt); } -static INLINE void x_convolve_6tap_16x2_avx2(const uint8_t *const src, +static inline void x_convolve_6tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3], const __m256i filt[3], @@ -1042,7 +1042,7 @@ static INLINE void x_convolve_6tap_16x2_avx2(const uint8_t *const src, r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt); } -static INLINE void x_convolve_6tap_32_avx2(const uint8_t *const src, +static inline void x_convolve_6tap_32_avx2(const uint8_t *const src, const __m256i coeffs[3], const __m256i filt[3], __m256i r[2]) { @@ -1053,7 +1053,7 @@ static INLINE void x_convolve_6tap_32_avx2(const uint8_t *const src, r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt); } -static INLINE __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src, +static inline __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i 
coeffs[4], const __m256i filt[4]) { @@ -1081,7 +1081,7 @@ static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src, r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt); } -static INLINE __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src, +static inline __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[1], __m128i s_16[2]) { @@ -1095,7 +1095,7 @@ static INLINE __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src, return convolve_2tap_ssse3(&ss, coeffs); } -static INLINE __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src, +static inline __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[1], __m128i s_32[2]) { @@ -1109,7 +1109,7 @@ static INLINE __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src, return convolve_2tap_ssse3(&ss, coeffs); } -static INLINE __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src, +static inline __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[1], __m128i s_64[2]) { @@ -1123,7 +1123,7 @@ static INLINE __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src, return convolve_2tap_avx2(&ss, coeffs); } -static INLINE void y_convolve_2tap_16x2_avx2(const uint8_t *const src, +static inline void y_convolve_2tap_16x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[1], __m128i s_128[2], __m256i r[2]) { @@ -1139,7 +1139,7 @@ static INLINE void y_convolve_2tap_16x2_avx2(const uint8_t *const src, r[1] = convolve_2tap_avx2(&ss1, coeffs); } -static INLINE void y_convolve_2tap_32_avx2(const uint8_t *const src, +static inline void y_convolve_2tap_32_avx2(const uint8_t *const src, const __m256i coeffs[1], const __m256i s0, __m256i *const s1, __m256i r[2]) { @@ -1150,7 +1150,7 @@ static INLINE void y_convolve_2tap_32_avx2(const uint8_t *const src, r[1] = convolve_2tap_avx2(&ss1, coeffs); } -static INLINE __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src, +static inline __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[2], __m128i s_16[4], @@ -1163,7 +1163,7 @@ static INLINE __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src, return convolve_4tap_ssse3(ss_128, coeffs); } -static INLINE __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src, +static inline __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[2], __m128i s_32[4], @@ -1176,7 +1176,7 @@ static INLINE __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src, return convolve_4tap_ssse3(ss_128, coeffs); } -static INLINE __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src, +static inline __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2], __m128i s_64[4], @@ -1189,7 +1189,7 @@ static INLINE __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src, return convolve_4tap_avx2(ss_256, coeffs); } -static INLINE void y_convolve_4tap_16x2_avx2(const uint8_t *const src, +static inline void y_convolve_4tap_16x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2], __m128i s_128[4], @@ -1204,7 +1204,7 @@ static INLINE void y_convolve_4tap_16x2_avx2(const uint8_t *const src, r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs); } -static INLINE __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src, +static inline __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src, const 
ptrdiff_t stride, const __m128i coeffs[3], __m128i s_16[6], @@ -1217,7 +1217,7 @@ static INLINE __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src, return convolve_6tap_ssse3(ss_128, coeffs); } -static INLINE void y_convolve_4tap_32x2_avx2( +static inline void y_convolve_4tap_32x2_avx2( const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2], __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) { s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride)); @@ -1232,7 +1232,7 @@ static INLINE void y_convolve_4tap_32x2_avx2( r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs); } -static INLINE __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src, +static inline __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[3], __m128i s_32[6], @@ -1245,7 +1245,7 @@ static INLINE __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src, return convolve_6tap_ssse3(ss_128, coeffs); } -static INLINE __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src, +static inline __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3], __m128i s_64[6], @@ -1258,7 +1258,7 @@ static INLINE __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src, return convolve_6tap_avx2(ss_256, coeffs); } -static INLINE void y_convolve_6tap_16x2_avx2(const uint8_t *const src, +static inline void y_convolve_6tap_16x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3], __m128i s_128[6], @@ -1273,7 +1273,7 @@ static INLINE void y_convolve_6tap_16x2_avx2(const uint8_t *const src, r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs); } -static INLINE void y_convolve_6tap_32x2_avx2( +static inline void y_convolve_6tap_32x2_avx2( const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3], __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) { s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride)); @@ -1288,7 +1288,7 @@ static INLINE void y_convolve_6tap_32x2_avx2( r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs); } -static INLINE __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src, +static inline __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[4], __m128i s_16[8], @@ -1301,7 +1301,7 @@ static INLINE __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src, return convolve_8tap_ssse3(ss_128, coeffs); } -static INLINE __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src, +static inline __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src, const ptrdiff_t stride, const __m128i coeffs[4], __m128i s_32[8], @@ -1314,7 +1314,7 @@ static INLINE __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src, return convolve_8tap_ssse3(ss_128, coeffs); } -static INLINE __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src, +static inline __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4], __m128i s_64[8], @@ -1327,7 +1327,7 @@ static INLINE __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src, return convolve_8tap_avx2(ss_256, coeffs); } -static INLINE void y_convolve_8tap_16x2_avx2(const uint8_t *const src, +static inline void y_convolve_8tap_16x2_avx2(const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4], __m128i s_128[8], @@ -1342,7 +1342,7 @@ static INLINE void y_convolve_8tap_16x2_avx2(const uint8_t *const src, r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs); } -static INLINE void 
y_convolve_8tap_32x2_avx2( +static inline void y_convolve_8tap_32x2_avx2( const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4], __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) { s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride)); @@ -1357,7 +1357,7 @@ static INLINE void y_convolve_8tap_32x2_avx2( r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs); } -static INLINE void xy_x_convolve_2tap_32_avx2(const uint8_t *const src, +static inline void xy_x_convolve_2tap_32_avx2(const uint8_t *const src, const __m256i coeffs[1], __m256i r[2]) { const __m256i s0 = _mm256_loadu_si256((__m256i *)src); @@ -1369,7 +1369,7 @@ static INLINE void xy_x_convolve_2tap_32_avx2(const uint8_t *const src, r[1] = convolve_2tap_avx2(&ss1, coeffs); } -static INLINE void xy_x_2tap_32_avx2(const uint8_t *const src, +static inline void xy_x_2tap_32_avx2(const uint8_t *const src, const __m256i coeffs[1], int16_t *const dst) { __m256i r[2]; @@ -1381,7 +1381,7 @@ static INLINE void xy_x_2tap_32_avx2(const uint8_t *const src, _mm256_storeu_si256((__m256i *)(dst + 16), d1); } -static INLINE void xy_x_4tap_32_avx2(const uint8_t *const src, +static inline void xy_x_4tap_32_avx2(const uint8_t *const src, const __m256i coeffs[2], const __m256i filt[2], int16_t *const dst) { @@ -1394,7 +1394,7 @@ static INLINE void xy_x_4tap_32_avx2(const uint8_t *const src, _mm256_storeu_si256((__m256i *)(dst + 16), d1); } -static INLINE void xy_x_6tap_32_avx2(const uint8_t *const src, +static inline void xy_x_6tap_32_avx2(const uint8_t *const src, const __m256i coeffs[3], const __m256i filt[3], int16_t *const dst) { @@ -1407,7 +1407,7 @@ static INLINE void xy_x_6tap_32_avx2(const uint8_t *const src, _mm256_storeu_si256((__m256i *)(dst + 16), d1); } -static INLINE void xy_x_8tap_32_avx2(const uint8_t *const src, +static inline void xy_x_8tap_32_avx2(const uint8_t *const src, const __m256i coeffs[4], const __m256i filt[4], int16_t *const dst) { @@ -1420,7 +1420,7 @@ static INLINE void xy_x_8tap_32_avx2(const uint8_t *const src, _mm256_storeu_si256((__m256i *)(dst + 16), d1); } -static INLINE __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src, +static inline __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src, __m128i s_32[2], const __m128i coeffs[1]) { __m128i s_128[2]; @@ -1433,7 +1433,7 @@ static INLINE __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src, return convolve16_2tap_sse2(&ss, coeffs); } -static INLINE __m128i xy_y_convolve_2tap_2x2_half_pel_sse2( +static inline __m128i xy_y_convolve_2tap_2x2_half_pel_sse2( const int16_t *const src, __m128i s_32[2]) { __m128i s_128[2]; @@ -1444,7 +1444,7 @@ static INLINE __m128i xy_y_convolve_2tap_2x2_half_pel_sse2( return _mm_add_epi16(s_128[0], s_128[1]); } -static INLINE void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src, +static inline void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src, __m128i s_64[2], const __m128i coeffs[1], __m128i r[2]) { @@ -1460,7 +1460,7 @@ static INLINE void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src, r[1] = convolve16_2tap_sse2(&ss1, coeffs); } -static INLINE __m128i xy_y_convolve_2tap_4x2_half_pel_sse2( +static inline __m128i xy_y_convolve_2tap_4x2_half_pel_sse2( const int16_t *const src, __m128i s_64[2]) { __m128i s_128[2]; @@ -1471,7 +1471,7 @@ static INLINE __m128i xy_y_convolve_2tap_4x2_half_pel_sse2( return _mm_add_epi16(s_128[0], s_128[1]); } -static INLINE void xy_y_convolve_2tap_16_avx2(const __m256i s0, +static inline void xy_y_convolve_2tap_16_avx2(const __m256i 
s0, const __m256i s1, const __m256i coeffs[1], __m256i r[2]) { @@ -1481,7 +1481,7 @@ static INLINE void xy_y_convolve_2tap_16_avx2(const __m256i s0, r[1] = convolve16_2tap_avx2(&ss1, coeffs); } -static INLINE void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src, +static inline void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src, __m128i s_128[2], const __m256i coeffs[1], __m256i r[2]) { @@ -1493,7 +1493,7 @@ static INLINE void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src, xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r); } -static INLINE __m256i xy_y_convolve_2tap_8x2_half_pel_avx2( +static inline __m256i xy_y_convolve_2tap_8x2_half_pel_avx2( const int16_t *const src, __m128i s_128[2]) { __m256i s_256[2]; s_128[1] = _mm_loadu_si128((__m128i *)(src + 8)); @@ -1503,7 +1503,7 @@ static INLINE __m256i xy_y_convolve_2tap_8x2_half_pel_avx2( return _mm256_add_epi16(s_256[0], s_256[1]); } -static INLINE void xy_y_convolve_2tap_16x2_half_pel_avx2( +static inline void xy_y_convolve_2tap_16x2_half_pel_avx2( const int16_t *const src, __m256i s_256[2], __m256i r[2]) { s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16)); r[0] = _mm256_add_epi16(s_256[0], s_256[1]); @@ -1511,14 +1511,14 @@ static INLINE void xy_y_convolve_2tap_16x2_half_pel_avx2( r[1] = _mm256_add_epi16(s_256[1], s_256[0]); } -static INLINE void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst, +static inline void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst, const ptrdiff_t stride) { const __m256i t = _mm256_packus_epi16(r[0], r[1]); const __m256i d = _mm256_permute4x64_epi64(t, 0xD8); storeu_u8_16x2_avx2(d, dst, stride); } -static INLINE void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src, +static inline void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src, __m256i s[2], const __m256i coeffs[1], __m256i r[4]) { @@ -1528,7 +1528,7 @@ static INLINE void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src, xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2); } -static INLINE void xy_y_convolve_2tap_32_avx2(const int16_t *const src, +static inline void xy_y_convolve_2tap_32_avx2(const int16_t *const src, const __m256i s0[2], __m256i s1[2], const __m256i coeffs[1], @@ -1539,7 +1539,7 @@ static INLINE void xy_y_convolve_2tap_32_avx2(const int16_t *const src, xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2); } -static INLINE void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src, +static inline void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src, const __m256i s0[2], __m256i s1[2], const __m256i coeffs[1], @@ -1550,7 +1550,7 @@ static INLINE void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src, xy_y_round_store_32_avx2(r + 0, r + 2, dst); } -static INLINE void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src, +static inline void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src, const __m256i s0[2], __m256i s1[2], __m256i r[2]) { @@ -1560,7 +1560,7 @@ static INLINE void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src, r[1] = _mm256_add_epi16(s0[1], s1[1]); } -static INLINE void xy_y_convolve_2tap_half_pel_32_all_avx2( +static inline void xy_y_convolve_2tap_half_pel_32_all_avx2( const int16_t *const src, const __m256i s0[2], __m256i s1[2], uint8_t *const dst) { __m256i r[2]; @@ -1571,7 +1571,7 @@ static INLINE void xy_y_convolve_2tap_half_pel_32_all_avx2( xy_y_pack_store_32_avx2(r[0], r[1], dst); } -static INLINE __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src, +static inline __m128i 
xy_y_convolve_4tap_2x2_sse2(const int16_t *const src, __m128i s_32[4], __m128i ss_128[2], const __m128i coeffs[2]) { @@ -1585,7 +1585,7 @@ static INLINE __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src, return r; } -static INLINE __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src, +static inline __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src, __m128i s_64[4], __m256i ss_256[2], const __m256i coeffs[2]) { @@ -1600,14 +1600,14 @@ static INLINE __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src, return r; } -static INLINE void xy_y_convolve_4tap_16_avx2(const __m256i *const ss, +static inline void xy_y_convolve_4tap_16_avx2(const __m256i *const ss, const __m256i coeffs[2], __m256i r[2]) { r[0] = convolve16_4tap_avx2(ss, coeffs); r[1] = convolve16_4tap_avx2(ss + 2, coeffs); } -static INLINE void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src, +static inline void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src, __m256i ss_256[4], const __m256i coeffs[2], __m256i r[2]) { @@ -1621,7 +1621,7 @@ static INLINE void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src, ss_256[2] = ss_256[3]; } -static INLINE void xy_y_convolve_4tap_8x2_half_pel_avx2( +static inline void xy_y_convolve_4tap_8x2_half_pel_avx2( const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4], __m256i r[2]) { __m256i a_256[2]; @@ -1634,7 +1634,7 @@ static INLINE void xy_y_convolve_4tap_8x2_half_pel_avx2( s_256[1] = s_256[3]; } -static INLINE void xy_y_convolve_4tap_16x2_avx2( +static inline void xy_y_convolve_4tap_16x2_avx2( const int16_t *const src, __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) { s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); @@ -1651,7 +1651,7 @@ static INLINE void xy_y_convolve_4tap_16x2_avx2( tt_256[2] = tt_256[3]; } -static INLINE void xy_y_convolve_4tap_32x2_avx2( +static inline void xy_y_convolve_4tap_32x2_avx2( const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) { @@ -1669,7 +1669,7 @@ static INLINE void xy_y_convolve_4tap_32x2_avx2( tt_256[2] = tt_256[3]; } -static INLINE void xy_y_convolve_4tap_16x2_half_pelavx2( +static inline void xy_y_convolve_4tap_16x2_half_pelavx2( const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1], __m256i r[4]) { __m256i a_256[2]; @@ -1690,7 +1690,7 @@ static INLINE void xy_y_convolve_4tap_16x2_half_pelavx2( s_256[2] = s_256[4]; } -static INLINE __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src, +static inline __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src, __m128i s_32[6], __m128i ss_128[3], const __m128i coeffs[3]) { @@ -1705,7 +1705,7 @@ static INLINE __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src, return r; } -static INLINE __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src, +static inline __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src, __m128i s_64[6], __m256i ss_256[3], const __m256i coeffs[3]) { @@ -1721,14 +1721,14 @@ static INLINE __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src, return r; } -static INLINE void xy_y_convolve_6tap_16_avx2(const __m256i ss[6], +static inline void xy_y_convolve_6tap_16_avx2(const __m256i ss[6], const __m256i coeffs[3], __m256i r[2]) { r[0] = convolve16_6tap_avx2(ss, coeffs); r[1] = convolve16_6tap_avx2(ss + 3, coeffs); } -static INLINE void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src, +static inline void 
xy_y_convolve_6tap_8x2_avx2(const int16_t *const src, __m256i ss_256[6], const __m256i coeffs[3], __m256i r[2]) { @@ -1744,7 +1744,7 @@ static INLINE void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src, ss_256[4] = ss_256[5]; } -static INLINE void xy_y_convolve_6tap_8x2_half_pel_avx2( +static inline void xy_y_convolve_6tap_8x2_half_pel_avx2( const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6], __m256i r[2]) { __m256i a_256[2], ss_256[4]; @@ -1763,7 +1763,7 @@ static INLINE void xy_y_convolve_6tap_8x2_half_pel_avx2( s_256[3] = s_256[5]; } -static INLINE void xy_y_convolve_6tap_16x2_avx2( +static inline void xy_y_convolve_6tap_16x2_avx2( const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3], __m256i r[4]) { @@ -1788,7 +1788,7 @@ static INLINE void xy_y_convolve_6tap_16x2_avx2( tt_256[4] = tt_256[5]; } -static INLINE void xy_y_convolve_6tap_16x2_half_pel_avx2( +static inline void xy_y_convolve_6tap_16x2_half_pel_avx2( const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6], __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) { __m256i a_256[2]; @@ -1816,7 +1816,7 @@ static INLINE void xy_y_convolve_6tap_16x2_half_pel_avx2( xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2); } -static INLINE __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src, +static inline __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src, __m128i s_32[8], __m128i ss_128[4], const __m128i coeffs[4]) { @@ -1832,7 +1832,7 @@ static INLINE __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src, return r; } -static INLINE __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src, +static inline __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src, __m128i s_64[8], __m256i ss_256[4], const __m256i coeffs[4]) { @@ -1849,14 +1849,14 @@ static INLINE __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src, return r; } -static INLINE void xy_y_convolve_8tap_16_avx2(const __m256i *const ss, +static inline void xy_y_convolve_8tap_16_avx2(const __m256i *const ss, const __m256i coeffs[4], __m256i r[2]) { r[0] = convolve16_8tap_avx2(ss, coeffs); r[1] = convolve16_8tap_avx2(ss + 4, coeffs); } -static INLINE void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src, +static inline void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src, __m256i ss_256[8], const __m256i coeffs[4], __m256i r[2]) { @@ -1874,7 +1874,7 @@ static INLINE void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src, ss_256[6] = ss_256[7]; } -static INLINE void xy_y_convolve_8tap_8x2_half_pel_avx2( +static inline void xy_y_convolve_8tap_8x2_half_pel_avx2( const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8], __m256i r[2]) { __m256i a_256[4], ss_256[4]; @@ -1926,7 +1926,7 @@ static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2( tt_256[6] = tt_256[7]; } -static INLINE void xy_y_convolve_8tap_16x2_half_pel_avx2( +static inline void xy_y_convolve_8tap_16x2_half_pel_avx2( const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4], __m256i s_256[8], __m256i r[4]) { __m256i a_256[4], ss_256[4]; @@ -1963,14 +1963,14 @@ static INLINE void xy_y_convolve_8tap_16x2_half_pel_avx2( xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2); } -static INLINE void xy_y_round_store_8x2_avx2(const __m256i res[2], +static inline void xy_y_round_store_8x2_avx2(const __m256i res[2], uint8_t *const dst, const ptrdiff_t stride) { const __m256i r = xy_y_round_16_avx2(res); pack_store_8x2_avx2(r, dst, stride); } 
-static INLINE void xy_y_round_store_16x2_avx2(const __m256i res[4], +static inline void xy_y_round_store_16x2_avx2(const __m256i res[4], uint8_t *const dst, const ptrdiff_t stride) { const __m256i r0 = xy_y_round_16_avx2(res + 0); @@ -1978,7 +1978,7 @@ static INLINE void xy_y_round_store_16x2_avx2(const __m256i res[4], xy_y_pack_store_16x2_avx2(r0, r1, dst, stride); } -static INLINE void sr_y_round_store_32_avx2(const __m256i res[2], +static inline void sr_y_round_store_32_avx2(const __m256i res[2], uint8_t *const dst) { __m256i r[2]; @@ -1987,14 +1987,14 @@ static INLINE void sr_y_round_store_32_avx2(const __m256i res[2], convolve_store_32_avx2(r[0], r[1], dst); } -static INLINE void sr_y_round_store_32x2_avx2(const __m256i res[4], +static inline void sr_y_round_store_32x2_avx2(const __m256i res[4], uint8_t *const dst, const int32_t dst_stride) { sr_y_round_store_32_avx2(res, dst); sr_y_round_store_32_avx2(res + 2, dst + dst_stride); } -static INLINE void sr_y_2tap_32_avx2(const uint8_t *const src, +static inline void sr_y_2tap_32_avx2(const uint8_t *const src, const __m256i coeffs[1], const __m256i s0, __m256i *const s1, uint8_t *const dst) { __m256i r[2]; @@ -2905,7 +2905,7 @@ static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2( } } -static INLINE void sr_x_2tap_32_avx2(const uint8_t *const src, +static inline void sr_x_2tap_32_avx2(const uint8_t *const src, const __m256i coeffs[1], uint8_t *const dst) { __m256i r[2]; @@ -2914,7 +2914,7 @@ static INLINE void sr_x_2tap_32_avx2(const uint8_t *const src, sr_x_round_store_32_avx2(r, dst); } -static INLINE void sr_x_6tap_32_avx2(const uint8_t *const src, +static inline void sr_x_6tap_32_avx2(const uint8_t *const src, const __m256i coeffs[3], const __m256i filt[3], uint8_t *const dst) { diff --git a/third_party/SVT-AV1/synonyms.h b/third_party/SVT-AV1/synonyms.h index 0ded6e5cfc..736be4df7d 100644 --- a/third_party/SVT-AV1/synonyms.h +++ b/third_party/SVT-AV1/synonyms.h @@ -15,7 +15,7 @@ #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" -static INLINE __m128i load_u8_8x2_sse2(const uint8_t *const src, +static inline __m128i load_u8_8x2_sse2(const uint8_t *const src, const ptrdiff_t stride) { return load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride)); } diff --git a/tools/auto_refactor/av1_preprocess.py b/tools/auto_refactor/av1_preprocess.py index 5559c1a489..0bcf7f6106 100644 --- a/tools/auto_refactor/av1_preprocess.py +++ b/tools/auto_refactor/av1_preprocess.py @@ -66,8 +66,6 @@ def get_av1_pp_command(fake_header_dir, code_file_list): " " "-D'AV1_K_MEANS_DIM=2'" " " - "-D'INLINE='" - " " "-D'AOM_INLINE='" " " "-D'AOM_FORCE_INLINE='" -- GitLab From 6e120cd323dd2488127606a9413806dc5e927e57 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 8 Aug 2024 20:14:43 -0700 Subject: [PATCH 334/391] Change "inline static" to "static inline" Bug: aomedia:358402891 Change-Id: I2d58d38998c8e6eaaefcd0b619a221317eeed3c5 --- av1/encoder/arm/rdopt_neon.c | 2 +- av1/encoder/x86/rdopt_avx2.c | 2 +- av1/encoder/x86/rdopt_sse4.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/av1/encoder/arm/rdopt_neon.c b/av1/encoder/arm/rdopt_neon.c index 5199b2e7c8..237878b091 100644 --- a/av1/encoder/arm/rdopt_neon.c +++ b/av1/encoder/arm/rdopt_neon.c @@ -21,7 +21,7 @@ // We actually use the 4x4 pixels to calculate correlations corresponding to // the top-left 3x3 pixels, so this function must be called with 1x1 overlap, // moving the window along/down by 3 pixels at a time. 
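// As an illustration only (not part of this change, and the loop bounds are
// our reading of the comment above), a caller honoring that contract steps
// the 4x4 window by 3 pixels in each direction:
//   for (int i = 0; i <= height - 4; i += 3)
//     for (int j = 0; j <= width - 4; j += 3)
//       horver_correlation_4x4(diff + i * stride + j, stride, &xy_sum,
//                              &xz_sum, &x_sum, &x2_sum);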
-inline static void horver_correlation_4x4(const int16_t *diff, int stride, +static inline void horver_correlation_4x4(const int16_t *diff, int stride, int32x4_t *xy_sum_32, int32x4_t *xz_sum_32, int32x4_t *x_sum_32, diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c index d604fde009..71dd0ea16a 100644 --- a/av1/encoder/x86/rdopt_avx2.c +++ b/av1/encoder/x86/rdopt_avx2.c @@ -21,7 +21,7 @@ // We actually use the 4x4 pixels to calculate correlations corresponding to // the top-left 3x3 pixels, so this function must be called with 1x1 overlap, // moving the window along/down by 3 pixels at a time. -inline static void horver_correlation_4x4(const int16_t *diff, int stride, +static inline void horver_correlation_4x4(const int16_t *diff, int stride, __m256i *xy_sum_32, __m256i *xz_sum_32, __m256i *x_sum_32, __m256i *x2_sum_32) { diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c index 20877b5649..a65b5b1884 100644 --- a/av1/encoder/x86/rdopt_sse4.c +++ b/av1/encoder/x86/rdopt_sse4.c @@ -20,7 +20,7 @@ // We actually use the 4x4 pixels to calculate correlations corresponding to // the top-left 3x3 pixels, so this function must be called with 1x1 overlap, // moving the window along/down by 3 pixels at a time. -inline static void horver_correlation_4x4(const int16_t *diff, int stride, +static inline void horver_correlation_4x4(const int16_t *diff, int stride, __m128i *xy_sum_32, __m128i *xz_sum_32, __m128i *x_sum_32, __m128i *x2_sum_32) { -- GitLab From 9998ff677d1d296cebdfef77db2caadda89bbe76 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Sat, 10 Aug 2024 20:09:36 -0700 Subject: [PATCH 335/391] rtc: Only allow estimate_scroll_motion for 8 bit depth This function needs to be fixed to support 10/12 bit. Change-Id: Ib3c8550053668e006052d3cd18aec04833a4daf9 --- av1/encoder/ratectrl.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 7e2b029d4c..8135da8cde 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -3328,18 +3328,22 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, cpi->rc.high_motion_content_screen_rtc = 1; // Compute fast coarse/global motion for 128x128 superblock centered // at middle of frames, to determine if motion is scroll. - int pos_col = (unscaled_src->y_width >> 1) - 64; - int pos_row = (unscaled_src->y_height >> 1) - 64; - src_y = unscaled_src->y_buffer + pos_row * src_ystride + pos_col; - last_src_y = - unscaled_last_src->y_buffer + pos_row * last_src_ystride + pos_col; - int best_intmv_col = 0; - int best_intmv_row = 0; - unsigned int y_sad = estimate_scroll_motion( - cpi, src_y, last_src_y, src_ystride, last_src_ystride, BLOCK_128X128, - pos_col, pos_row, &best_intmv_col, &best_intmv_row); - if (y_sad < 100 && (abs(best_intmv_col) > 16 || abs(best_intmv_row) > 16)) - cpi->rc.high_motion_content_screen_rtc = 0; + // TODO(marpan): Only allow for 8 bit-depth for now. 
+ if (cm->seq_params->bit_depth == 8) { + int pos_col = (unscaled_src->y_width >> 1) - 64; + int pos_row = (unscaled_src->y_height >> 1) - 64; + src_y = unscaled_src->y_buffer + pos_row * src_ystride + pos_col; + last_src_y = + unscaled_last_src->y_buffer + pos_row * last_src_ystride + pos_col; + int best_intmv_col = 0; + int best_intmv_row = 0; + unsigned int y_sad = estimate_scroll_motion( + cpi, src_y, last_src_y, src_ystride, last_src_ystride, + BLOCK_128X128, pos_col, pos_row, &best_intmv_col, &best_intmv_row); + if (y_sad < 100 && + (abs(best_intmv_col) > 16 || abs(best_intmv_row) > 16)) + cpi->rc.high_motion_content_screen_rtc = 0; + } } // Pass the flag value to all layer frames. if (cpi->svc.number_spatial_layers > 1 || -- GitLab From c6e26e14bc57cae5038431f07fe1b473204f41fa Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Fri, 14 Jun 2024 16:56:11 -0700 Subject: [PATCH 336/391] cmake: check for -Wmissing-prototypes support Bug: aomedia:3416 Change-Id: I24b1c2253865a79db9a2fe1585d4c425cddf639d --- CMakeLists.txt | 5 +++++ build/cmake/aom_configure.cmake | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1aa87f27fb..05dcd57d64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -472,6 +472,11 @@ if(CONFIG_LIBYUV OR CONFIG_TUNE_BUTTERAUGLI) add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES}) if(NOT MSVC) target_compile_options(yuv PRIVATE -Wno-shadow) + # Many functions in libyuv trigger this warning when enabled with clang. + is_flag_present(AOM_CXX_FLAGS "-Wmissing-prototypes" flag_present) + if(flag_present) + target_compile_options(yuv PRIVATE -Wno-missing-prototypes) + endif() endif() include_directories("${AOM_ROOT}/third_party/libyuv/include") endif() diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake index 2382974d7b..44fe8b7579 100644 --- a/build/cmake/aom_configure.cmake +++ b/build/cmake/aom_configure.cmake @@ -343,6 +343,11 @@ else() add_compiler_flag_if_supported("-Wformat=2") add_c_flag_if_supported("-Wimplicit-function-declaration") add_compiler_flag_if_supported("-Wlogical-op") + if(CMAKE_C_COMPILER_ID MATCHES "Clang") + add_compiler_flag_if_supported("-Wmissing-prototypes") + else() + add_c_flag_if_supported("-Wmissing-prototypes") + endif() add_compiler_flag_if_supported("-Wpointer-arith") add_compiler_flag_if_supported("-Wshadow") add_compiler_flag_if_supported("-Wshorten-64-to-32") -- GitLab From d6525766c6972bb165b2b4585555af12e4eceb84 Mon Sep 17 00:00:00 2001 From: clang-format <noreply@google.com> Date: Tue, 6 Aug 2024 18:52:58 -0700 Subject: [PATCH 337/391] apply clang-format-14 Bug: b:328632178 Change-Id: Ia5fb4d0a29a94232963d4d28cf1c83f4cd739b47 --- av1/decoder/decodetxb.c | 5 +++-- av1/encoder/hash.c | 6 +++--- av1/encoder/pass2_strategy.c | 4 ++-- test/simd_impl.h | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/av1/decoder/decodetxb.c b/av1/decoder/decodetxb.c index 465af7e06b..eb3eb16cb6 100644 --- a/av1/decoder/decodetxb.c +++ b/av1/decoder/decodetxb.c @@ -303,8 +303,9 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, DecoderCodingBlock *dcb, tran_low_t dq_coeff; // Bitmasking to clamp dq_coeff to valid range: // The valid range for 8/10/12 bit video is at most 17/19/21 bit - dq_coeff = (tran_low_t)( - (int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff); + dq_coeff = + (tran_low_t)((int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & + 0xffffff); dq_coeff = dq_coeff >> shift; if (sign) { dq_coeff = -dq_coeff; diff --git 
a/av1/encoder/hash.c b/av1/encoder/hash.c index 7e3384e317..f6a230d2da 100644 --- a/av1/encoder/hash.c +++ b/av1/encoder/hash.c @@ -15,9 +15,9 @@ static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, uint8_t *pData, uint32_t dataLength) { for (uint32_t i = 0; i < dataLength; i++) { - const uint8_t index = (uint8_t)( - (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^ - pData[i]); + const uint8_t index = (uint8_t)((p_crc_calculator->remainder >> + (p_crc_calculator->bits - 8)) ^ + pData[i]); p_crc_calculator->remainder <<= 8; p_crc_calculator->remainder ^= p_crc_calculator->table[index]; } diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c index 1a259c3e4d..c9a766b35a 100644 --- a/av1/encoder/pass2_strategy.c +++ b/av1/encoder/pass2_strategy.c @@ -3039,8 +3039,8 @@ static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err, double vbr_corpus_complexity_lap = cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0; /* Get the average corpus complexity of the frame */ - kf_group_bits = (int64_t)( - kf_group_bits * (kf_group_avg_error / vbr_corpus_complexity_lap)); + kf_group_bits = (int64_t)(kf_group_bits * (kf_group_avg_error / + vbr_corpus_complexity_lap)); } } else { kf_group_bits = (int64_t)(twopass->bits_left * diff --git a/test/simd_impl.h b/test/simd_impl.h index f11c903b57..20737a0ed9 100644 --- a/test/simd_impl.h +++ b/test/simd_impl.h @@ -37,7 +37,7 @@ class TestIntrinsic : public ::testing::TestWithParam<param_signature> { // Create one typedef for each function signature #define TYPEDEF_SIMD(name) \ typedef TestIntrinsic<std::tuple<uint32_t, uint32_t, const char *> > \ - ARCH_POSTFIX(name) + ARCH_POSTFIX(name) TYPEDEF_SIMD(V64_U8); TYPEDEF_SIMD(V64_U16); -- GitLab From 466e282391f0ff4c53030422b0d088442dd5f472 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Mon, 5 Aug 2024 17:15:12 +0100 Subject: [PATCH 338/391] Add Arm Neon USMMLA impl. for 6-tap aom_convolve8_horiz By permuting the input samples and the 6-tap filter we can use the Armv8.6 I8MM USMMLA matrix multiply instructions to accelerate horizontal 6-tap convolutions. The 2x8 by 8x2 matrix multiply instruction does twice the work of a USDOT dot product instruction. We use this new USMMLA 6-tap path for 4-tap filters as well since it uses exactly the same number of instructions as the previous USDOT implementation. 
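For readers unfamiliar with the matrix-multiply lane layout this relies on,
here is a scalar model (exposition only; the helper name and signature are
ours, not part of the patch). The sample-matrix rows are s[0..7] and s[2..9]
(produced by the permute table), the filter columns are the staggered copies
{ f0..f5, 0, 0 } and { 0, f0..f5, 0 }, so the four accumulator lanes come out
as four adjacent 6-tap outputs:

#include <stdint.h>

// Scalar model of one vusmmlaq_s32 step: out[o] matches accumulator lane o,
// assuming the 2x2 result lane order { row0.col0, row0.col1, row1.col0,
// row1.col1 }, which is what lets vmovn_s32 pack outputs 0..3 contiguously.
static void usmmla_6tap_model(const uint8_t *s, const int8_t f[6],
                              int32_t out[4]) {
  for (int o = 0; o < 4; ++o) {
    int32_t acc = 0;
    for (int k = 0; k < 6; ++k) acc += (int32_t)s[o + k] * (int32_t)f[k];
    out[o] = acc;  // 6-tap convolution output at position o
  }
}
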
Change-Id: I36ba48eebb54dba7a7717875b2e83985b3b036d3 --- aom_dsp/arm/aom_convolve8_neon_i8mm.c | 84 +++++++++++++++------------ 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c index 5b9b88e757..121e89213d 100644 --- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -25,6 +25,13 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" +DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { + // clang-format off + 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, + 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + // clang-format on +}; + DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, @@ -136,60 +143,61 @@ static inline void convolve8_horiz_8tap_neon_i8mm( } } -static inline int16x4_t convolve4_4_h(const uint8x16_t samples, - const int8x8_t filters, +static inline int16x4_t convolve6_4_h(const uint8x16_t samples, + const int8x16_t filter, const uint8x16_t permute_tbl) { - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); - int32x4_t sum = - vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum = vusmmlaq_s32(vdupq_n_s32(0), perm_samples, filter); // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); } -static inline uint8x8_t convolve4_8_h(const uint8x16_t samples, - const int8x8_t filters, +static inline uint8x8_t convolve6_8_h(const uint8x16_t samples, + const int8x16_t filter, const uint8x16x2_t permute_tbl) { - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), - vqtbl1q_u8(samples, permute_tbl.val[1]) }; + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; - // First 4 output values. - int32x4_t sum0 = - vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); - // Second 4 output values. - int32x4_t sum1 = - vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filter); + int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filter); // Narrow and re-pack. - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. 
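  // To spell that out (illustrative note, not in the original patch): every
  // tap is even, so halving is exact and sum == full_sum >> 1. The rounding
  // shift below,
  //   (sum + (1 << (FILTER_BITS - 2))) >> (FILTER_BITS - 1),
  // therefore equals the usual
  //   (full_sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS.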
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static inline void convolve8_horiz_4tap_neon_i8mm( +static inline void convolve8_horiz_6tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) { - const int16x4_t x_filter = vld1_s16(filter_x + 2); - // All 4-tap and bilinear filter values are even, so halve them to reduce - // intermediate precision requirements. - const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1); + // Stagger the filter for use with the matrix multiply instructions. + // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t filter = + vcombine_s8(vext_s8(x_filter, x_filter, 1), x_filter); if (width == 4) { - const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); + const uint8x16_t perm_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - int16x4_t t0 = convolve4_4_h(s0, filter, perm_tbl); - int16x4_t t1 = convolve4_4_h(s1, filter, perm_tbl); - int16x4_t t2 = convolve4_4_h(s2, filter, perm_tbl); - int16x4_t t3 = convolve4_4_h(s3, filter, perm_tbl); + int16x4_t t0 = convolve6_4_h(s0, filter, perm_tbl); + int16x4_t t1 = convolve6_4_h(s1, filter, perm_tbl); + int16x4_t t2 = convolve6_4_h(s2, filter, perm_tbl); + int16x4_t t3 = convolve6_4_h(s3, filter, perm_tbl); // We halved the filter values so -1 from right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); @@ -202,7 +210,7 @@ static inline void convolve8_horiz_4tap_neon_i8mm( height -= 4; } while (height > 0); } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { int w = width; @@ -212,10 +220,10 @@ static inline void convolve8_horiz_4tap_neon_i8mm( uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - uint8x8_t d0 = convolve4_8_h(s0, filter, perm_tbl); - uint8x8_t d1 = convolve4_8_h(s1, filter, perm_tbl); - uint8x8_t d2 = convolve4_8_h(s2, filter, perm_tbl); - uint8x8_t d3 = convolve4_8_h(s3, filter, perm_tbl); + uint8x8_t d0 = convolve6_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve6_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve6_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve6_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -249,8 +257,8 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, if (filter_taps == 2) { convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, h); - } else if (filter_taps == 4) { - convolve8_horiz_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride, + } else if (filter_taps <= 6) { + convolve8_horiz_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride, filter_x, w, h); } else { convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x, -- GitLab From 6e14f9069e58c9abc7ec4277d6e312116ac65b64 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Wed, 7 Aug 2024 11:57:21 +0100 Subject: [PATCH 339/391] Use Arm Neon USMMLA 6-tap impl. 
for 4-tap convolve_x_sr The 6-tap USMMLA implementation of convolve_x_sr uses the same number of instructions as the 4-tap USDOT implementation, so delete the 4-tap USDOT path and use the 6-tap USMMLA implementation in both cases. Change-Id: Ic390abea63047af623a2ab232532b6b36360293e --- av1/common/arm/convolve_neon_i8mm.c | 118 +++++++--------------------- 1 file changed, 29 insertions(+), 89 deletions(-) diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index dd4a34e0b0..acd912e575 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c @@ -213,6 +213,22 @@ static inline void convolve_x_sr_8tap_neon_i8mm( } while (height != 0); } +static inline int16x4_t convolve6_4_x(uint8x16_t samples, + const int8x16_t filter, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter); + + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); +} + static inline uint8x8_t convolve6_8_x(uint8x16_t samples, const int8x16_t filter, const uint8x16x2_t permute_tbl, @@ -244,86 +260,16 @@ static inline void convolve_x_sr_6tap_neon_i8mm( const int8x16_t x_filter = vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); - const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); - do { - const uint8_t *s = src; - uint8_t *d = dst; - int w = width; - - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - uint8x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, horiz_const); - uint8x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, horiz_const); - uint8x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, horiz_const); - uint8x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, horiz_const); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - w -= 8; - } while (w != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - height -= 4; - } while (height != 0); -} - -static inline int16x4_t convolve4_4_x(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16_t permute_tbl, - const int32x4_t horiz_const) { - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); - - int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples, filters, 0); - - // Further narrowing and packing is performed by the caller. - return vmovn_s32(sum); -} - -static inline uint8x8_t convolve4_8_x(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16x2_t permute_tbl, - const int32x4_t horiz_const) { - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), - vqtbl1q_u8(samples, permute_tbl.val[1]) }; - - int32x4_t acc = horiz_const; - int32x4_t sum0123 = vusdotq_lane_s32(acc, perm_samples[0], filters, 0); - int32x4_t sum4567 = vusdotq_lane_s32(acc, perm_samples[1], filters, 0); - - // Narrow and re-pack. 
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); - // We halved the filter values so -1 from right shift. - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - -static inline void convolve_x_sr_4tap_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x, - const int32x4_t horiz_const) { - const int16x4_t x_filter = vld1_s16(filter_x + 2); - // All 4-tap and bilinear filter values are even, so halve them to reduce - // intermediate precision requirements. - const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); - if (width == 4) { - const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); + const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - int16x4_t t0 = convolve4_4_x(s0, filter, perm_tbl, horiz_const); - int16x4_t t1 = convolve4_4_x(s1, filter, perm_tbl, horiz_const); - int16x4_t t2 = convolve4_4_x(s2, filter, perm_tbl, horiz_const); - int16x4_t t3 = convolve4_4_x(s3, filter, perm_tbl, horiz_const); + int16x4_t t0 = convolve6_4_x(s0, x_filter, permute_tbl, horiz_const); + int16x4_t t1 = convolve6_4_x(s1, x_filter, permute_tbl, horiz_const); + int16x4_t t2 = convolve6_4_x(s2, x_filter, permute_tbl, horiz_const); + int16x4_t t3 = convolve6_4_x(s3, x_filter, permute_tbl, horiz_const); // We halved the filter values so -1 from right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); @@ -336,20 +282,20 @@ static inline void convolve_x_sr_4tap_neon_i8mm( height -= 4; } while (height != 0); } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); - + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { - int w = width; const uint8_t *s = src; uint8_t *d = dst; + int w = width; + do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - uint8x8_t d0 = convolve4_8_x(s0, filter, perm_tbl, horiz_const); - uint8x8_t d1 = convolve4_8_x(s1, filter, perm_tbl, horiz_const); - uint8x8_t d2 = convolve4_8_x(s2, filter, perm_tbl, horiz_const); - uint8x8_t d3 = convolve4_8_x(s3, filter, perm_tbl, horiz_const); + uint8x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, horiz_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -390,7 +336,7 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, // Halve the total because we will halve the filter values. 
const int32x4_t horiz_const = vdupq_n_s32((1 << ((ROUND0_BITS - 1)) / 2)); - if (filter_taps == 6) { + if (filter_taps <= 6) { convolve_x_sr_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride, w, h, x_filter_ptr, horiz_const); return; @@ -402,12 +348,6 @@ void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, return; } - if (filter_taps <= 4) { - convolve_x_sr_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride, w, h, - x_filter_ptr, horiz_const); - return; - } - convolve_x_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, x_filter_ptr, horiz_const); } -- GitLab From b94952a480b8882e45062ff464e1b5cb838c68e2 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Thu, 8 Aug 2024 17:17:44 +0100 Subject: [PATCH 340/391] Add Arm Neon USMMLA impl. for 6-tap dist_wtd_convolve_2d By permuting the input samples and the 6-tap filter we can use the Armv8.6 I8MM USMMLA matrix multiply instructions to accelerate the horizontal pass of dist_wtd_convolve_2d for 6-tap filters. The 2x8 by 8x2 matrix multiply instruction does twice the work of a USDOT dot product instruction. We also use this new USMMLA 6-tap path for 4-tap filters since it uses exactly the same number of instructions as the previous USDOT implementation. Change-Id: Ia129e7ec926a58932a4b97d4f51b0780f5842550 --- av1/common/arm/compound_convolve_neon_i8mm.c | 204 ++++++++++++++----- 1 file changed, 151 insertions(+), 53 deletions(-) diff --git a/av1/common/arm/compound_convolve_neon_i8mm.c b/av1/common/arm/compound_convolve_neon_i8mm.c index 65f48958f1..0ed5c911a8 100644 --- a/av1/common/arm/compound_convolve_neon_i8mm.c +++ b/av1/common/arm/compound_convolve_neon_i8mm.c @@ -23,50 +23,51 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -static inline int16x4_t convolve4_4_2d_h(uint8x16_t samples, - const int8x8_t x_filter, +DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { + // clang-format off + 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, + 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + // clang-format on +}; + +static inline int16x4_t convolve6_4_2d_h(uint8x16_t samples, + const int8x16_t x_filter, const uint8x16_t permute_tbl, const int32x4_t horiz_const) { - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); - // First 4 output values. - int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, x_filter, 0); + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum = vusmmlaq_s32(horiz_const, permuted_samples, x_filter); // We halved the convolution filter values so -1 from the right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } -static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples, - const int8x8_t x_filter, - const uint8x16x3_t permute_tbl, +static inline int16x8_t convolve6_8_2d_h(uint8x16_t samples, + const int8x16_t x_filter, + const uint8x16x2_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum[2]; + // Permute samples ready for matrix multiply. 
+ // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); - - // First 4 output values. - sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0); - sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); - // Second 4 output values. - sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0); - sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(horiz_const, permuted_samples[0], x_filter); + int32x4_t sum4567 = vusmmlaq_s32(horiz_const, permuted_samples[1], x_filter); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. - return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), - vshrn_n_s32(sum[1], ROUND0_BITS - 1)); + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } -static inline void dist_wtd_convolve_2d_horiz_neon_i8mm( +static inline void dist_wtd_convolve_2d_horiz_6tap_neon_i8mm( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; @@ -76,28 +77,28 @@ static inline void dist_wtd_convolve_2d_horiz_neon_i8mm( const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + // Stagger the filter for use with the matrix multiply instructions. + // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t x_filter = + vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); + const uint8_t *src_ptr = src; int16_t *dst_ptr = im_block; int dst_stride = im_stride; int height = im_h; if (w == 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); - - src_ptr += 2; - + const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + int16x4_t d0 = convolve6_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve6_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve6_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve6_4_2d_h(s3, x_filter, permute_tbl, horiz_const); store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); @@ -109,7 +110,7 @@ static inline void dist_wtd_convolve_2d_horiz_neon_i8mm( do { uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d0 = convolve6_4_2d_h(s0, x_filter, permute_tbl, horiz_const); vst1_s16(dst_ptr, d0); @@ -117,10 +118,7 @@ static inline void dist_wtd_convolve_2d_horiz_neon_i8mm( dst_ptr += dst_stride; } while (--height != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; @@ -130,10 +128,10 @@ static inline void dist_wtd_convolve_2d_horiz_neon_i8mm( uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + int16x8_t d0 = convolve6_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = convolve6_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = convolve6_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = convolve6_8_2d_h(s3, x_filter, permute_tbl, horiz_const); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -154,7 +152,7 @@ static inline void dist_wtd_convolve_2d_horiz_neon_i8mm( do { uint8x16_t s0 = vld1q_u8(s); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d0 = convolve6_8_2d_h(s0, x_filter, permute_tbl, horiz_const); vst1q_s16(d, d0); @@ -168,6 +166,99 @@ static inline void dist_wtd_convolve_2d_horiz_neon_i8mm( } } +static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t x_filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. 
+ sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0); + sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); + // Second 4 output values. + sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0); + sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); +} + +static inline void dist_wtd_convolve_2d_horiz_8tap_neon_i8mm( + const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, + const int16_t *x_filter_ptr, const int im_h, int w) { + const int bd = 8; + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + void av1_dist_wtd_convolve_2d_neon_i8mm( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -179,13 +270,15 @@ void av1_dist_wtd_convolve_2d_neon_i8mm( DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps; const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int clamped_y_taps = y_filter_taps < 6 ? 
6 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; - const int horiz_offset = filter_params_x->taps / 2 - 1; + const int horiz_offset = clamped_x_taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); @@ -194,8 +287,13 @@ void av1_dist_wtd_convolve_2d_neon_i8mm( const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - dist_wtd_convolve_2d_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride, - x_filter_ptr, im_h, w); + if (clamped_x_taps == 6) { + dist_wtd_convolve_2d_horiz_6tap_neon_i8mm(src_ptr, src_stride, im_block, + im_stride, x_filter_ptr, im_h, w); + } else { + dist_wtd_convolve_2d_horiz_8tap_neon_i8mm(src_ptr, src_stride, im_block, + im_stride, x_filter_ptr, im_h, w); + } if (clamped_y_taps == 6) { if (conv_params->do_average) { -- GitLab From 9453443c0542cf97f76d0602198c3164641de011 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 8 Aug 2024 20:29:57 -0700 Subject: [PATCH 341/391] Delete the AOM_INLINE macro Replace AOM_INLINE with inline and delete the AOM_INLINE macro. Although AOM_INLINE is defined in the public header aom/aom_integer.h, it is clearly for libaom internal use. Clients of libaom are unlikely to use AOM_INLINE. So it should be safe to delete AOM_INLINE. Bug: aomedia:358402891 Change-Id: Ib595cdff1690c4027bd7e61580beb97d0c22865c --- aom/aom_integer.h | 1 - aom/internal/aom_codec_internal.h | 2 +- aom_dsp/entenc.h | 11 +- aom_dsp/mathutils.h | 2 +- aom_dsp/x86/sum_squares_avx2.c | 4 +- aom_dsp/x86/sum_squares_sse2.c | 4 +- aom_dsp/x86/variance_impl_avx2.c | 2 +- aom_ports/mem_ops.h | 14 +- aom_ports/mem_ops_aligned.h | 70 ++-- aom_scale/yv12config.h | 2 +- av1/av1_iface_common.h | 10 +- av1/common/arm/highbd_wiener_convolve_neon.c | 2 +- av1/common/arm/reconinter_neon.c | 18 +- av1/common/arm/wiener_convolve_neon.c | 2 +- av1/common/av1_loopfilter.c | 34 +- av1/common/cdef_block.c | 8 +- av1/common/entropy.c | 7 +- av1/common/filter.h | 4 +- av1/common/mv.h | 6 +- av1/common/mvref_common.c | 54 +-- av1/common/reconinter.c | 39 +- av1/common/reconinter.h | 25 +- av1/common/reconinter_template.inc | 51 +-- av1/common/seg_common.h | 6 +- av1/common/thread_common.c | 24 +- av1/common/thread_common.h | 33 +- av1/common/x86/convolve_avx2.c | 4 +- av1/common/x86/jnt_convolve_avx2.c | 2 +- av1/decoder/decodeframe.c | 403 +++++++++---------- av1/encoder/arm/shift_neon.h | 14 +- av1/encoder/arm/txfm_neon.h | 8 +- av1/encoder/bitstream.c | 380 +++++++++-------- av1/encoder/compound_type.c | 10 +- av1/encoder/context_tree.h | 4 +- av1/encoder/encode_strategy.h | 4 +- av1/encoder/encodeframe.c | 81 ++-- av1/encoder/encodeframe_utils.c | 6 +- av1/encoder/encodeframe_utils.h | 45 +-- av1/encoder/encoder.c | 10 +- av1/encoder/encoder.h | 8 +- av1/encoder/encoder_alloc.h | 35 +- av1/encoder/encoder_utils.h | 70 ++-- av1/encoder/ethread.c | 160 ++++---- av1/encoder/firstpass.c | 19 +- av1/encoder/global_motion_facade.c | 10 +- av1/encoder/global_motion_facade.h | 4 +- av1/encoder/gop_structure.c | 19 +- av1/encoder/intra_mode_search.c | 21 +- av1/encoder/intra_mode_search.h | 2 +- av1/encoder/intra_mode_search_utils.h | 84 ++-- av1/encoder/mcomp.c | 28 +- av1/encoder/mcomp.h | 2 +- av1/encoder/model_rd.h | 38 +- av1/encoder/motion_search_facade.c | 9 +- av1/encoder/motion_search_facade.h | 8 +- av1/encoder/mv_prec.c | 35 +- 
av1/encoder/mv_prec.h | 8 +- av1/encoder/nonrd_opt.c | 5 +- av1/encoder/nonrd_pickmode.c | 68 ++-- av1/encoder/palette.c | 37 +- av1/encoder/partition_search.c | 44 +- av1/encoder/partition_search.h | 12 +- av1/encoder/partition_strategy.c | 10 +- av1/encoder/partition_strategy.h | 10 +- av1/encoder/pickcdef.c | 10 +- av1/encoder/pickrst.c | 127 +++--- av1/encoder/rc_utils.h | 46 +-- av1/encoder/rd.c | 2 +- av1/encoder/rdopt.c | 138 +++---- av1/encoder/rdopt_utils.h | 38 +- av1/encoder/reconinter_enc.c | 2 +- av1/encoder/sorting_network.h | 4 +- av1/encoder/speed_features.c | 32 +- av1/encoder/temporal_filter.h | 15 +- av1/encoder/thirdpass.c | 2 +- av1/encoder/tokenize.c | 7 +- av1/encoder/tokenize.h | 8 +- av1/encoder/tpl_model.c | 97 +++-- av1/encoder/tpl_model.h | 6 +- av1/encoder/tune_vmaf.c | 56 +-- av1/encoder/tx_search.c | 139 +++---- av1/encoder/tx_search.h | 4 +- av1/encoder/var_based_part.c | 108 ++--- av1/encoder/x86/error_intrin_sse2.c | 2 +- av1/encoder/x86/ml_sse3.c | 6 +- av1/encoder/x86/pickrst_avx2.c | 30 +- av1/encoder/x86/pickrst_sse4.c | 28 +- av1/encoder/x86/temporal_filter_avx2.c | 2 +- tools/auto_refactor/av1_preprocess.py | 2 - 89 files changed, 1516 insertions(+), 1547 deletions(-) diff --git a/aom/aom_integer.h b/aom/aom_integer.h index c284947e4d..9660301e51 100644 --- a/aom/aom_integer.h +++ b/aom/aom_integer.h @@ -19,7 +19,6 @@ #else #define AOM_FORCE_INLINE __inline__ __attribute__((always_inline)) #endif -#define AOM_INLINE inline /* Assume platforms have the C99 standard integer types. */ diff --git a/aom/internal/aom_codec_internal.h b/aom/internal/aom_codec_internal.h index d2af212ee8..ddd986120c 100644 --- a/aom/internal/aom_codec_internal.h +++ b/aom/internal/aom_codec_internal.h @@ -189,7 +189,7 @@ typedef const struct aom_codec_ctrl_fn_map { #define CTRL_MAP_END \ { 0, NULL } -static AOM_INLINE int at_ctrl_map_end(aom_codec_ctrl_fn_map_t *e) { +static inline int at_ctrl_map_end(aom_codec_ctrl_fn_map_t *e) { return e->ctrl_id == 0 && e->fn == NULL; } diff --git a/aom_dsp/entenc.h b/aom_dsp/entenc.h index c52088b843..fe8a38f17c 100644 --- a/aom_dsp/entenc.h +++ b/aom_dsp/entenc.h @@ -75,7 +75,7 @@ OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) OD_ARG_NONNULL(1); // buf is the frame bitbuffer, offs is where carry to be added -static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) { +static inline void propagate_carry_bwd(unsigned char *buf, uint32_t offs) { uint16_t sum, carry = 1; do { sum = (uint16_t)buf[offs] + 1; @@ -86,11 +86,10 @@ static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) { // Convert to big-endian byte order and write data to buffer adding the // carry-bit -static AOM_INLINE void write_enc_data_to_out_buf(unsigned char *out, - uint32_t offs, uint64_t output, - uint64_t carry, - uint32_t *enc_offs, - uint8_t num_bytes_ready) { +static inline void write_enc_data_to_out_buf(unsigned char *out, uint32_t offs, + uint64_t output, uint64_t carry, + uint32_t *enc_offs, + uint8_t num_bytes_ready) { const uint64_t reg = HToBE64(output << ((8 - num_bytes_ready) << 3)); memcpy(&out[offs], ®, 8); // Propagate carry backwards if exists diff --git a/aom_dsp/mathutils.h b/aom_dsp/mathutils.h index 746585d6aa..d15569c63f 100644 --- a/aom_dsp/mathutils.h +++ b/aom_dsp/mathutils.h @@ -126,7 +126,7 @@ static inline void multiply_mat(const double *m1, const double *m2, double *res, } } -static AOM_INLINE float approx_exp(float y) { +static inline float approx_exp(float y) { #define A 
((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) #define B \ 127 // Offset for the exponent according to IEEE floating point standard. diff --git a/aom_dsp/x86/sum_squares_avx2.c b/aom_dsp/x86/sum_squares_avx2.c index 7ae58eef59..e2f5327efc 100644 --- a/aom_dsp/x86/sum_squares_avx2.c +++ b/aom_dsp/x86/sum_squares_avx2.c @@ -157,7 +157,7 @@ uint64_t aom_sum_sse_2d_i16_avx2(const int16_t *src, int src_stride, int width, } // Accumulate sum of 16-bit elements in the vector -static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) { +static inline int32_t mm256_accumulate_epi16(__m256i vec_a) { __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); __m128i vtmp2 = _mm256_castsi256_si128(vec_a); vtmp1 = _mm_add_epi16(vtmp1, vtmp2); @@ -171,7 +171,7 @@ static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) { } // Accumulate sum of 32-bit elements in the vector -static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) { +static inline int32_t mm256_accumulate_epi32(__m256i vec_a) { __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); __m128i vtmp2 = _mm256_castsi256_si128(vec_a); vtmp1 = _mm_add_epi32(vtmp1, vtmp2); diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c index 1f8ef656ed..b12c10d8e6 100644 --- a/aom_dsp/x86/sum_squares_sse2.c +++ b/aom_dsp/x86/sum_squares_sse2.c @@ -315,7 +315,7 @@ uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { } // Accumulate sum of 16-bit elements in the vector -static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) { +static inline int32_t mm_accumulate_epi16(__m128i vec_a) { __m128i vtmp = _mm_srli_si128(vec_a, 8); vec_a = _mm_add_epi16(vec_a, vtmp); vtmp = _mm_srli_si128(vec_a, 4); @@ -326,7 +326,7 @@ static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) { } // Accumulate sum of 32-bit elements in the vector -static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) { +static inline int32_t mm_accumulate_epi32(__m128i vec_a) { __m128i vtmp = _mm_srli_si128(vec_a, 8); vec_a = _mm_add_epi32(vec_a, vtmp); vtmp = _mm_srli_si128(vec_a, 4); diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c index 39e3fcf14f..d402697501 100644 --- a/aom_dsp/x86/variance_impl_avx2.c +++ b/aom_dsp/x86/variance_impl_avx2.c @@ -168,7 +168,7 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { // binary size by optimizing the loops more carefully without duplicating the // codes with a macro. 
#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height) \ - static AOM_INLINE int aom_sub_pixel_variance32x##height##_imp_avx2( \ + static inline int aom_sub_pixel_variance32x##height##_imp_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse) { \ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \ diff --git a/aom_ports/mem_ops.h b/aom_ports/mem_ops.h index 4e32fd51a5..e59246ea5b 100644 --- a/aom_ports/mem_ops.h +++ b/aom_ports/mem_ops.h @@ -134,7 +134,7 @@ static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { } #define mem_get_s_generic(end, sz) \ - static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) { \ + static inline signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) { \ const MAU_T *mem = (const MAU_T *)vmem; \ signed MEM_VALUE_T val = mem_get_##end##sz(mem); \ return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz); \ @@ -167,7 +167,7 @@ mem_get_s_generic(le, 32) #undef mem_put_be16 #define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) -static AOM_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) { +static inline void mem_put_be16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 8) & 0xff); @@ -176,7 +176,7 @@ static AOM_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) { #undef mem_put_be24 #define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) -static AOM_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { +static inline void mem_put_be24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 16) & 0xff); @@ -186,7 +186,7 @@ static AOM_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { #undef mem_put_be32 #define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) -static AOM_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { +static inline void mem_put_be32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 24) & 0xff); @@ -197,7 +197,7 @@ static AOM_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { #undef mem_put_le16 #define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) -static AOM_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { +static inline void mem_put_le16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 0) & 0xff); @@ -206,7 +206,7 @@ static AOM_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { #undef mem_put_le24 #define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) -static AOM_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { +static inline void mem_put_le24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 0) & 0xff); @@ -216,7 +216,7 @@ static AOM_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { #undef mem_put_le32 #define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) -static AOM_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { +static inline void mem_put_le32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 0) & 0xff); diff --git a/aom_ports/mem_ops_aligned.h b/aom_ports/mem_ops_aligned.h index 411133d4ef..bdfb034859 100644 --- a/aom_ports/mem_ops_aligned.h +++ b/aom_ports/mem_ops_aligned.h @@ -44,51 +44,51 @@ } while (0) #define swap_endian_32_se(val, raw) swap_endian_32(val, raw) -#define mem_get_ne_aligned_generic(end, sz) \ - static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ - const void *vmem) { \ - const uint##sz##_t *mem = (const 
uint##sz##_t *)vmem; \ - return *mem; \ +#define mem_get_ne_aligned_generic(end, sz) \ + static inline unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + return *mem; \ } -#define mem_get_sne_aligned_generic(end, sz) \ - static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ - const void *vmem) { \ - const int##sz##_t *mem = (const int##sz##_t *)vmem; \ - return *mem; \ +#define mem_get_sne_aligned_generic(end, sz) \ + static inline signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + return *mem; \ } -#define mem_get_se_aligned_generic(end, sz) \ - static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ - const void *vmem) { \ - const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ - unsigned MEM_VALUE_T val, raw = *mem; \ - swap_endian_##sz(val, raw); \ - return val; \ +#define mem_get_se_aligned_generic(end, sz) \ + static inline unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz(val, raw); \ + return val; \ } -#define mem_get_sse_aligned_generic(end, sz) \ - static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ - const void *vmem) { \ - const int##sz##_t *mem = (const int##sz##_t *)vmem; \ - unsigned MEM_VALUE_T val, raw = *mem; \ - swap_endian_##sz##_se(val, raw); \ - return val; \ +#define mem_get_sse_aligned_generic(end, sz) \ + static inline signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz##_se(val, raw); \ + return val; \ } -#define mem_put_ne_aligned_generic(end, sz) \ - static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ - MEM_VALUE_T val) { \ - uint##sz##_t *mem = (uint##sz##_t *)vmem; \ - *mem = (uint##sz##_t)val; \ +#define mem_put_ne_aligned_generic(end, sz) \ + static inline void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem; \ + *mem = (uint##sz##_t)val; \ } -#define mem_put_se_aligned_generic(end, sz) \ - static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ - MEM_VALUE_T val) { \ - uint##sz##_t *mem = (uint##sz##_t *)vmem, raw; \ - swap_endian_##sz(raw, val); \ - *mem = (uint##sz##_t)raw; \ +#define mem_put_se_aligned_generic(end, sz) \ + static inline void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem, raw; \ + swap_endian_##sz(raw, val); \ + *mem = (uint##sz##_t)raw; \ } #include "config/aom_config.h" diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h index 78fe1512e5..a29125db2f 100644 --- a/aom_scale/yv12config.h +++ b/aom_scale/yv12config.h @@ -212,7 +212,7 @@ int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf, * \param[in] aligned_width Aligned width of the image * \param[in] border Border in pixels */ -static AOM_INLINE int aom_calc_y_stride(int aligned_width, int border) { +static inline int aom_calc_y_stride(int aligned_width, int border) { return ((aligned_width + 2 * border) + 31) & ~31; } diff --git a/av1/av1_iface_common.h b/av1/av1_iface_common.h index c1a2a5b252..e47f8b07f8 100644 --- a/av1/av1_iface_common.h +++ b/av1/av1_iface_common.h @@ -18,9 +18,9 @@ extern aom_codec_iface_t 
aom_codec_av1_inspect_algo; -static AOM_INLINE void yuvconfig2image(aom_image_t *img, - const YV12_BUFFER_CONFIG *yv12, - void *user_priv) { +static inline void yuvconfig2image(aom_image_t *img, + const YV12_BUFFER_CONFIG *yv12, + void *user_priv) { /* aom_img_wrap() doesn't allow specifying independent strides for * the Y, U, and V planes, nor other alignment adjustments that * might be representable by a YV12_BUFFER_CONFIG, so we just @@ -83,8 +83,8 @@ static AOM_INLINE void yuvconfig2image(aom_image_t *img, img->metadata = NULL; } -static AOM_INLINE aom_codec_err_t image2yuvconfig(const aom_image_t *img, - YV12_BUFFER_CONFIG *yv12) { +static inline aom_codec_err_t image2yuvconfig(const aom_image_t *img, + YV12_BUFFER_CONFIG *yv12) { yv12->y_buffer = img->planes[AOM_PLANE_Y]; yv12->u_buffer = img->planes[AOM_PLANE_U]; yv12->v_buffer = img->planes[AOM_PLANE_V]; diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c index 044a5f80ba..ac941437d5 100644 --- a/av1/common/arm/highbd_wiener_convolve_neon.c +++ b/av1/common/arm/highbd_wiener_convolve_neon.c @@ -309,7 +309,7 @@ HBD_WIENER_7TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) #undef HBD_WIENER_7TAP_VERT -static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) { +static inline int get_wiener_filter_taps(const int16_t *filter) { assert(filter[7] == 0); if (filter[0] == 0 && filter[6] == 0) { return WIENER_WIN_REDUCED; diff --git a/av1/common/arm/reconinter_neon.c b/av1/common/arm/reconinter_neon.c index a7f368948b..96a36c79d7 100644 --- a/av1/common/arm/reconinter_neon.c +++ b/av1/common/arm/reconinter_neon.c @@ -21,10 +21,12 @@ #include "av1/common/blockd.h" #include "config/av1_rtcd.h" -static AOM_INLINE void diffwtd_mask_d16_neon( - uint8_t *mask, const bool inverse, const CONV_BUF_TYPE *src0, - int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, - ConvolveParams *conv_params, int bd) { +static inline void diffwtd_mask_d16_neon(uint8_t *mask, const bool inverse, + const CONV_BUF_TYPE *src0, + int src0_stride, + const CONV_BUF_TYPE *src1, + int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { const int round = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); const int16x8_t round_vec = vdupq_n_s16((int16_t)(-round)); @@ -125,10 +127,10 @@ void av1_build_compound_diffwtd_mask_d16_neon( } } -static AOM_INLINE void diffwtd_mask_neon(uint8_t *mask, const bool inverse, - const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - int h, int w) { +static inline void diffwtd_mask_neon(uint8_t *mask, const bool inverse, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { if (w >= 16) { int i = 0; do { diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c index e2887b95d8..575db59e48 100644 --- a/av1/common/arm/wiener_convolve_neon.c +++ b/av1/common/arm/wiener_convolve_neon.c @@ -275,7 +275,7 @@ static inline void convolve_add_src_vert_7tap_neon( } while (w != 0); } -static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) { +static inline int get_wiener_filter_taps(const int16_t *filter) { assert(filter[7] == 0); if (filter[0] == 0 && filter[6] == 0) { return WIENER_WIN_REDUCED; diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c index a6e3bb22c2..83549597e1 100644 --- a/av1/common/av1_loopfilter.c +++ b/av1/common/av1_loopfilter.c @@ -903,10 +903,10 @@ static 
AOM_FORCE_INLINE void set_lpf_parameters_for_line_chroma( } } -static AOM_INLINE void filter_vert(uint8_t *dst, int dst_stride, - const AV1_DEBLOCKING_PARAMETERS *params, - const SequenceHeader *seq_params, - USE_FILTER_TYPE use_filter_type) { +static inline void filter_vert(uint8_t *dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, + const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { const loop_filter_thresh *limits = params->lfthr; #if CONFIG_AV1_HIGHBITDEPTH const int use_highbitdepth = seq_params->use_highbitdepth; @@ -1109,10 +1109,11 @@ static AOM_INLINE void filter_vert(uint8_t *dst, int dst_stride, #endif // !CONFIG_AV1_HIGHBITDEPTH } -static AOM_INLINE void filter_vert_chroma( - uint8_t *u_dst, uint8_t *v_dst, int dst_stride, - const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params, - USE_FILTER_TYPE use_filter_type) { +static inline void filter_vert_chroma(uint8_t *u_dst, uint8_t *v_dst, + int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, + const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { const loop_filter_thresh *u_limits = params->lfthr; const loop_filter_thresh *v_limits = params->lfthr; #if CONFIG_AV1_HIGHBITDEPTH @@ -1504,10 +1505,10 @@ void av1_filter_block_plane_vert_opt_chroma( } } -static AOM_INLINE void filter_horz(uint8_t *dst, int dst_stride, - const AV1_DEBLOCKING_PARAMETERS *params, - const SequenceHeader *seq_params, - USE_FILTER_TYPE use_filter_type) { +static inline void filter_horz(uint8_t *dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, + const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { const loop_filter_thresh *limits = params->lfthr; #if CONFIG_AV1_HIGHBITDEPTH const int use_highbitdepth = seq_params->use_highbitdepth; @@ -1710,10 +1711,11 @@ static AOM_INLINE void filter_horz(uint8_t *dst, int dst_stride, #endif // !CONFIG_AV1_HIGHBITDEPTH } -static AOM_INLINE void filter_horz_chroma( - uint8_t *u_dst, uint8_t *v_dst, int dst_stride, - const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params, - USE_FILTER_TYPE use_filter_type) { +static inline void filter_horz_chroma(uint8_t *u_dst, uint8_t *v_dst, + int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, + const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { const loop_filter_thresh *u_limits = params->lfthr; const loop_filter_thresh *v_limits = params->lfthr; #if CONFIG_AV1_HIGHBITDEPTH diff --git a/av1/common/cdef_block.c b/av1/common/cdef_block.c index ad269c7850..318779beda 100644 --- a/av1/common/cdef_block.c +++ b/av1/common/cdef_block.c @@ -292,10 +292,10 @@ static inline int adjust_strength(int strength, int32_t var) { return var ? (strength * (4 + i) + 8) >> 4 : 0; } -static AOM_INLINE void aom_cdef_find_dir(const uint16_t *in, cdef_list *dlist, - int var[CDEF_NBLOCKS][CDEF_NBLOCKS], - int cdef_count, int coeff_shift, - int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) { +static inline void aom_cdef_find_dir(const uint16_t *in, cdef_list *dlist, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], + int cdef_count, int coeff_shift, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) { int bi; // Find direction of two 8x8 blocks together. 
diff --git a/av1/common/entropy.c b/av1/common/entropy.c index 8c5d675e4c..9b76ce5dcc 100644 --- a/av1/common/entropy.c +++ b/av1/common/entropy.c @@ -50,9 +50,8 @@ void av1_default_coef_probs(AV1_COMMON *cm) { av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]); } -static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, - int num_cdfs, int cdf_stride, - int nsymbs) { +static inline void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs, + int cdf_stride, int nsymbs) { for (int i = 0; i < num_cdfs; i++) { cdf_ptr[i * cdf_stride + nsymbs] = 0; } @@ -69,7 +68,7 @@ static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \ } while (0) -static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) { +static inline void reset_nmv_counter(nmv_context *nmv) { RESET_CDF_COUNTER(nmv->joints_cdf, 4); for (int i = 0; i < 2; i++) { RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES); diff --git a/av1/common/filter.h b/av1/common/filter.h index 752b18a59b..7073da2ad4 100644 --- a/av1/common/filter.h +++ b/av1/common/filter.h @@ -294,8 +294,8 @@ static inline uint8_t get_interp_filter_allowed_mask( return (allow_interp_mask >> filt_type) & 1; } -static AOM_INLINE int get_filter_tap( - const InterpFilterParams *const filter_params, int subpel_qn) { +static inline int get_filter_tap(const InterpFilterParams *const filter_params, + int subpel_qn) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_qn & SUBPEL_MASK); if (filter_params->taps == 12) { diff --git a/av1/common/mv.h b/av1/common/mv.h index b731bc875d..0f4fa9c2e8 100644 --- a/av1/common/mv.h +++ b/av1/common/mv.h @@ -76,19 +76,19 @@ typedef struct { int row_max; } SubpelMvLimits; -static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) { +static inline FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) { const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row), (int16_t)GET_MV_RAWPEL(subpel_mv->col) }; return full_mv; } -static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) { +static inline MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) { const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row), (int16_t)GET_MV_SUBPEL(full_mv->col) }; return subpel_mv; } -static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) { +static inline void convert_fullmv_to_mv(int_mv *mv) { mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv); } diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c index b07a9b1a2b..c27fceb6b3 100644 --- a/av1/common/mvref_common.c +++ b/av1/common/mvref_common.c @@ -23,7 +23,7 @@ static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, // TODO(jingning): Consider the use of lookup table for (num / den) // altogether. -static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) { +static inline void get_mv_projection(MV *output, MV ref, int num, int den) { den = AOMMIN(den, MAX_FRAME_DISTANCE); num = num > 0 ? 
AOMMIN(num, MAX_FRAME_DISTANCE) : AOMMAX(num, -MAX_FRAME_DISTANCE); @@ -71,7 +71,7 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm, } } -static AOM_INLINE void add_ref_mv_candidate( +static inline void add_ref_mv_candidate( const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, @@ -139,12 +139,13 @@ static AOM_INLINE void add_ref_mv_candidate( } } -static AOM_INLINE void scan_row_mbmi( - const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col, - const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack, - uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, - uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, - int *processed_rows) { +static inline void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int mi_col, const MV_REFERENCE_FRAME rf[2], + int row_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, + uint8_t *ref_match_count, uint8_t *newmv_count, + int_mv *gm_mv_candidates, int max_row_offset, + int *processed_rows) { int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); const int width_8x8 = mi_size_wide[BLOCK_8X8]; @@ -186,12 +187,13 @@ static AOM_INLINE void scan_row_mbmi( } } -static AOM_INLINE void scan_col_mbmi( - const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, - const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack, - uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, - uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, - int *processed_cols) { +static inline void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int mi_row, const MV_REFERENCE_FRAME rf[2], + int col_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, + uint8_t *ref_match_count, uint8_t *newmv_count, + int_mv *gm_mv_candidates, int max_col_offset, + int *processed_cols) { int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); const int n8_h_8 = mi_size_high[BLOCK_8X8]; @@ -233,12 +235,14 @@ static AOM_INLINE void scan_col_mbmi( } } -static AOM_INLINE void scan_blk_mbmi( - const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row, - const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, - int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, - uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, - uint8_t *refmv_count) { +static inline void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, + const int mi_row, const int mi_col, + const MV_REFERENCE_FRAME rf[2], int row_offset, + int col_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, + uint8_t *ref_match_count, uint8_t *newmv_count, + int_mv *gm_mv_candidates, + uint8_t *refmv_count) { const TileInfo *const tile = &xd->tile; POSITION mi_pos; @@ -415,7 +419,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, return 1; } -static AOM_INLINE void process_compound_ref_mv_candidate( +static inline void process_compound_ref_mv_candidate( const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { @@ -440,7 +444,7 @@ static AOM_INLINE void process_compound_ref_mv_candidate( 
} } -static AOM_INLINE void process_single_ref_mv_candidate( +static inline void process_single_ref_mv_candidate( const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], @@ -471,7 +475,7 @@ static AOM_INLINE void process_single_ref_mv_candidate( } } -static AOM_INLINE void setup_ref_mv_list( +static inline void setup_ref_mv_list( const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], @@ -1327,8 +1331,8 @@ static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { return info_a->map_idx - info_b->map_idx; } -static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx, - REF_FRAME_INFO *ref_info) { +static inline void set_ref_frame_info(int *remapped_ref_idx, int frame_idx, + REF_FRAME_INFO *ref_info) { assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); remapped_ref_idx[frame_idx] = ref_info->map_idx; diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c index 262fec72e9..ec43becc39 100644 --- a/av1/common/reconinter.c +++ b/av1/common/reconinter.c @@ -139,8 +139,8 @@ static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; -static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift, - int width) { +static inline void shift_copy(const uint8_t *src, uint8_t *dst, int shift, + int width) { if (shift >= 0) { memcpy(dst + shift, src, width - shift); memset(dst, src[0], shift); @@ -293,10 +293,11 @@ const uint8_t *av1_get_compound_type_mask( } } -static AOM_INLINE void diffwtd_mask_d16( - uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0, - int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, - ConvolveParams *conv_params, int bd) { +static inline void diffwtd_mask_d16(uint8_t *mask, int which_inverse, + int mask_base, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, + int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { int round = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); int i, j, m, diff; @@ -327,10 +328,10 @@ void av1_build_compound_diffwtd_mask_d16_c( } } -static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse, - int mask_base, const uint8_t *src0, - int src0_stride, const uint8_t *src1, - int src1_stride, int h, int w) { +static inline void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, int h, + int w) { int i, j, m, diff; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { @@ -440,7 +441,7 @@ void av1_build_compound_diffwtd_mask_highbd_c( } #endif // CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void init_wedge_master_masks(void) { +static inline void init_wedge_master_masks(void) { int i, j; const int w = MASK_MASTER_SIZE; const int h = MASK_MASTER_SIZE; @@ -485,7 +486,7 @@ static AOM_INLINE void init_wedge_master_masks(void) { } } -static AOM_INLINE void init_wedge_masks(void) { +static inline void init_wedge_masks(void) { uint8_t *dst = wedge_mask_buf; BLOCK_SIZE bsize; memset(wedge_masks, 0, sizeof(wedge_masks)); @@ -531,9 +532,9 @@ static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = { }; /* clang-format on */ -static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride, - BLOCK_SIZE 
plane_bsize, - INTERINTRA_MODE mode) { +static inline void build_smooth_interintra_mask(uint8_t *mask, int stride, + BLOCK_SIZE plane_bsize, + INTERINTRA_MODE mode) { int i, j; const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; @@ -572,7 +573,7 @@ static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride, } } -static AOM_INLINE void init_smooth_interintra_masks(void) { +static inline void init_smooth_interintra_masks(void) { for (int m = 0; m < INTERINTRA_MODES; ++m) { for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) { const int bw = block_size_wide[bs]; @@ -593,7 +594,7 @@ static void init_all_wedge_masks(void) { void av1_init_wedge_masks(void) { aom_once(init_all_wedge_masks); } -static AOM_INLINE void build_masked_compound_no_round( +static inline void build_masked_compound_no_round( uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, @@ -1046,7 +1047,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE); } -static AOM_INLINE void combine_interintra( +static inline void combine_interintra( INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, uint8_t *comppred, int compstride, const uint8_t *interpred, @@ -1073,7 +1074,7 @@ static AOM_INLINE void combine_interintra( } #if CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void combine_interintra_highbd( +static inline void combine_interintra_highbd( INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, uint8_t *comppred8, int compstride, const uint8_t *interpred8, diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h index b19f1f0635..ed06ee0bb8 100644 --- a/av1/common/reconinter.h +++ b/av1/common/reconinter.h @@ -128,9 +128,10 @@ typedef struct InterPredParams { } InterPredParams; // Initialize sub-pel params required for inter prediction. -static AOM_INLINE void init_subpel_params( - const MV *const src_mv, InterPredParams *const inter_pred_params, - SubpelParams *subpel_params, int width, int height) { +static inline void init_subpel_params(const MV *const src_mv, + InterPredParams *const inter_pred_params, + SubpelParams *subpel_params, int width, + int height) { const struct scale_factors *sf = inter_pred_params->scale_factors; int ssx = inter_pred_params->subsampling_x; int ssy = inter_pred_params->subsampling_y; @@ -165,7 +166,7 @@ static AOM_INLINE void init_subpel_params( } // Initialize interp filter required for inter prediction. -static AOM_INLINE void init_interp_filter_params( +static inline void init_interp_filter_params( const InterpFilterParams *interp_filter_params[2], const InterpFilters *filter, int block_width, int block_height, int is_intrabc) { @@ -181,7 +182,7 @@ static AOM_INLINE void init_interp_filter_params( } // Initialize parameters required for inter prediction at mode level. -static AOM_INLINE void init_inter_mode_params( +static inline void init_inter_mode_params( const MV *const src_mv, InterPredParams *const inter_pred_params, SubpelParams *subpel_params, const struct scale_factors *sf, int width, int height) { @@ -190,10 +191,12 @@ static AOM_INLINE void init_inter_mode_params( } // Initialize parameters required for inter prediction at block level. 
-static AOM_INLINE void init_inter_block_params(
-    InterPredParams *inter_pred_params, int block_width, int block_height,
-    int pix_row, int pix_col, int subsampling_x, int subsampling_y,
-    int bit_depth, int use_hbd_buf, int is_intrabc) {
+static inline void init_inter_block_params(InterPredParams *inter_pred_params,
+                                           int block_width, int block_height,
+                                           int pix_row, int pix_col,
+                                           int subsampling_x, int subsampling_y,
+                                           int bit_depth, int use_hbd_buf,
+                                           int is_intrabc) {
   inter_pred_params->block_width = block_width;
   inter_pred_params->block_height = block_height;
   inter_pred_params->pix_row = pix_row;
@@ -210,7 +213,7 @@ static AOM_INLINE void init_inter_block_params(
 }
 
 // Initialize params required for inter prediction.
-static AOM_INLINE void av1_init_inter_params(
+static inline void av1_init_inter_params(
     InterPredParams *inter_pred_params, int block_width, int block_height,
     int pix_row, int pix_col, int subsampling_x, int subsampling_y,
     int bit_depth, int use_hbd_buf, int is_intrabc,
@@ -226,7 +229,7 @@ static AOM_INLINE void av1_init_inter_params(
   inter_pred_params->ref_frame_buf = *ref_buf;
 }
 
-static AOM_INLINE void av1_init_comp_mode(InterPredParams *inter_pred_params) {
+static inline void av1_init_comp_mode(InterPredParams *inter_pred_params) {
   inter_pred_params->comp_mode = UNIFORM_COMP;
 }
 
diff --git a/av1/common/reconinter_template.inc b/av1/common/reconinter_template.inc
index 2a6161a366..2529023b5c 100644
--- a/av1/common/reconinter_template.inc
+++ b/av1/common/reconinter_template.inc
@@ -14,12 +14,14 @@
 #endif
 
 #if IS_DEC
-static AOM_INLINE void build_one_inter_predictor(
-    uint8_t *dst, int dst_stride, const MV *src_mv,
-    InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
-    int ref, uint8_t **mc_buf) {
+static inline void build_one_inter_predictor(uint8_t *dst, int dst_stride,
+                                             const MV *src_mv,
+                                             InterPredParams *inter_pred_params,
+                                             MACROBLOCKD *xd, int mi_x,
+                                             int mi_y, int ref,
+                                             uint8_t **mc_buf) {
 #else
-static AOM_INLINE void build_one_inter_predictor(
+static inline void build_one_inter_predictor(
     uint8_t *dst, int dst_stride, const MV *src_mv,
     InterPredParams *inter_pred_params) {
 #endif  // IS_DEC
@@ -82,16 +84,16 @@ static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
 }
 
 #if IS_DEC
-static AOM_INLINE void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
-                                                     MACROBLOCKD *xd, int plane,
-                                                     const MB_MODE_INFO *mi,
-                                                     int mi_x, int mi_y,
-                                                     uint8_t **mc_buf) {
+static inline void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
+                                                 MACROBLOCKD *xd, int plane,
+                                                 const MB_MODE_INFO *mi,
+                                                 int mi_x, int mi_y,
+                                                 uint8_t **mc_buf) {
 #else
-static AOM_INLINE void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
-                                                     MACROBLOCKD *xd, int plane,
-                                                     const MB_MODE_INFO *mi,
-                                                     int mi_x, int mi_y) {
+static inline void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
+                                                 MACROBLOCKD *xd, int plane,
+                                                 const MB_MODE_INFO *mi,
+                                                 int mi_x, int mi_y) {
 #endif  // IS_DEC
   const BLOCK_SIZE bsize = mi->bsize;
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -160,11 +162,11 @@ static AOM_INLINE void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
 }
 
 #if IS_DEC
-static AOM_INLINE void build_inter_predictors_8x8_and_bigger(
+static inline void build_inter_predictors_8x8_and_bigger(
     const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
     int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) {
 #else
-static AOM_INLINE void build_inter_predictors_8x8_and_bigger(
+static inline void build_inter_predictors_8x8_and_bigger(
     const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
     int build_for_obmc, int bw, int bh, int mi_x, int mi_y) {
 #endif  // IS_DEC
@@ -237,9 +239,11 @@ static AOM_INLINE void build_inter_predictors_8x8_and_bigger(
 }
 
 #if IS_DEC
-static AOM_INLINE void build_inter_predictors(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
-    int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) {
+static inline void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                          int plane, const MB_MODE_INFO *mi,
+                                          int build_for_obmc, int bw, int bh,
+                                          int mi_x, int mi_y,
+                                          uint8_t **mc_buf) {
   if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
                       build_for_obmc)) {
     assert(bw < 8 || bh < 8);
@@ -250,11 +254,10 @@ static AOM_INLINE void build_inter_predictors(
   }
 }
 #else
-static AOM_INLINE void build_inter_predictors(const AV1_COMMON *cm,
-                                              MACROBLOCKD *xd, int plane,
-                                              const MB_MODE_INFO *mi,
-                                              int build_for_obmc, int bw,
-                                              int bh, int mi_x, int mi_y) {
+static inline void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                          int plane, const MB_MODE_INFO *mi,
+                                          int build_for_obmc, int bw, int bh,
+                                          int mi_x, int mi_y) {
   if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
                       build_for_obmc)) {
     assert(bw < 8 || bh < 8);
diff --git a/av1/common/seg_common.h b/av1/common/seg_common.h
index aa7cd68a9a..3ba50526b8 100644
--- a/av1/common/seg_common.h
+++ b/av1/common/seg_common.h
@@ -96,9 +96,9 @@ static inline int get_segdata(const struct segmentation *seg, int segment_id,
   return seg->feature_data[segment_id][feature_id];
 }
 
-static AOM_INLINE void set_segment_id(uint8_t *segment_ids, int mi_offset,
-                                      int x_mis, int y_mis, int mi_stride,
-                                      uint8_t segment_id) {
+static inline void set_segment_id(uint8_t *segment_ids, int mi_offset,
+                                  int x_mis, int y_mis, int mi_stride,
+                                  uint8_t segment_id) {
   segment_ids += mi_offset;
   for (int y = 0; y < y_mis; ++y) {
     memset(&segment_ids[y * mi_stride], segment_id,
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index d25ba08447..7efed9918e 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -364,8 +364,8 @@ void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync,
   sync_write(lf_sync, sb_row, sb_cols - 1, sb_cols, plane);
 }
 
-static AOM_INLINE void sync_lf_workers(AVxWorker *const workers,
-                                       AV1_COMMON *const cm, int num_workers) {
+static inline void sync_lf_workers(AVxWorker *const workers,
+                                   AV1_COMMON *const cm, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int had_error = workers[0].had_error;
   struct aom_internal_error_info error_info;
@@ -891,8 +891,8 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) {
   return 1;
 }
 
-static AOM_INLINE void sync_lr_workers(AVxWorker *const workers,
-                                       AV1_COMMON *const cm, int num_workers) {
+static inline void sync_lr_workers(AVxWorker *const workers,
+                                   AV1_COMMON *const cm, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int had_error = workers[0].had_error;
   struct aom_internal_error_info error_info;
@@ -993,15 +993,15 @@ void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
 }
 
 // Initializes cdef_sync parameters.
-static AOM_INLINE void reset_cdef_job_info(AV1CdefSync *const cdef_sync) {
+static inline void reset_cdef_job_info(AV1CdefSync *const cdef_sync) {
   cdef_sync->end_of_frame = 0;
   cdef_sync->fbr = 0;
   cdef_sync->fbc = 0;
   cdef_sync->cdef_mt_exit = false;
 }
 
-static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers,
-                                           int num_workers) {
+static inline void launch_cdef_workers(AVxWorker *const workers,
+                                       int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &workers[i];
@@ -1013,9 +1013,8 @@ static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers,
   }
 }
 
-static AOM_INLINE void sync_cdef_workers(AVxWorker *const workers,
-                                         AV1_COMMON *const cm,
-                                         int num_workers) {
+static inline void sync_cdef_workers(AVxWorker *const workers,
+                                     AV1_COMMON *const cm, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int had_error = workers[0].had_error;
   struct aom_internal_error_info error_info;
@@ -1049,9 +1048,8 @@ static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync,
 
 // Checks if a job is available. If job is available,
 // populates next job information and returns 1, else returns 0.
-static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync,
-                                            volatile int *cur_fbr,
-                                            const int nvfb) {
+static inline int get_cdef_row_next_job(AV1CdefSync *const cdef_sync,
+                                        volatile int *cur_fbr, const int nvfb) {
 #if CONFIG_MULTITHREAD
   pthread_mutex_lock(cdef_sync->mutex_);
 #endif  // CONFIG_MULTITHREAD
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index 6a7ff14aae..0c27e12f2e 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -227,10 +227,10 @@ static AOM_FORCE_INLINE bool skip_loop_filter_plane(
   return !planes_to_lf[plane];
 }
 
-static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
-                                       const int planes_to_lf[MAX_MB_PLANE],
-                                       int lpf_opt_level,
-                                       int num_mis_in_lpf_unit_height) {
+static inline void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
+                                   const int planes_to_lf[MAX_MB_PLANE],
+                                   int lpf_opt_level,
+                                   int num_mis_in_lpf_unit_height) {
   int mi_row, plane, dir;
   AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
   lf_sync->jobs_enqueued = 0;
@@ -257,7 +257,7 @@ static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
   }
 }
 
-static AOM_INLINE void loop_filter_frame_mt_init(
+static inline void loop_filter_frame_mt_init(
     AV1_COMMON *cm, int start_mi_row, int end_mi_row,
     const int planes_to_lf[MAX_MB_PLANE], int num_workers, AV1LfSync *lf_sync,
     int lpf_opt_level, int num_mis_in_lpf_unit_height_log2) {
@@ -282,7 +282,7 @@ static AOM_INLINE void loop_filter_frame_mt_init(
                   lpf_opt_level, (1 << num_mis_in_lpf_unit_height_log2));
 }
 
-static AOM_INLINE AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
+static inline AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
   AV1LfMTInfo *cur_job_info = NULL;
 
 #if CONFIG_MULTITHREAD
@@ -301,10 +301,10 @@ static AOM_INLINE AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
   return cur_job_info;
 }
 
-static AOM_INLINE void loop_filter_data_reset(LFWorkerData *lf_data,
-                                              YV12_BUFFER_CONFIG *frame_buffer,
-                                              struct AV1Common *cm,
-                                              MACROBLOCKD *xd) {
+static inline void loop_filter_data_reset(LFWorkerData *lf_data,
+                                          YV12_BUFFER_CONFIG *frame_buffer,
+                                          struct AV1Common *cm,
+                                          MACROBLOCKD *xd) {
   struct macroblockd_plane *pd = xd->plane;
   lf_data->frame_buffer = frame_buffer;
   lf_data->cm = cm;
@@ -316,10 +316,9 @@ static AOM_INLINE void loop_filter_data_reset(LFWorkerData *lf_data,
   }
 }
 
-static AOM_INLINE void set_planes_to_loop_filter(const struct loopfilter *lf,
-                                                 int planes_to_lf[MAX_MB_PLANE],
-                                                 int plane_start,
-                                                 int plane_end) {
+static inline void set_planes_to_loop_filter(const struct loopfilter *lf,
+                                             int planes_to_lf[MAX_MB_PLANE],
+                                             int plane_start, int plane_end) {
   // For each luma and chroma plane, whether to filter it or not.
   planes_to_lf[0] = (lf->filter_level[0] || lf->filter_level[1]) &&
                     plane_start <= 0 && 0 < plane_end;
@@ -327,9 +326,9 @@ static AOM_INLINE void set_planes_to_loop_filter(const struct loopfilter *lf,
   planes_to_lf[2] = lf->filter_level_v && plane_start <= 2 && 2 < plane_end;
 }
 
-static AOM_INLINE int check_planes_to_loop_filter(
-    const struct loopfilter *lf, int planes_to_lf[MAX_MB_PLANE],
-    int plane_start, int plane_end) {
+static inline int check_planes_to_loop_filter(const struct loopfilter *lf,
+                                              int planes_to_lf[MAX_MB_PLANE],
+                                              int plane_start, int plane_end) {
   set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end);
   // If the luma plane is purposely not filtered, neither are the chroma
   // planes.
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 48bb8306cb..98db5ae4da 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -20,7 +20,7 @@
 #include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/x86/synonyms.h"
 
-static AOM_INLINE void av1_convolve_y_sr_general_avx2(
+static inline void av1_convolve_y_sr_general_avx2(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
   // right shift is F-1 because we are already dividing
@@ -524,7 +524,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
   }
 }
 
-static AOM_INLINE void av1_convolve_x_sr_general_avx2(
+static inline void av1_convolve_x_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c
index cd87992ae7..925fe47cf5 100644
--- a/av1/common/x86/jnt_convolve_avx2.c
+++ b/av1/common/x86/jnt_convolve_avx2.c
@@ -820,7 +820,7 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
   } while (0)
 #define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
 
-static AOM_INLINE void av1_dist_wtd_convolve_2d_no_avg_copy_avx2(
+static inline void av1_dist_wtd_convolve_2d_no_avg_copy_avx2(
     const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
     int w, int h, const __m256i offset_const) {
   int i = h;
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 95f6543666..ccfaad2ac7 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -99,7 +99,7 @@ int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
 }
 
 // Use only_chroma = 1 to only set the chroma planes
-static AOM_INLINE void set_planes_to_neutral_grey(
+static inline void set_planes_to_neutral_grey(
     const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf,
    int only_chroma) {
   if (seq_params->use_highbitdepth) {
@@ -128,9 +128,10 @@ static AOM_INLINE void set_planes_to_neutral_grey(
   }
 }
 
-static AOM_INLINE void loop_restoration_read_sb_coeffs(
-    const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane,
-    int runit_idx);
+static inline void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
+                                                   MACROBLOCKD *xd,
+                                                   aom_reader *const r,
+                                                   int plane, int runit_idx);
 
 static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
   return len != 0 && len <= (size_t)(end - start);
 }
@@ -151,11 +152,10 @@ static REFERENCE_MODE read_frame_reference_mode(
   }
 }
 
-static AOM_INLINE void inverse_transform_block(DecoderCodingBlock *dcb,
-                                               int plane, const TX_TYPE tx_type,
-                                               const TX_SIZE tx_size,
-                                               uint8_t *dst, int stride,
-                                               int reduced_tx_set) {
+static inline void inverse_transform_block(DecoderCodingBlock *dcb, int plane,
+                                           const TX_TYPE tx_type,
+                                           const TX_SIZE tx_size, uint8_t *dst,
+                                           int stride, int reduced_tx_set) {
   tran_low_t *const dqcoeff = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
   eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
   uint16_t scan_line = eob_data->max_scan_line;
@@ -165,7 +165,7 @@ static AOM_INLINE void inverse_transform_block(DecoderCodingBlock *dcb,
   memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
 }
 
-static AOM_INLINE void read_coeffs_tx_intra_block(
+static inline void read_coeffs_tx_intra_block(
     const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
     const int plane, const int row, const int col, const TX_SIZE tx_size) {
   MB_MODE_INFO *mbmi = dcb->xd.mi[0];
@@ -184,11 +184,11 @@ static AOM_INLINE void read_coeffs_tx_intra_block(
   }
 }
 
-static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm,
-                                         DecoderCodingBlock *dcb,
-                                         aom_reader *const r, const int plane,
-                                         const int row, const int col,
-                                         const TX_SIZE tx_size) {
+static inline void decode_block_void(const AV1_COMMON *const cm,
+                                     DecoderCodingBlock *dcb,
+                                     aom_reader *const r, const int plane,
+                                     const int row, const int col,
+                                     const TX_SIZE tx_size) {
   (void)cm;
   (void)dcb;
   (void)r;
@@ -198,21 +198,21 @@ static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm,
   (void)tx_size;
 }
 
-static AOM_INLINE void predict_inter_block_void(AV1_COMMON *const cm,
-                                                DecoderCodingBlock *dcb,
-                                                BLOCK_SIZE bsize) {
+static inline void predict_inter_block_void(AV1_COMMON *const cm,
+                                            DecoderCodingBlock *dcb,
+                                            BLOCK_SIZE bsize) {
   (void)cm;
   (void)dcb;
   (void)bsize;
 }
 
-static AOM_INLINE void cfl_store_inter_block_void(AV1_COMMON *const cm,
-                                                  MACROBLOCKD *const xd) {
+static inline void cfl_store_inter_block_void(AV1_COMMON *const cm,
+                                              MACROBLOCKD *const xd) {
   (void)cm;
   (void)xd;
 }
 
-static AOM_INLINE void predict_and_reconstruct_intra_block(
+static inline void predict_and_reconstruct_intra_block(
     const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
     const int plane, const int row, const int col, const TX_SIZE tx_size) {
   (void)r;
@@ -240,7 +240,7 @@ static AOM_INLINE void predict_and_reconstruct_intra_block(
   }
 }
 
-static AOM_INLINE void inverse_transform_inter_block(
+static inline void inverse_transform_inter_block(
     const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
     const int plane, const int blk_row, const int blk_col,
     const TX_SIZE tx_size) {
@@ -272,17 +272,19 @@ static AOM_INLINE void inverse_transform_inter_block(
 #endif
 }
 
-static AOM_INLINE void set_cb_buffer_offsets(DecoderCodingBlock *dcb,
-                                             TX_SIZE tx_size, int plane) {
+static inline void set_cb_buffer_offsets(DecoderCodingBlock *dcb,
+                                         TX_SIZE tx_size, int plane) {
   dcb->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
   dcb->txb_offset[plane] =
       dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
 }
 
-static AOM_INLINE void decode_reconstruct_tx(
-    AV1_COMMON *cm, ThreadData *const td, aom_reader *r,
-    MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row,
-    int blk_col, int block, TX_SIZE tx_size, int *eob_total) {
+static inline void decode_reconstruct_tx(AV1_COMMON *cm, ThreadData *const td,
+                                         aom_reader *r,
+                                         MB_MODE_INFO *const mbmi, int plane,
+                                         BLOCK_SIZE plane_bsize, int blk_row,
+                                         int blk_col, int block,
+                                         TX_SIZE tx_size, int *eob_total) {
   DecoderCodingBlock *const dcb = &td->dcb;
   MACROBLOCKD *const xd = &dcb->xd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -333,9 +335,9 @@ static AOM_INLINE void decode_reconstruct_tx(
   }
 }
 
-static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                   int bw, int bh, int x_mis, int y_mis) {
+static inline void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                               BLOCK_SIZE bsize, int mi_row, int mi_col, int bw,
+                               int bh, int x_mis, int y_mis) {
   const int num_planes = av1_num_planes(cm);
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const TileInfo *const tile = &xd->tile;
@@ -367,11 +369,11 @@ static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
                  num_planes);
 }
 
-static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi,
-                                         DecoderCodingBlock *dcb, int mi_row,
-                                         int mi_col, aom_reader *r,
-                                         PARTITION_TYPE partition,
-                                         BLOCK_SIZE bsize) {
+static inline void decode_mbmi_block(AV1Decoder *const pbi,
+                                     DecoderCodingBlock *dcb, int mi_row,
+                                     int mi_col, aom_reader *r,
+                                     PARTITION_TYPE partition,
+                                     BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
   const SequenceHeader *const seq_params = cm->seq_params;
   const int bw = mi_size_wide[bsize];
@@ -405,10 +407,10 @@ typedef struct PadBlock {
 } PadBlock;
 
 #if CONFIG_AV1_HIGHBITDEPTH
-static AOM_INLINE void highbd_build_mc_border(const uint8_t *src8,
-                                              int src_stride, uint8_t *dst8,
-                                              int dst_stride, int x, int y,
-                                              int b_w, int b_h, int w, int h) {
+static inline void highbd_build_mc_border(const uint8_t *src8, int src_stride,
+                                          uint8_t *dst8, int dst_stride, int x,
+                                          int y, int b_w, int b_h, int w,
+                                          int h) {
   // Get a pointer to the start of the real data for this row.
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -445,9 +447,9 @@ static AOM_INLINE void highbd_build_mc_border(const uint8_t *src8,
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-static AOM_INLINE void build_mc_border(const uint8_t *src, int src_stride,
-                                       uint8_t *dst, int dst_stride, int x,
-                                       int y, int b_w, int b_h, int w, int h) {
+static inline void build_mc_border(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride, int x, int y,
+                                   int b_w, int b_h, int w, int h) {
   // Get a pointer to the start of the real data for this row.
   const uint8_t *ref_row = src - x - y * src_stride;
@@ -555,7 +557,7 @@ static inline void extend_mc_border(const struct scale_factors *const sf,
   }
 }
 
-static AOM_INLINE void dec_calc_subpel_params(
+static inline void dec_calc_subpel_params(
     const MV *const src_mv, InterPredParams *const inter_pred_params,
     const MACROBLOCKD *const xd, int mi_x, int mi_y, uint8_t **pre,
     SubpelParams *subpel_params, int *src_stride, PadBlock *block,
@@ -641,7 +643,7 @@ static AOM_INLINE void dec_calc_subpel_params(
   *src_stride = pre_buf->stride;
 }
 
-static AOM_INLINE void dec_calc_subpel_params_and_extend(
+static inline void dec_calc_subpel_params_and_extend(
     const MV *const src_mv, InterPredParams *const inter_pred_params,
     MACROBLOCKD *const xd, int mi_x, int mi_y, int ref, uint8_t **mc_buf,
     uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
@@ -671,10 +673,10 @@ static void dec_build_inter_predictors(const AV1_COMMON *cm,
                         mi_y, dcb->mc_buf);
 }
 
-static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm,
-                                                 DecoderCodingBlock *dcb,
-                                                 int mi_row, int mi_col,
-                                                 BLOCK_SIZE bsize) {
+static inline void dec_build_inter_predictor(const AV1_COMMON *cm,
+                                             DecoderCodingBlock *dcb,
+                                             int mi_row, int mi_col,
+                                             BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &dcb->xd;
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -726,7 +728,7 @@ static inline void dec_build_prediction_by_above_pred(
   }
 }
 
-static AOM_INLINE void dec_build_prediction_by_above_preds(
+static inline void dec_build_prediction_by_above_preds(
    const AV1_COMMON *cm, DecoderCodingBlock *dcb,
    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
@@ -781,7 +783,7 @@ static inline void dec_build_prediction_by_left_pred(
   }
 }
 
-static AOM_INLINE void dec_build_prediction_by_left_preds(
+static inline void dec_build_prediction_by_left_preds(
    const AV1_COMMON *cm, DecoderCodingBlock *dcb,
    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
@@ -808,8 +810,8 @@ static AOM_INLINE void dec_build_prediction_by_left_preds(
   xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
 }
 
-static AOM_INLINE void dec_build_obmc_inter_predictors_sb(
-    const AV1_COMMON *cm, DecoderCodingBlock *dcb) {
+static inline void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
+                                                      DecoderCodingBlock *dcb) {
   const int num_planes = av1_num_planes(cm);
   uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
   int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -834,17 +836,17 @@ static AOM_INLINE void dec_build_obmc_inter_predictors_sb(
                                   dst_stride2);
 }
 
-static AOM_INLINE void cfl_store_inter_block(AV1_COMMON *const cm,
-                                             MACROBLOCKD *const xd) {
+static inline void cfl_store_inter_block(AV1_COMMON *const cm,
+                                         MACROBLOCKD *const xd) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   if (store_cfl_required(cm, xd)) {
     cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
   }
 }
 
-static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm,
-                                           DecoderCodingBlock *dcb,
-                                           BLOCK_SIZE bsize) {
+static inline void predict_inter_block(AV1_COMMON *const cm,
+                                       DecoderCodingBlock *dcb,
+                                       BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   const int num_planes = av1_num_planes(cm);
@@ -888,8 +890,8 @@ static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm,
 #endif
 }
 
-static AOM_INLINE void set_color_index_map_offset(MACROBLOCKD *const xd,
-                                                  int plane, aom_reader *r) {
+static inline void set_color_index_map_offset(MACROBLOCKD *const xd, int plane,
+                                              aom_reader *r) {
   (void)r;
   Av1ColorMapParam params;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -898,10 +900,9 @@ static AOM_INLINE void set_color_index_map_offset(MACROBLOCKD *const xd,
   xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
 }
 
-static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi,
-                                                ThreadData *const td,
-                                                aom_reader *r,
-                                                BLOCK_SIZE bsize) {
+static inline void decode_token_recon_block(AV1Decoder *const pbi,
+                                            ThreadData *const td, aom_reader *r,
+                                            BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
   DecoderCodingBlock *const dcb = &td->dcb;
   MACROBLOCKD *const xd = &dcb->xd;
@@ -1010,10 +1011,10 @@ static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi,
   av1_visit_palette(pbi, xd, r, set_color_index_map_offset);
 }
 
-static AOM_INLINE void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2,
-                                         int tx_w_log2, int tx_h_log2,
-                                         int min_txs, int split_size, int txs,
-                                         int blk_row, int blk_col) {
+static inline void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2,
+                                     int tx_w_log2, int tx_h_log2, int min_txs,
+                                     int split_size, int txs, int blk_row,
+                                     int blk_col) {
   for (int idy = 0; idy < tx_size_high_unit[split_size];
        idy += tx_size_high_unit[min_txs]) {
     for (int idx = 0; idx < tx_size_wide_unit[split_size];
@@ -1025,10 +1026,9 @@ static AOM_INLINE void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2,
   }
 }
 
-static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
-                                          TX_SIZE tx_size, int depth,
-                                          int blk_row, int blk_col,
-                                          aom_reader *r) {
+static inline void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+                                      TX_SIZE tx_size, int depth, int blk_row,
+                                      int blk_col, aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   int is_split = 0;
   const BLOCK_SIZE bsize = mbmi->bsize;
@@ -1124,11 +1124,11 @@ static TX_SIZE read_tx_size(const MACROBLOCKD *const xd, TX_MODE tx_mode,
   }
 }
 
-static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi,
-                                          ThreadData *const td, int mi_row,
-                                          int mi_col, aom_reader *r,
-                                          PARTITION_TYPE partition,
-                                          BLOCK_SIZE bsize) {
+static inline void parse_decode_block(AV1Decoder *const pbi,
+                                      ThreadData *const td, int mi_row,
+                                      int mi_col, aom_reader *r,
+                                      PARTITION_TYPE partition,
+                                      BLOCK_SIZE bsize) {
   DecoderCodingBlock *const dcb = &td->dcb;
   MACROBLOCKD *const xd = &dcb->xd;
   decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize);
@@ -1183,10 +1183,10 @@ static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi,
   decode_token_recon_block(pbi, td, r, bsize);
 }
 
-static AOM_INLINE void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
-                                                      ThreadData *const td,
-                                                      int mi_row, int mi_col,
-                                                      BLOCK_SIZE bsize) {
+static inline void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
+                                                  ThreadData *const td,
+                                                  int mi_row, int mi_col,
+                                                  BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   DecoderCodingBlock *const dcb = &td->dcb;
@@ -1214,10 +1214,9 @@ static AOM_INLINE void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
                  num_planes);
 }
 
-static AOM_INLINE void decode_block(AV1Decoder *const pbi, ThreadData *const td,
-                                    int mi_row, int mi_col, aom_reader *r,
-                                    PARTITION_TYPE partition,
-                                    BLOCK_SIZE bsize) {
+static inline void decode_block(AV1Decoder *const pbi, ThreadData *const td,
+                                int mi_row, int mi_col, aom_reader *r,
+                                PARTITION_TYPE partition, BLOCK_SIZE bsize) {
   (void)partition;
   set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
   decode_token_recon_block(pbi, td, r, bsize);
@@ -1253,11 +1252,9 @@ static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
 }
 
 // TODO(slavarnway): eliminate bsize and subsize in future commits
-static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
-                                        ThreadData *const td, int mi_row,
-                                        int mi_col, aom_reader *reader,
-                                        BLOCK_SIZE bsize,
-                                        int parse_decode_flag) {
+static inline void decode_partition(AV1Decoder *const pbi, ThreadData *const td,
+                                    int mi_row, int mi_col, aom_reader *reader,
+                                    BLOCK_SIZE bsize, int parse_decode_flag) {
   assert(bsize < BLOCK_SIZES_ALL);
   AV1_COMMON *const cm = &pbi->common;
   DecoderCodingBlock *const dcb = &td->dcb;
@@ -1403,7 +1400,7 @@ static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
-static AOM_INLINE void setup_bool_decoder(
+static inline void setup_bool_decoder(
     MACROBLOCKD *const xd, const uint8_t *data, const uint8_t *data_end,
     const size_t read_size, struct aom_internal_error_info *error_info,
     aom_reader *r, uint8_t allow_update_cdf) {
@@ -1432,8 +1429,8 @@ static AOM_INLINE void setup_bool_decoder(
   r->allow_update_cdf = allow_update_cdf;
 }
 
-static AOM_INLINE void setup_segmentation(AV1_COMMON *const cm,
-                                          struct aom_read_bit_buffer *rb) {
+static inline void setup_segmentation(AV1_COMMON *const cm,
+                                      struct aom_read_bit_buffer *rb) {
   struct segmentation *const seg = &cm->seg;
 
   seg->update_map = 0;
@@ -1507,8 +1504,8 @@ static AOM_INLINE void setup_segmentation(AV1_COMMON *const cm,
   segfeatures_copy(&cm->cur_frame->seg, seg);
 }
 
-static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm,
-                                               struct aom_read_bit_buffer *rb) {
+static inline void decode_restoration_mode(AV1_COMMON *cm,
+                                           struct aom_read_bit_buffer *rb) {
   assert(!cm->features.all_lossless);
   const int num_planes = av1_num_planes(cm);
   if (cm->features.allow_intrabc) return;
@@ -1564,10 +1561,9 @@ static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm,
   }
 }
 
-static AOM_INLINE void read_wiener_filter(int wiener_win,
-                                          WienerInfo *wiener_info,
-                                          WienerInfo *ref_wiener_info,
-                                          aom_reader *rb) {
+static inline void read_wiener_filter(int wiener_win, WienerInfo *wiener_info,
+                                      WienerInfo *ref_wiener_info,
+                                      aom_reader *rb) {
   memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter));
   memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter));
 
@@ -1625,9 +1621,9 @@ static AOM_INLINE void read_wiener_filter(int wiener_win,
   memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
 }
 
-static AOM_INLINE void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
-                                           SgrprojInfo *ref_sgrproj_info,
-                                           aom_reader *rb) {
+static inline void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
+                                       SgrprojInfo *ref_sgrproj_info,
+                                       aom_reader *rb) {
   sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
   const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
 
@@ -1662,9 +1658,10 @@ static AOM_INLINE void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
 }
 
-static AOM_INLINE void loop_restoration_read_sb_coeffs(
-    const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane,
-    int runit_idx) {
+static inline void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
+                                                   MACROBLOCKD *xd,
+                                                   aom_reader *const r,
+                                                   int plane, int runit_idx) {
   const RestorationInfo *rsi = &cm->rst_info[plane];
   RestorationUnitInfo *rui = &rsi->unit_info[runit_idx];
   assert(rsi->frame_restoration_type != RESTORE_NONE);
@@ -1705,8 +1702,8 @@ static AOM_INLINE void loop_restoration_read_sb_coeffs(
   }
 }
 
-static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm,
-                                        struct aom_read_bit_buffer *rb) {
+static inline void setup_loopfilter(AV1_COMMON *cm,
+                                    struct aom_read_bit_buffer *rb) {
   const int num_planes = av1_num_planes(cm);
   struct loopfilter *lf = &cm->lf;
 
@@ -1758,8 +1755,7 @@ static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm,
   memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
 }
 
-static AOM_INLINE void setup_cdef(AV1_COMMON *cm,
-                                  struct aom_read_bit_buffer *rb) {
+static inline void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
   const int num_planes = av1_num_planes(cm);
   CdefInfo *const cdef_info = &cm->cdef_info;
 
@@ -1778,10 +1774,9 @@ static inline int read_delta_q(struct aom_read_bit_buffer *rb) {
   return aom_rb_read_bit(rb) ? aom_rb_read_inv_signed_literal(rb, 6) : 0;
 }
 
-static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params,
-                                          int num_planes,
-                                          bool separate_uv_delta_q,
-                                          struct aom_read_bit_buffer *rb) {
+static inline void setup_quantization(CommonQuantParams *quant_params,
+                                      int num_planes, bool separate_uv_delta_q,
+                                      struct aom_read_bit_buffer *rb) {
   quant_params->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
   quant_params->y_dc_delta_q = read_delta_q(rb);
   if (num_planes > 1) {
@@ -1818,8 +1813,8 @@ static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params,
 }
 
 // Build y/uv dequant values based on segmentation.
-static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm,
-                                                  MACROBLOCKD *const xd) {
+static inline void setup_segmentation_dequant(AV1_COMMON *const cm,
+                                              MACROBLOCKD *const xd) {
   const int bit_depth = cm->seq_params->bit_depth;
   // When segmentation is disabled, only the first value is used.  The
   // remaining are don't cares.
@@ -1867,8 +1862,8 @@ static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) {
              : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS);
 }
 
-static AOM_INLINE void setup_render_size(AV1_COMMON *cm,
-                                         struct aom_read_bit_buffer *rb) {
+static inline void setup_render_size(AV1_COMMON *cm,
+                                     struct aom_read_bit_buffer *rb) {
   cm->render_width = cm->superres_upscaled_width;
   cm->render_height = cm->superres_upscaled_height;
   if (aom_rb_read_bit(rb))
@@ -1876,9 +1871,9 @@ static AOM_INLINE void setup_render_size(AV1_COMMON *cm,
 }
 
 // TODO(afergs): make "struct aom_read_bit_buffer *const rb"?
-static AOM_INLINE void setup_superres(AV1_COMMON *const cm,
-                                      struct aom_read_bit_buffer *rb,
-                                      int *width, int *height) {
+static inline void setup_superres(AV1_COMMON *const cm,
+                                  struct aom_read_bit_buffer *rb, int *width,
+                                  int *height) {
   cm->superres_upscaled_width = *width;
   cm->superres_upscaled_height = *height;
 
@@ -1899,8 +1894,8 @@ static AOM_INLINE void setup_superres(AV1_COMMON *const cm,
   }
 }
 
-static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width,
-                                              int height) {
+static inline void resize_context_buffers(AV1_COMMON *cm, int width,
+                                          int height) {
 #if CONFIG_SIZE_LIMIT
   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
     aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -1937,7 +1932,7 @@ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width,
   cm->cur_frame->height = cm->height;
 }
 
-static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) {
+static inline void setup_buffer_pool(AV1_COMMON *cm) {
   BufferPool *const pool = cm->buffer_pool;
   const SequenceHeader *const seq_params = cm->seq_params;
 
@@ -1967,9 +1962,9 @@ static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) {
   cm->cur_frame->buf.render_height = cm->render_height;
 }
 
-static AOM_INLINE void setup_frame_size(AV1_COMMON *cm,
-                                        int frame_size_override_flag,
-                                        struct aom_read_bit_buffer *rb) {
+static inline void setup_frame_size(AV1_COMMON *cm,
+                                    int frame_size_override_flag,
+                                    struct aom_read_bit_buffer *rb) {
   const SequenceHeader *const seq_params = cm->seq_params;
   int width, height;
 
@@ -1993,8 +1988,8 @@ static AOM_INLINE void setup_frame_size(AV1_COMMON *cm,
   setup_buffer_pool(cm);
 }
 
-static AOM_INLINE void setup_sb_size(SequenceHeader *seq_params,
-                                     struct aom_read_bit_buffer *rb) {
+static inline void setup_sb_size(SequenceHeader *seq_params,
+                                 struct aom_read_bit_buffer *rb) {
   set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
 }
 
@@ -2006,8 +2001,8 @@ static inline int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth,
          ref_yss == this_yss;
 }
 
-static AOM_INLINE void setup_frame_size_with_refs(
-    AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static inline void setup_frame_size_with_refs(AV1_COMMON *cm,
+                                              struct aom_read_bit_buffer *rb) {
   int width, height;
   int found = 0;
   int has_valid_ref_frame = 0;
@@ -2086,7 +2081,7 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
   return (v << 1) - m + aom_rb_read_bit(rb);
 }
 
-static AOM_INLINE void read_tile_info_max_tile(
+static inline void read_tile_info_max_tile(
     AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) {
   const SequenceHeader *const seq_params = cm->seq_params;
   CommonTileParams *const tiles = &cm->tiles;
@@ -2170,8 +2165,8 @@ void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) {
   }
 }
 
-static AOM_INLINE void read_tile_info(AV1Decoder *const pbi,
-                                      struct aom_read_bit_buffer *const rb) {
+static inline void read_tile_info(AV1Decoder *const pbi,
+                                  struct aom_read_bit_buffer *const rb) {
  AV1_COMMON *const cm = &pbi->common;
 
   read_tile_info_max_tile(cm, rb);
@@ -2191,8 +2186,8 @@ static AOM_INLINE void read_tile_info(AV1Decoder *const pbi,
 }
 
 #if EXT_TILE_DEBUG
-static AOM_INLINE void read_ext_tile_info(
-    AV1Decoder *const pbi, struct aom_read_bit_buffer *const rb) {
+static inline void read_ext_tile_info(AV1Decoder *const pbi,
+                                      struct aom_read_bit_buffer *const rb) {
   AV1_COMMON *const cm = &pbi->common;
 
   // This information is stored as a separate byte.
@@ -2222,7 +2217,7 @@ static size_t mem_get_varsize(const uint8_t *src, int sz) {
 // Reads the next tile returning its size and adjusting '*data' accordingly
 // based on 'is_last'. On return, '*data' is updated to point to the end of the
 // raw tile buffer in the bit stream.
-static AOM_INLINE void get_ls_tile_buffer(
+static inline void get_ls_tile_buffer(
     const uint8_t *const data_end, struct aom_internal_error_info *error_info,
     const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
     int tile_size_bytes, int col, int row, int tile_copy_mode) {
@@ -2387,10 +2382,11 @@ static const uint8_t *get_ls_single_tile_buffer(
 
 // Reads the next tile returning its size and adjusting '*data' accordingly
 // based on 'is_last'.
-static AOM_INLINE void get_tile_buffer(
-    const uint8_t *const data_end, const int tile_size_bytes, int is_last,
-    struct aom_internal_error_info *error_info, const uint8_t **data,
-    TileBufferDec *const buf) {
+static inline void get_tile_buffer(const uint8_t *const data_end,
+                                   const int tile_size_bytes, int is_last,
+                                   struct aom_internal_error_info *error_info,
+                                   const uint8_t **data,
+                                   TileBufferDec *const buf) {
   size_t size;
 
   if (!is_last) {
@@ -2414,7 +2410,7 @@ static AOM_INLINE void get_tile_buffer(
   *data += size;
 }
 
-static AOM_INLINE void get_tile_buffers(
+static inline void get_tile_buffers(
     AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
     TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int start_tile,
     int end_tile) {
@@ -2442,10 +2438,9 @@ static AOM_INLINE void get_tile_buffers(
   }
 }
 
-static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb,
-                                     CB_BUFFER *cb_buffer_base,
-                                     const int num_planes, int mi_row,
-                                     int mi_col) {
+static inline void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb,
+                                 CB_BUFFER *cb_buffer_base,
+                                 const int num_planes, int mi_row, int mi_col) {
   AV1_COMMON *const cm = &pbi->common;
   int mib_size_log2 = cm->seq_params->mib_size_log2;
   int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
@@ -2465,8 +2460,7 @@ static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb,
   xd->color_index_map_offset[1] = 0;
 }
 
-static AOM_INLINE void decoder_alloc_tile_data(AV1Decoder *pbi,
-                                               const int n_tiles) {
+static inline void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) {
   AV1_COMMON *const cm = &pbi->common;
   aom_free(pbi->tile_data);
   pbi->allocated_tiles = 0;
@@ -2499,8 +2493,8 @@ static inline int get_sync_range(int width) {
 }
 
 // Allocate memory for decoder row synchronization
-static AOM_INLINE void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync,
-                                        AV1_COMMON *cm, int rows) {
+static inline void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync,
+                                    AV1_COMMON *cm, int rows) {
   dec_row_mt_sync->allocated_sb_rows = rows;
 #if CONFIG_MULTITHREAD
   {
@@ -2624,9 +2618,9 @@ static inline void signal_decoding_done_for_erroneous_row(
                                       sb_cols_in_tile);
 }
 
-static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
-                                          const TileInfo *tile_info,
-                                          const int mi_row) {
+static inline void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
                                       const TileInfo *tile_info,
+                                      const int mi_row) {
   AV1_COMMON *const cm = &pbi->common;
   const int num_planes = av1_num_planes(cm);
   TileDataDec *const tile_data = pbi->tile_data +
@@ -2687,8 +2681,8 @@ static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
   return 0;
 }
 
-static AOM_INLINE void set_decode_func_pointers(ThreadData *td,
-                                                int parse_decode_flag) {
+static inline void set_decode_func_pointers(ThreadData *td,
+                                            int parse_decode_flag) {
   td->read_coeffs_tx_intra_block_visit = decode_block_void;
   td->predict_and_recon_intra_block_visit = decode_block_void;
   td->read_coeffs_tx_inter_block_visit = decode_block_void;
@@ -2709,8 +2703,8 @@ static AOM_INLINE void set_decode_func_pointers(ThreadData *td,
   }
 }
 
-static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td,
-                                   int tile_row, int tile_col) {
+static inline void decode_tile(AV1Decoder *pbi, ThreadData *const td,
+                               int tile_row, int tile_col) {
   TileInfo tile_info;
 
   AV1_COMMON *const cm = &pbi->common;
@@ -2915,10 +2909,11 @@ static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) {
   return cur_job_info;
 }
 
-static AOM_INLINE void tile_worker_hook_init(
-    AV1Decoder *const pbi, DecWorkerData *const thread_data,
-    const TileBufferDec *const tile_buffer, TileDataDec *const tile_data,
-    uint8_t allow_update_cdf) {
+static inline void tile_worker_hook_init(AV1Decoder *const pbi,
+                                         DecWorkerData *const thread_data,
+                                         const TileBufferDec *const tile_buffer,
+                                         TileDataDec *const tile_data,
+                                         uint8_t allow_update_cdf) {
   AV1_COMMON *cm = &pbi->common;
   ThreadData *const td = thread_data->td;
   int tile_row = tile_data->tile_info.tile_row;
@@ -3152,8 +3147,8 @@ static inline void signal_parse_sb_row_done(AV1Decoder *const pbi,
 
 // This function is very similar to decode_tile(). It would be good to figure
 // out how to share code.
-static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td,
-                                         TileDataDec *const tile_data) {
+static inline void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td,
                                      TileDataDec *const tile_data) {
   AV1_COMMON *const cm = &pbi->common;
   const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
   const int num_planes = av1_num_planes(cm);
@@ -3326,10 +3321,10 @@ static int compare_tile_buffers(const void *a, const void *b) {
   return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size));
 }
 
-static AOM_INLINE void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm,
-                                         int tile_rows_start, int tile_rows_end,
-                                         int tile_cols_start, int tile_cols_end,
-                                         int start_tile, int end_tile) {
+static inline void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm,
+                                     int tile_rows_start, int tile_rows_end,
+                                     int tile_cols_start, int tile_cols_end,
+                                     int start_tile, int end_tile) {
   AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info;
   TileJobsDec *tile_job_queue = tile_mt_info->job_queue;
   tile_mt_info->jobs_enqueued = 0;
@@ -3348,9 +3343,8 @@ static AOM_INLINE void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm,
   }
 }
 
-static AOM_INLINE void alloc_dec_jobs(AV1DecTileMT *tile_mt_info,
-                                      AV1_COMMON *cm, int tile_rows,
-                                      int tile_cols) {
+static inline void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
+                                  int tile_rows, int tile_cols) {
   tile_mt_info->alloc_tile_rows = tile_rows;
   tile_mt_info->alloc_tile_cols = tile_cols;
   int num_tiles = tile_rows * tile_cols;
@@ -3390,9 +3384,9 @@ void av1_free_mc_tmp_buf(ThreadData *thread_data) {
   }
 }
 
-static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
-                                           ThreadData *thread_data,
-                                           int buf_size, int use_highbd) {
+static inline void allocate_mc_tmp_buf(AV1_COMMON *const cm,
+                                       ThreadData *thread_data, int buf_size,
+                                       int use_highbd) {
   for (int ref = 0; ref < 2; ref++) {
     // The mc_buf/hbd_mc_buf must be zeroed to fix a intermittent valgrind error
    // 'Conditional jump or move depends on uninitialised value' from the loop
@@ -3428,9 +3422,8 @@ static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
   }
 }
 
-static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi,
-                                         AVxWorkerHook worker_hook,
-                                         int num_workers) {
+static inline void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
+                                     int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 
   // Reset tile decoding hook
@@ -3461,9 +3454,8 @@ static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi,
 #endif
 }
 
-static AOM_INLINE void launch_dec_workers(AV1Decoder *pbi,
-                                          const uint8_t *data_end,
-                                          int num_workers) {
+static inline void launch_dec_workers(AV1Decoder *pbi, const uint8_t *data_end,
                                      int num_workers) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 
   for (int worker_idx = num_workers - 1; worker_idx >= 0; --worker_idx) {
@@ -3481,7 +3473,7 @@ static AOM_INLINE void launch_dec_workers(AV1Decoder *pbi,
   }
 }
 
-static AOM_INLINE void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
+static inline void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int corrupted = 0;
 
@@ -3493,7 +3485,7 @@ static AOM_INLINE void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
   pbi->dcb.corrupted = corrupted;
 }
 
-static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
+static inline void decode_mt_init(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int worker_idx;
@@ -3542,11 +3534,10 @@ static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
   }
 }
 
-static AOM_INLINE void tile_mt_queue(AV1Decoder *pbi, int tile_cols,
-                                     int tile_rows, int tile_rows_start,
-                                     int tile_rows_end, int tile_cols_start,
-                                     int tile_cols_end, int start_tile,
-                                     int end_tile) {
+static inline void tile_mt_queue(AV1Decoder *pbi, int tile_cols, int tile_rows,
+                                 int tile_rows_start, int tile_rows_end,
+                                 int tile_cols_start, int tile_cols_end,
+                                 int start_tile, int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
   if (pbi->tile_mt_info.alloc_tile_cols != tile_cols ||
       pbi->tile_mt_info.alloc_tile_rows != tile_rows) {
@@ -3659,7 +3650,7 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
   return aom_reader_find_end(&tile_data->bit_reader);
 }
 
-static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) {
+static inline void dec_alloc_cb_buf(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
   int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) *
              ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1);
@@ -3673,10 +3664,10 @@ static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) {
   }
 }
 
-static AOM_INLINE void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
-                                         int tile_rows_end, int tile_cols_start,
-                                         int tile_cols_end, int start_tile,
-                                         int end_tile, int max_sb_rows) {
+static inline void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
+                                     int tile_rows_end, int tile_cols_start,
+                                     int tile_cols_end, int start_tile,
+                                     int end_tile, int max_sb_rows) {
   AV1_COMMON *const cm = &pbi->common;
   AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 
@@ -3870,7 +3861,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
   return aom_reader_find_end(&tile_data->bit_reader);
 }
 
-static AOM_INLINE void error_handler(void *data) {
+static inline void error_handler(void *data) {
   AV1_COMMON *const cm = (AV1_COMMON *)data;
   aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
 
@@ -3879,9 +3870,9 @@ static AOM_INLINE void error_handler(void *data) {
 // seq_params->bit_depth based on the values of those fields and
 // seq_params->profile. Reports errors by calling rb->error_handler() or
 // aom_internal_error().
-static AOM_INLINE void read_bitdepth(
-    struct aom_read_bit_buffer *rb, SequenceHeader *seq_params,
-    struct aom_internal_error_info *error_info) {
+static inline void read_bitdepth(struct aom_read_bit_buffer *rb,
+                                 SequenceHeader *seq_params,
+                                 struct aom_internal_error_info *error_info) {
   const int high_bitdepth = aom_rb_read_bit(rb);
   if (seq_params->profile == PROFILE_2 && high_bitdepth) {
     const int twelve_bit = aom_rb_read_bit(rb);
@@ -4065,8 +4056,8 @@ static void read_film_grain_params(AV1_COMMON *cm,
   pars->clip_to_restricted_range = aom_rb_read_bit(rb);
 }
 
-static AOM_INLINE void read_film_grain(AV1_COMMON *cm,
-                                       struct aom_read_bit_buffer *rb) {
+static inline void read_film_grain(AV1_COMMON *cm,
+                                   struct aom_read_bit_buffer *rb) {
   if (cm->seq_params->film_grain_params_present &&
       (cm->show_frame || cm->showable_frame)) {
     read_film_grain_params(cm, rb);
@@ -4204,8 +4195,8 @@ void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params,
   op_params->low_delay_mode_flag = aom_rb_read_bit(rb);
 }
 
-static AOM_INLINE void read_temporal_point_info(
-    AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) {
+static inline void read_temporal_point_info(AV1_COMMON *const cm,
+                                            struct aom_read_bit_buffer *rb) {
   cm->frame_presentation_time = aom_rb_read_unsigned_literal(
      rb, cm->seq_params->decoder_model_info.frame_presentation_time_length);
 }
@@ -4364,8 +4355,8 @@ static int read_global_motion_params(WarpedMotionParams *params,
   return 1;
 }
 
-static AOM_INLINE void read_global_motion(AV1_COMMON *cm,
-                                          struct aom_read_bit_buffer *rb) {
+static inline void read_global_motion(AV1_COMMON *cm,
+                                      struct aom_read_bit_buffer *rb) {
   for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
     const WarpedMotionParams *ref_params =
         cm->prev_frame ? &cm->prev_frame->global_motion[frame]
@@ -4409,7 +4400,7 @@ static AOM_INLINE void read_global_motion(AV1_COMMON *cm,
 
 // Release the references to the frame buffers in cm->ref_frame_map and reset
 // all elements of cm->ref_frame_map to NULL.
-static AOM_INLINE void reset_ref_frame_map(AV1_COMMON *const cm) {
+static inline void reset_ref_frame_map(AV1_COMMON *const cm) {
   BufferPool *const pool = cm->buffer_pool;
 
   for (int i = 0; i < REF_FRAMES; i++) {
@@ -4420,7 +4411,7 @@ static AOM_INLINE void reset_ref_frame_map(AV1_COMMON *const cm) {
 
 // If the refresh_frame_flags bitmask is set, update reference frame id values
 // and mark frames as valid for reference.
-static AOM_INLINE void update_ref_frame_id(AV1Decoder *const pbi) {
+static inline void update_ref_frame_id(AV1Decoder *const pbi) {
   AV1_COMMON *const cm = &pbi->common;
   int refresh_frame_flags = cm->current_frame.refresh_frame_flags;
   for (int i = 0; i < REF_FRAMES; i++) {
@@ -4431,8 +4422,8 @@ static AOM_INLINE void update_ref_frame_id(AV1Decoder *const pbi) {
   }
 }
 
-static AOM_INLINE void show_existing_frame_reset(AV1Decoder *const pbi,
-                                                 int existing_frame_idx) {
+static inline void show_existing_frame_reset(AV1Decoder *const pbi,
+                                             int existing_frame_idx) {
   AV1_COMMON *const cm = &pbi->common;
 
   assert(cm->show_existing_frame);
@@ -5152,7 +5143,7 @@ BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) {
   return (BITSTREAM_PROFILE)profile;
}

-static AOM_INLINE void superres_post_decode(AV1Decoder *pbi) {
+static inline void superres_post_decode(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
 
@@ -5243,7 +5234,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
 }
 
 // Once-per-frame initialization
-static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) {
+static inline void setup_frame_info(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
 
   if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
diff --git a/av1/encoder/arm/shift_neon.h b/av1/encoder/arm/shift_neon.h
index a6fdbebc17..b0b9609397 100644
--- a/av1/encoder/arm/shift_neon.h
+++ b/av1/encoder/arm/shift_neon.h
@@ -14,14 +14,12 @@
 
 #include <arm_neon.h>
 
-#include "aom/aom_integer.h"  // For AOM_INLINE.
-
-#define SHIFT_LOOP_HELPER(name, type, intrinsic, arg)                \
-  static AOM_INLINE void name(const type *in, type *out, int size) { \
-    int i = 0;                                                       \
-    do {                                                             \
-      out[i] = intrinsic(in[i], arg);                                \
-    } while (++i < size);                                            \
+#define SHIFT_LOOP_HELPER(name, type, intrinsic, arg)            \
+  static inline void name(const type *in, type *out, int size) { \
+    int i = 0;                                                   \
+    do {                                                         \
+      out[i] = intrinsic(in[i], arg);                            \
+    } while (++i < size);                                        \
   }
 
 SHIFT_LOOP_HELPER(shift_left_2_s16_x4, int16x4_t, vshl_n_s16, 2)
diff --git a/av1/encoder/arm/txfm_neon.h b/av1/encoder/arm/txfm_neon.h
index 49ff87dc24..63bf0dd5b8 100644
--- a/av1/encoder/arm/txfm_neon.h
+++ b/av1/encoder/arm/txfm_neon.h
@@ -12,11 +12,11 @@
 #ifndef AOM_AV1_ENCODER_ARM_TXFM_NEON_H_
 #define AOM_AV1_ENCODER_ARM_TXFM_NEON_H_
 
-#include "aom/aom_integer.h"  // For AOM_INLINE.
+#include <stdint.h>
 
-static AOM_INLINE void ud_adjust_input_and_stride(int ud_flip,
-                                                  const int16_t **input,
-                                                  int *stride, int out_size) {
+static inline void ud_adjust_input_and_stride(int ud_flip,
+                                              const int16_t **input,
+                                              int *stride, int out_size) {
   if (ud_flip) {
     *input = *input + (out_size - 1) * *stride;
     *stride = -*stride;
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 8d4dd53de2..4b9f3f7d7b 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -66,26 +66,25 @@ static inline void write_uniform(aom_writer *w, int n, int v) {
 }
 
 #if !CONFIG_REALTIME_ONLY
-static AOM_INLINE void loop_restoration_write_sb_coeffs(
+static inline void loop_restoration_write_sb_coeffs(
     const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
     aom_writer *const w, int plane, FRAME_COUNTS *counts);
 #endif
 
-static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
-                                             const MB_MODE_INFO *mi,
-                                             const MB_MODE_INFO *above_mi,
-                                             const MB_MODE_INFO *left_mi,
-                                             PREDICTION_MODE mode,
-                                             aom_writer *w) {
+static inline void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+                                         const MB_MODE_INFO *mi,
+                                         const MB_MODE_INFO *above_mi,
+                                         const MB_MODE_INFO *left_mi,
+                                         PREDICTION_MODE mode, aom_writer *w) {
   assert(!is_intrabc_block(mi));
   (void)mi;
   aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
                    INTRA_MODES);
 }
 
-static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
-                                        FRAME_CONTEXT *ec_ctx,
-                                        const int16_t mode_ctx) {
+static inline void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+                                    FRAME_CONTEXT *ec_ctx,
+                                    const int16_t mode_ctx) {
   const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
 
   aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);
@@ -102,9 +101,10 @@ static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
   }
 }
 
-static AOM_INLINE void write_drl_idx(
-    FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
-    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
+static inline void write_drl_idx(FRAME_CONTEXT *ec_ctx,
+                                 const MB_MODE_INFO *mbmi,
+                                 const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame,
+                                 aom_writer *w) {
   assert(mbmi->ref_mv_idx < 3);
 
   const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
@@ -137,20 +137,19 @@ static AOM_INLINE void write_drl_idx(
   }
 }
 
-static AOM_INLINE void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
-                                                 PREDICTION_MODE mode,
-                                                 const int16_t mode_ctx) {
+static inline void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
+                                             PREDICTION_MODE mode,
+                                             const int16_t mode_ctx) {
   assert(is_inter_compound_mode(mode));
   aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode),
                    xd->tile_ctx->inter_compound_mode_cdf[mode_ctx],
                    INTER_COMPOUND_MODES);
 }
 
-static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd,
-                                           const MB_MODE_INFO *mbmi,
-                                           TX_SIZE tx_size, int depth,
-                                           int blk_row, int blk_col,
-                                           aom_writer *w) {
+static inline void write_tx_size_vartx(MACROBLOCKD *xd,
+                                       const MB_MODE_INFO *mbmi,
+                                       TX_SIZE tx_size, int depth, int blk_row,
+                                       int blk_col, aom_writer *w) {
   FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
   const int max_blocks_high = max_block_high(xd, mbmi->bsize, 0);
   const int max_blocks_wide = max_block_wide(xd, mbmi->bsize, 0);
@@ -200,8 +199,8 @@ static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd,
   }
 }
 
-static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd,
-                                              aom_writer *w) {
+static inline void write_selected_tx_size(const MACROBLOCKD *xd,
+                                          aom_writer *w) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->bsize;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
@@ -260,9 +259,9 @@ static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   return skip_mode;
 }
 
-static AOM_INLINE void write_is_inter(const AV1_COMMON *cm,
-                                      const MACROBLOCKD *xd, uint8_t segment_id,
-                                      aom_writer *w, const int is_inter) {
+static inline void write_is_inter(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                                  uint8_t segment_id, aom_writer *w,
+                                  const int is_inter) {
   if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
       assert(is_inter);
@@ -274,9 +273,8 @@ static AOM_INLINE void write_is_inter(const AV1_COMMON *cm,
   }
 }
 
-static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         const MB_MODE_INFO *mbmi,
-                                         aom_writer *w) {
+static inline void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                     const MB_MODE_INFO *mbmi, aom_writer *w) {
   MOTION_MODE last_motion_mode_allowed =
       cm->features.switchable_motion_mode
          ? motion_mode_allowed(cm->global_motion, xd, mbmi,
@@ -296,8 +294,8 @@ static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
   }
 }
 
-static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd,
-                                          int delta_qindex, aom_writer *w) {
+static inline void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex,
+                                      aom_writer *w) {
   int sign = delta_qindex < 0;
   int abs = sign ? -delta_qindex : delta_qindex;
   int rem_bits, thr;
@@ -318,10 +316,10 @@ static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd,
   }
 }
 
-static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm,
-                                           const MACROBLOCKD *xd, int lf_id,
-                                           int delta_lflevel,
-                                           int delta_lf_multi, aom_writer *w) {
+static inline void write_delta_lflevel(const AV1_COMMON *cm,
+                                       const MACROBLOCKD *xd, int lf_id,
+                                       int delta_lflevel, int delta_lf_multi,
+                                       aom_writer *w) {
   int sign = delta_lflevel < 0;
   int abs = sign ? -delta_lflevel : delta_lflevel;
   int rem_bits, thr;
@@ -350,8 +348,8 @@ static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm,
   }
 }
 
-static AOM_INLINE void pack_map_tokens(aom_writer *w, const TokenExtra **tp,
-                                       int n, int num, MapCdf map_pb_cdf) {
+static inline void pack_map_tokens(aom_writer *w, const TokenExtra **tp, int n,
+                                   int num, MapCdf map_pb_cdf) {
   const TokenExtra *p = *tp;
   const int palette_size_idx = n - PALETTE_MIN_SIZE;
   write_uniform(w, n, p->token);  // The first color index.
@@ -367,7 +365,7 @@ static AOM_INLINE void pack_map_tokens(aom_writer *w, const TokenExtra **tp, *tp = p; } -static AOM_INLINE void pack_txb_tokens( +static inline void pack_txb_tokens( aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TokenExtra **tp, const TokenExtra *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block, @@ -454,12 +452,12 @@ int av1_neg_interleave(int x, int ref, int max) { } } -static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, - const MB_MODE_INFO *const mbmi, - aom_writer *w, - const struct segmentation *seg, - struct segmentation_probs *segp, - int skip_txfm) { +static inline void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, + const MB_MODE_INFO *const mbmi, + aom_writer *w, + const struct segmentation *seg, + struct segmentation_probs *segp, + int skip_txfm) { if (!seg->enabled || !seg->update_map) return; AV1_COMMON *const cm = &cpi->common; @@ -496,8 +494,8 @@ static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2) // This function encodes the reference frame -static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm, - const MACROBLOCKD *xd, aom_writer *w) { +static inline void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, + aom_writer *w) { const MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_compound = has_second_ref(mbmi); const uint8_t segment_id = mbmi->segment_id; @@ -600,9 +598,10 @@ static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm, } } -static AOM_INLINE void write_filter_intra_mode_info( - const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, - aom_writer *w) { +static inline void write_filter_intra_mode_info(const AV1_COMMON *cm, + const MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + aom_writer *w) { if (av1_filter_intra_allowed(cm, mbmi)) { aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra, xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2); @@ -615,14 +614,14 @@ static AOM_INLINE void write_filter_intra_mode_info( } } -static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta, - aom_cdf_prob *cdf) { +static inline void write_angle_delta(aom_writer *w, int angle_delta, + aom_cdf_prob *cdf) { aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf, 2 * MAX_ANGLE_DELTA + 1); } -static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm, - ThreadData *td, aom_writer *w) { +static inline void write_mb_interp_filter(AV1_COMMON *const cm, ThreadData *td, + aom_writer *w) { const MACROBLOCKD *xd = &td->mb.e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; @@ -651,9 +650,9 @@ static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm, // Transmit color values with delta encoding. Write the first value as // literal, and the deltas between each value and the previous one. "min_val" is // the smallest possible value of the deltas. -static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num, - int bit_depth, int min_val, - aom_writer *w) { +static inline void delta_encode_palette_colors(const int *colors, int num, + int bit_depth, int min_val, + aom_writer *w) { if (num <= 0) return; assert(colors[0] < (1 << bit_depth)); aom_write_literal(w, colors[0], bit_depth); @@ -683,9 +682,9 @@ static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num, // Transmit luma palette color values. 
First signal if each color in the color // cache is used. Those colors that are not in the cache are transmitted with // delta encoding. -static AOM_INLINE void write_palette_colors_y( - const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, - int bit_depth, aom_writer *w) { +static inline void write_palette_colors_y(const MACROBLOCKD *const xd, + const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { const int n = pmi->palette_size[0]; uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 0, color_cache); @@ -707,9 +706,9 @@ static AOM_INLINE void write_palette_colors_y( // Write chroma palette color values. U channel is handled similarly to the luma // channel. For v channel, either use delta encoding or transmit raw values // directly, whichever costs less. -static AOM_INLINE void write_palette_colors_uv( - const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, - int bit_depth, aom_writer *w) { +static inline void write_palette_colors_uv(const MACROBLOCKD *const xd, + const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { const int n = pmi->palette_size[1]; const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; @@ -766,10 +765,10 @@ static AOM_INLINE void write_palette_colors_uv( } } -static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm, - const MACROBLOCKD *xd, - const MB_MODE_INFO *const mbmi, - aom_writer *w) { +static inline void write_palette_mode_info(const AV1_COMMON *cm, + const MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + aom_writer *w) { const int num_planes = av1_num_planes(cm); const BLOCK_SIZE bsize = mbmi->bsize; assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); @@ -845,26 +844,25 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, } } -static AOM_INLINE void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, - BLOCK_SIZE bsize, - PREDICTION_MODE mode, - aom_writer *w) { +static inline void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, + BLOCK_SIZE bsize, + PREDICTION_MODE mode, + aom_writer *w) { aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]], INTRA_MODES); } -static AOM_INLINE void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, - UV_PREDICTION_MODE uv_mode, - PREDICTION_MODE y_mode, - CFL_ALLOWED_TYPE cfl_allowed, - aom_writer *w) { +static inline void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, + UV_PREDICTION_MODE uv_mode, + PREDICTION_MODE y_mode, + CFL_ALLOWED_TYPE cfl_allowed, + aom_writer *w) { aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode], UV_INTRA_MODES - !cfl_allowed); } -static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, - uint8_t idx, int8_t joint_sign, - aom_writer *w) { +static inline void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, uint8_t idx, + int8_t joint_sign, aom_writer *w) { aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS); // Magnitudes are only signaled for nonzero codes. 
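The delta-coding scheme described above (first value as a literal, then deltas of at least min_val between sorted colors) can be sketched as below. This is a simplified illustration, not the delta_encode_palette_colors() internals; the in-tree routine manages its bit widths more carefully:

#include <assert.h>

/* Bits needed to represent values 0..n-1. */
static int toy_ceil_log2(int n) {
  int bits = 0;
  while ((1 << bits) < n) ++bits;
  return bits;
}

/* put() stands in for aom_write_literal(); colors must be sorted
 * ascending and at least min_val apart, as the writer assumes. */
static void toy_delta_encode(const int *colors, int num, int bit_depth,
                             int min_val, void (*put)(int v, int bits)) {
  if (num <= 0) return;
  put(colors[0], bit_depth);
  int prev = colors[0];
  for (int i = 1; i < num; ++i) {
    const int delta = colors[i] - prev - min_val;
    assert(delta >= 0);
    /* The remaining headroom shrinks as prev grows, so later deltas
     * can be sent with fewer bits. */
    const int max_delta = ((1 << bit_depth) - 1) - prev - min_val;
    put(delta, toy_ceil_log2(max_delta + 1));
    prev = colors[i];
  }
}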
if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { @@ -877,8 +875,8 @@ static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, } } -static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, - aom_writer *w, int skip) { +static inline void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, + aom_writer *w, int skip) { if (cm->features.coded_lossless || cm->features.allow_intrabc) return; // At the start of a superblock, mark that we haven't yet written CDEF @@ -917,10 +915,11 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, } } -static AOM_INLINE void write_inter_segment_id( - AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w, - const struct segmentation *const seg, struct segmentation_probs *const segp, - int skip, int preskip) { +static inline void write_inter_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, + aom_writer *w, + const struct segmentation *const seg, + struct segmentation_probs *const segp, + int skip, int preskip) { MB_MODE_INFO *const mbmi = xd->mi[0]; AV1_COMMON *const cm = &cpi->common; const int mi_row = xd->mi_row; @@ -956,9 +955,9 @@ static AOM_INLINE void write_inter_segment_id( // If delta q is present, writes delta_q index. // Also writes delta_q loop filter levels, if present. -static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm, - MACROBLOCKD *const xd, int skip, - aom_writer *w) { +static inline void write_delta_q_params(AV1_COMMON *const cm, + MACROBLOCKD *const xd, int skip, + aom_writer *w) { const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag) { @@ -999,10 +998,10 @@ static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm, } } -static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm, - MACROBLOCKD *const xd, - int is_keyframe, - aom_writer *w) { +static inline void write_intra_prediction_modes(const AV1_COMMON *cm, + MACROBLOCKD *const xd, + int is_keyframe, + aom_writer *w) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const MB_MODE_INFO *const mbmi = xd->mi[0]; const PREDICTION_MODE mode = mbmi->mode; @@ -1088,8 +1087,8 @@ static inline int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { x->mbmi_ext_frame); } -static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td, - aom_writer *w) { +static inline void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -1247,7 +1246,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td, } } -static AOM_INLINE void write_intrabc_info( +static inline void write_intrabc_info( MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { const MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -1263,7 +1262,7 @@ static AOM_INLINE void write_intrabc_info( } } -static AOM_INLINE void write_mb_modes_kf( +static inline void write_mb_modes_kf( AV1_COMP *cpi, MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; @@ -1293,7 +1292,7 @@ static AOM_INLINE void write_mb_modes_kf( } #if CONFIG_RD_DEBUG -static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) { +static inline void dump_mode_info(MB_MODE_INFO *mi) { printf("\nmi->mi_row == %d\n", mi->mi_row); printf("&& mi->mi_col == %d\n", mi->mi_col); printf("&& mi->bsize == %d\n", mi->bsize); @@ -1313,7 +1312,7 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, #endif #if 
ENC_MISMATCH_DEBUG -static AOM_INLINE void enc_dump_logs( +static inline void enc_dump_logs( const AV1_COMMON *const cm, const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) { const MB_MODE_INFO *const mbmi = *( @@ -1369,8 +1368,8 @@ static AOM_INLINE void enc_dump_logs( } #endif // ENC_MISMATCH_DEBUG -static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td, - aom_writer *w) { +static inline void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &td->mb.e_mbd; MB_MODE_INFO *m = xd->mi[0]; @@ -1391,7 +1390,7 @@ static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td, } } -static AOM_INLINE void write_inter_txb_coeff( +static inline void write_inter_txb_coeff( AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi, aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end, TOKEN_STATS *token_stats, const int row, const int col, int *block, @@ -1427,9 +1426,9 @@ static AOM_INLINE void write_inter_txb_coeff( } } -static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x, - aom_writer *w, const TokenExtra **tok, - const TokenExtra *const tok_end) { +static inline void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x, + aom_writer *w, const TokenExtra **tok, + const TokenExtra *const tok_end) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -1480,11 +1479,11 @@ static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x, } } -static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td, - const TileInfo *const tile, aom_writer *w, - const TokenExtra **tok, - const TokenExtra *const tok_end, - int mi_row, int mi_col) { +static inline void write_modes_b(AV1_COMP *cpi, ThreadData *const td, + const TileInfo *const tile, aom_writer *w, + const TokenExtra **tok, + const TokenExtra *const tok_end, int mi_row, + int mi_col) { const AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; MACROBLOCKD *xd = &td->mb.e_mbd; @@ -1568,10 +1567,10 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td, } } -static AOM_INLINE void write_partition(const AV1_COMMON *const cm, - const MACROBLOCKD *const xd, int hbs, - int mi_row, int mi_col, PARTITION_TYPE p, - BLOCK_SIZE bsize, aom_writer *w) { +static inline void write_partition(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, + int mi_row, int mi_col, PARTITION_TYPE p, + BLOCK_SIZE bsize, aom_writer *w) { const int is_partition_point = bsize >= BLOCK_8X8; if (!is_partition_point) return; @@ -1605,10 +1604,11 @@ static AOM_INLINE void write_partition(const AV1_COMMON *const cm, } } -static AOM_INLINE void write_modes_sb( - AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile, - aom_writer *const w, const TokenExtra **tok, - const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { +static inline void write_modes_sb(AV1_COMP *const cpi, ThreadData *const td, + const TileInfo *const tile, + aom_writer *const w, const TokenExtra **tok, + const TokenExtra *const tok_end, int mi_row, + int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; MACROBLOCKD *const xd = &td->mb.e_mbd; @@ -1711,11 +1711,11 @@ static AOM_INLINE void write_modes_sb( } // Populate token pointers appropriately based on token_info. 
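The token pointers populated by get_token_pointers() below delimit a half-open range, and the packing loops in write_tokens_b() above walk it. A schematic of the consumer side, assuming the tree's TokenExtra type:

/* Schematic consumer of a [*tok, tok_end) token slice; the real
 * packing turns each TokenExtra into bitstream symbols. */
static void consume_tokens(const TokenExtra **tok,
                           const TokenExtra *const tok_end) {
  while (*tok < tok_end) {
    /* ... pack one token into the bitstream ... */
    ++*tok;
  }
}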
-static AOM_INLINE void get_token_pointers(const TokenInfo *token_info, - const int tile_row, int tile_col, - const int sb_row_in_tile, - const TokenExtra **tok, - const TokenExtra **tok_end) { +static inline void get_token_pointers(const TokenInfo *token_info, + const int tile_row, int tile_col, + const int sb_row_in_tile, + const TokenExtra **tok, + const TokenExtra **tok_end) { if (!is_token_info_allocated(token_info)) { *tok = NULL; *tok_end = NULL; @@ -1726,10 +1726,9 @@ static AOM_INLINE void get_token_pointers(const TokenInfo *token_info, *tok + token_info->tplist[tile_row][tile_col][sb_row_in_tile].count; } -static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td, - const TileInfo *const tile, - aom_writer *const w, int tile_row, - int tile_col) { +static inline void write_modes(AV1_COMP *const cpi, ThreadData *const td, + const TileInfo *const tile, aom_writer *const w, + int tile_row, int tile_col) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &td->mb.e_mbd; const int mi_row_start = tile->mi_row_start; @@ -1770,8 +1769,8 @@ static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td, } } -static AOM_INLINE void encode_restoration_mode( - AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { +static inline void encode_restoration_mode(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { assert(!cm->features.all_lossless); if (!cm->seq_params->enable_restoration) return; if (cm->features.allow_intrabc) return; @@ -1843,10 +1842,10 @@ static AOM_INLINE void encode_restoration_mode( } #if !CONFIG_REALTIME_ONLY -static AOM_INLINE void write_wiener_filter(int wiener_win, - const WienerInfo *wiener_info, - WienerInfo *ref_wiener_info, - aom_writer *wb) { +static inline void write_wiener_filter(int wiener_win, + const WienerInfo *wiener_info, + WienerInfo *ref_wiener_info, + aom_writer *wb) { if (wiener_win == WIENER_WIN) aom_write_primitive_refsubexpfin( wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, @@ -1888,9 +1887,9 @@ static AOM_INLINE void write_wiener_filter(int wiener_win, memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); } -static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, - SgrprojInfo *ref_sgrproj_info, - aom_writer *wb) { +static inline void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info, + aom_writer *wb) { aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS); const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; @@ -1919,7 +1918,7 @@ static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); } -static AOM_INLINE void loop_restoration_write_sb_coeffs( +static inline void loop_restoration_write_sb_coeffs( const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx, aom_writer *const w, int plane, FRAME_COUNTS *counts) { const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx]; @@ -2026,8 +2025,8 @@ static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) { return false; } -static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm, - struct aom_write_bit_buffer *wb) { +static inline void encode_loopfilter(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { assert(!cm->features.coded_lossless); if (cm->features.allow_intrabc) return; const int num_planes = av1_num_planes(cm); @@ -2078,8 +2077,8 @@ static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm, } } -static AOM_INLINE void encode_cdef(const AV1_COMMON *cm, - struct 
aom_write_bit_buffer *wb) { +static inline void encode_cdef(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { assert(!cm->features.coded_lossless); if (!cm->seq_params->enable_cdef) return; if (cm->features.allow_intrabc) return; @@ -2096,8 +2095,7 @@ static AOM_INLINE void encode_cdef(const AV1_COMMON *cm, } } -static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb, - int delta_q) { +static inline void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) { if (delta_q != 0) { aom_wb_write_bit(wb, 1); aom_wb_write_inv_signed_literal(wb, delta_q, 6); @@ -2106,7 +2104,7 @@ static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb, } } -static AOM_INLINE void encode_quantization( +static inline void encode_quantization( const CommonQuantParams *const quant_params, int num_planes, bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) { aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS); @@ -2134,8 +2132,8 @@ static AOM_INLINE void encode_quantization( } } -static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, - struct aom_write_bit_buffer *wb) { +static inline void encode_segmentation(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { int i, j; struct segmentation *seg = &cm->seg; @@ -2172,16 +2170,16 @@ static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, } } -static AOM_INLINE void write_frame_interp_filter( - InterpFilter filter, struct aom_write_bit_buffer *wb) { +static inline void write_frame_interp_filter(InterpFilter filter, + struct aom_write_bit_buffer *wb) { aom_wb_write_bit(wb, filter == SWITCHABLE); if (filter != SWITCHABLE) aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS); } // Same function as write_uniform but writing to uncompressed header wb -static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, - int v) { +static inline void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, + int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; if (l == 0) return; @@ -2193,8 +2191,8 @@ static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, } } -static AOM_INLINE void write_tile_info_max_tile( - const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { +static inline void write_tile_info_max_tile(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { int width_sb = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); int height_sb = @@ -2242,9 +2240,9 @@ static AOM_INLINE void write_tile_info_max_tile( } } -static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm, - struct aom_write_bit_buffer *saved_wb, - struct aom_write_bit_buffer *wb) { +static inline void write_tile_info(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { write_tile_info_max_tile(cm, wb); *saved_wb = *wb; @@ -2256,9 +2254,9 @@ static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm, } } -static AOM_INLINE void write_ext_tile_info( - const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb, - struct aom_write_bit_buffer *wb) { +static inline void write_ext_tile_info(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { // This information is stored as a separate byte.
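wb_write_uniform() above (like write_uniform(), which it mirrors) emits a quasi-uniform code: for an alphabet of n symbols, with l = get_unsigned_bits(n) and m = (1 << l) - n, the first m values take l - 1 bits and the rest take l bits. For n = 6 the codewords are 00, 01, 100, 101, 110, 111. A decoder-side sketch, with read_bits() standing in for the real bit reader:

/* Sketch of the matching read side; not the in-tree reader. */
static int toy_read_uniform(int n, int (*read_bits)(int nbits)) {
  int l = 0;
  while ((1 << l) < n) ++l; /* smallest l with (1 << l) >= n */
  const int m = (1 << l) - n;
  if (l == 0) return 0;     /* n == 1: nothing was written */
  const int first = read_bits(l - 1);
  if (first < m) return first;            /* short codeword */
  return (first << 1) + read_bits(1) - m; /* long codeword */
}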
int mod = wb->bit_offset % CHAR_BIT; if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod); @@ -2326,8 +2324,8 @@ static inline int find_identical_tile( return 0; } -static AOM_INLINE void write_render_size(const AV1_COMMON *cm, - struct aom_write_bit_buffer *wb) { +static inline void write_render_size(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { const int scaling_active = av1_resize_scaled(cm); aom_wb_write_bit(wb, scaling_active); if (scaling_active) { @@ -2336,8 +2334,8 @@ static AOM_INLINE void write_render_size(const AV1_COMMON *cm, } } -static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm, - struct aom_write_bit_buffer *wb) { +static inline void write_superres_scale(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { const SequenceHeader *const seq_params = cm->seq_params; if (!seq_params->enable_superres) { assert(cm->superres_scale_denominator == SCALE_NUMERATOR); @@ -2358,9 +2356,9 @@ static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm, } } -static AOM_INLINE void write_frame_size(const AV1_COMMON *cm, - int frame_size_override, - struct aom_write_bit_buffer *wb) { +static inline void write_frame_size(const AV1_COMMON *cm, + int frame_size_override, + struct aom_write_bit_buffer *wb) { const int coded_width = cm->superres_upscaled_width - 1; const int coded_height = cm->superres_upscaled_height - 1; @@ -2376,8 +2374,8 @@ static AOM_INLINE void write_frame_size(const AV1_COMMON *cm, write_render_size(cm, wb); } -static AOM_INLINE void write_frame_size_with_refs( - const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { +static inline void write_frame_size_with_refs(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { int found = 0; MV_REFERENCE_FRAME ref_frame; @@ -2403,14 +2401,14 @@ static AOM_INLINE void write_frame_size_with_refs( } } -static AOM_INLINE void write_profile(BITSTREAM_PROFILE profile, - struct aom_write_bit_buffer *wb) { +static inline void write_profile(BITSTREAM_PROFILE profile, + struct aom_write_bit_buffer *wb) { assert(profile >= PROFILE_0 && profile < MAX_PROFILES); aom_wb_write_literal(wb, profile, PROFILE_BITS); } -static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params, - struct aom_write_bit_buffer *wb) { +static inline void write_bitdepth(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { // Profile 0/1: [0] for 8 bit, [1] 10-bit // Profile 2: [0] for 8 bit, [10] 10-bit, [11] - 12-bit aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 
0 : 1); @@ -2419,8 +2417,8 @@ static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params, } } -static AOM_INLINE void write_color_config( - const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { +static inline void write_color_config(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { write_bitdepth(seq_params, wb); const int is_monochrome = seq_params->monochrome; // monochrome bit @@ -2485,7 +2483,7 @@ static AOM_INLINE void write_color_config( aom_wb_write_bit(wb, seq_params->separate_uv_delta_q); } -static AOM_INLINE void write_timing_info_header( +static inline void write_timing_info_header( const aom_timing_info_t *const timing_info, struct aom_write_bit_buffer *wb) { aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick, 32); @@ -2496,7 +2494,7 @@ static AOM_INLINE void write_timing_info_header( } } -static AOM_INLINE void write_decoder_model_info( +static inline void write_decoder_model_info( const aom_dec_model_info_t *const decoder_model_info, struct aom_write_bit_buffer *wb) { aom_wb_write_literal( @@ -2509,7 +2507,7 @@ static AOM_INLINE void write_decoder_model_info( wb, decoder_model_info->frame_presentation_time_length - 1, 5); } -static AOM_INLINE void write_dec_model_op_parameters( +static inline void write_dec_model_op_parameters( const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length, struct aom_write_bit_buffer *wb) { aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay, @@ -2519,15 +2517,15 @@ static AOM_INLINE void write_dec_model_op_parameters( aom_wb_write_bit(wb, op_params->low_delay_mode_flag); } -static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm, - struct aom_write_bit_buffer *wb) { +static inline void write_tu_pts_info(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { aom_wb_write_unsigned_literal( wb, cm->frame_presentation_time, cm->seq_params->decoder_model_info.frame_presentation_time_length); } -static AOM_INLINE void write_film_grain_params( - const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) { +static inline void write_film_grain_params(const AV1_COMP *const cpi, + struct aom_write_bit_buffer *wb) { const AV1_COMMON *const cm = &cpi->common; const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params; aom_wb_write_bit(wb, pars->apply_grain); @@ -2630,8 +2628,8 @@ static AOM_INLINE void write_film_grain_params( aom_wb_write_bit(wb, pars->clip_to_restricted_range); } -static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params, - struct aom_write_bit_buffer *wb) { +static inline void write_sb_size(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { (void)seq_params; (void)wb; assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]); @@ -2641,8 +2639,8 @@ static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params, aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 
1 : 0); } -static AOM_INLINE void write_sequence_header( - const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { +static inline void write_sequence_header(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4); aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4); aom_wb_write_literal(wb, seq_params->max_frame_width - 1, @@ -2707,7 +2705,7 @@ static AOM_INLINE void write_sequence_header( aom_wb_write_bit(wb, seq_params->enable_restoration); } -static AOM_INLINE void write_global_motion_params( +static inline void write_global_motion_params( const WarpedMotionParams *params, const WarpedMotionParams *ref_params, struct aom_write_bit_buffer *wb, int allow_hp) { const TransformationType type = params->wmtype; @@ -2766,8 +2764,8 @@ static AOM_INLINE void write_global_motion_params( } } -static AOM_INLINE void write_global_motion(AV1_COMP *cpi, - struct aom_write_bit_buffer *wb) { +static inline void write_global_motion(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; int frame; for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { @@ -2894,7 +2892,7 @@ static int check_frame_refs_short_signaling(AV1_COMMON *const cm, } // New function based on HLS R18 -static AOM_INLINE void write_uncompressed_header_obu( +static inline void write_uncompressed_header_obu( AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; @@ -3239,8 +3237,8 @@ static int choose_size_bytes(uint32_t size, int spare_msbs) { return 1; } -static AOM_INLINE void mem_put_varsize(uint8_t *const dst, const int sz, - const int val) { +static inline void mem_put_varsize(uint8_t *const dst, const int sz, + const int val) { switch (sz) { case 1: dst[0] = (uint8_t)(val & 0xff); break; case 2: mem_put_le16(dst, val); break; @@ -3430,7 +3428,7 @@ static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size, return length_field_size; } -static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) { +static inline void add_trailing_bits(struct aom_write_bit_buffer *wb) { if (aom_wb_is_byte_aligned(wb)) { aom_wb_write_literal(wb, 0x80, 8); } else { @@ -3439,8 +3437,8 @@ static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) { } } -static AOM_INLINE void write_bitstream_level(AV1_LEVEL seq_level_idx, - struct aom_write_bit_buffer *wb) { +static inline void write_bitstream_level(AV1_LEVEL seq_level_idx, + struct aom_write_bit_buffer *wb) { assert(is_valid_seq_level_idx(seq_level_idx)); aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); } diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c index 5e6b923f64..ae4d2ed9ec 100644 --- a/av1/encoder/compound_type.c +++ b/av1/encoder/compound_type.c @@ -428,7 +428,7 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, return rd; } -static AOM_INLINE void get_inter_predictors_masked_compound( +static inline void get_inter_predictors_masked_compound( MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides) { MACROBLOCKD *xd = &x->e_mbd; @@ -506,9 +506,9 @@ static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, } // Computes the rd_threshold for smooth interintra rd search. 
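The mem_put_varsize() switch a few hunks above dispatches on the byte count chosen by choose_size_bytes(). For reference, a generic little-endian equivalent (a sketch, not shared code from the tree):

#include <stdint.h>

/* Stores val in exactly sz little-endian bytes (1 <= sz <= 4),
 * matching what the mem_put_le16/24/32 dispatch accomplishes. */
static void put_le_bytes(uint8_t *dst, int sz, uint32_t val) {
  for (int i = 0; i < sz; ++i) dst[i] = (uint8_t)(val >> (8 * i));
}

For example, put_le_bytes(buf, 3, 0x012345) stores 0x45, 0x23, 0x01.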
-static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x, - int total_mode_rate, - int64_t ref_best_rd) { +static inline int64_t compute_rd_thresh(MACROBLOCK *const x, + int total_mode_rate, + int64_t ref_best_rd) { const int64_t rd_thresh = get_rd_thresh_from_best_rd( ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT), INTER_INTRA_RD_THRESH_SCALE); @@ -517,7 +517,7 @@ static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x, } // Computes the best wedge interintra mode -static AOM_INLINE int64_t compute_best_wedge_interintra( +static inline int64_t compute_best_wedge_interintra( const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, MACROBLOCK *const x, const int *const interintra_mode_cost, const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_, diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h index 83e5b59fad..6b6d851106 100644 --- a/av1/encoder/context_tree.h +++ b/av1/encoder/context_tree.h @@ -123,8 +123,8 @@ static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = { BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128, }; -static AOM_INLINE int av1_get_pc_tree_nodes(const int is_sb_size_128, - int stat_generation_stage) { +static inline int av1_get_pc_tree_nodes(const int is_sb_size_128, + int stat_generation_stage) { const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0; const int tree_nodes = stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1); diff --git a/av1/encoder/encode_strategy.h b/av1/encoder/encode_strategy.h index 2b72edcefa..79615bec75 100644 --- a/av1/encoder/encode_strategy.h +++ b/av1/encoder/encode_strategy.h @@ -94,7 +94,7 @@ int is_forced_keyframe_pending(struct lookahead_ctx *lookahead, const int up_to_index, const COMPRESSOR_STAGE compressor_stage); -static AOM_INLINE int is_frame_droppable( +static inline int is_frame_droppable( const RTC_REF *const rtc_ref, const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) { // Droppable frame is only used by external refresh flags. VoD setting won't @@ -111,7 +111,7 @@ static AOM_INLINE int is_frame_droppable( return 0; } -static AOM_INLINE int get_current_frame_ref_type(const AV1_COMP *const cpi) { +static inline int get_current_frame_ref_type(const AV1_COMP *const cpi) { // We choose the reference "type" of this frame from the flags which indicate // which reference frames will be refreshed by it. More than one of these // flags may be set, so the order here implies an order of precedence. This is diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index 80aa2d125c..1c0a8fba62 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -239,10 +239,10 @@ void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, * \remark No return value but updates macroblock and thread data * related to the q / q delta to be used. 
*/ -static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, - MACROBLOCK *const x, - const TileInfo *const tile_info, - int mi_row, int mi_col, int num_planes) { +static inline void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, + MACROBLOCK *const x, + const TileInfo *const tile_info, int mi_row, + int mi_col, int num_planes) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const DeltaQInfo *const delta_q_info = &cm->delta_q_info; @@ -434,8 +434,8 @@ static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, } } -static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x, - int mi_row, int mi_col) { +static inline void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col) { const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; const int orig_rdmult = cpi->rd.RDMULT; @@ -512,10 +512,10 @@ static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile, * rd-based searches are allowed to adjust the initial pattern. It is only used * by realtime encoding. */ -static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, - TileDataEnc *tile_data, TokenExtra **tp, - const int mi_row, const int mi_col, - const int seg_skip) { +static inline void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + const int mi_row, const int mi_col, + const int seg_skip) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; const SPEED_FEATURES *const sf = &cpi->sf; @@ -765,10 +765,10 @@ static int sb_qp_sweep(AV1_COMP *const cpi, ThreadData *td, * Conducts partition search for a superblock, based on rate-distortion costs, * from scratch or adjusting from a pre-calculated partition pattern. */ -static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, - TileDataEnc *tile_data, TokenExtra **tp, - const int mi_row, const int mi_col, - const int seg_skip) { +static inline void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + const int mi_row, const int mi_col, + const int seg_skip) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -961,7 +961,7 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, } // Check if the cost update of symbols mode, coeff and dv are tile or off. -static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off( +static inline int is_mode_coeff_dv_upd_freq_tile_or_off( const AV1_COMP *const cpi) { const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf; @@ -974,7 +974,7 @@ static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off( // processing of current SB can start even before processing of top-right SB // is finished. This function checks if it is sufficient to wait for top SB // to finish processing before current SB starts processing. 
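The top vs. top-right distinction above matters for row-based multithreading. Conceptually (this is a schematic, not the in-tree row-mt synchronization code), a thread encoding a superblock column waits on the row above like so:

/* top_row_progress: last superblock column finished by the row above.
 * delay 1 waits for the top-right SB; delay 0 waits only for the top
 * SB, letting this row start sooner when top-right context can be
 * given up. */
static void wait_for_above(const volatile int *top_row_progress,
                           int sb_col, int delay) {
  while (*top_row_progress < sb_col + delay) {
    /* spin or block on a condition variable until the row above has
     * advanced far enough */
  }
}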
-static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) { +static inline int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) { const MODE mode = cpi->oxcf.mode; if (mode == GOOD) return 0; @@ -993,8 +993,8 @@ static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) { * \callgraph * \callergraph */ -static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row, - int mi_col) { +static inline uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row, + int mi_col) { if (cpi->src_sad_blk_64x64 == NULL) return UINT64_MAX; const AV1_COMMON *const cm = &cpi->common; @@ -1035,9 +1035,9 @@ static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row, * \callgraph * \callergraph */ -static AOM_INLINE bool is_calc_src_content_needed(AV1_COMP *cpi, - MACROBLOCK *const x, - int mi_row, int mi_col) { +static inline bool is_calc_src_content_needed(AV1_COMP *cpi, + MACROBLOCK *const x, int mi_row, + int mi_col) { if (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) return true; const uint64_t curr_sb_sad = get_sb_source_sad(cpi, mi_row, mi_col); @@ -1086,10 +1086,9 @@ static AOM_INLINE bool is_calc_src_content_needed(AV1_COMP *cpi, * \callergraph */ // TODO(any): consolidate sfs to make interface cleaner -static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi, - MACROBLOCK *const x, - TileDataEnc *tile_data, - int mi_row, int mi_col) { +static inline void grade_source_content_sb(AV1_COMP *cpi, MACROBLOCK *const x, + TileDataEnc *tile_data, int mi_row, + int mi_col) { AV1_COMMON *const cm = &cpi->common; if (cm->current_frame.frame_type == KEY_FRAME || (cpi->ppi->use_svc && @@ -1125,9 +1124,9 @@ static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi, * Do partition and mode search for an sb row: one row of superblocks filling up * the width of the current tile. */ -static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, - TileDataEnc *tile_data, int mi_row, - TokenExtra **tp) { +static inline void encode_sb_row(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, int mi_row, + TokenExtra **tp) { AV1_COMMON *const cm = &cpi->common; const TileInfo *const tile_info = &tile_data->tile_info; MultiThreadInfo *const mt_info = &cpi->mt_info; @@ -1272,7 +1271,7 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, #endif } -static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) { +static inline void init_encode_frame_mb_context(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &cpi->td.mb; @@ -1377,9 +1376,9 @@ void av1_init_tile_data(AV1_COMP *cpi) { } // Populate the start palette token info prior to encoding an SB row. -static AOM_INLINE void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info, - int tile_row, int tile_col, int mi_row, - TokenExtra **tp) { +static inline void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info, + int tile_row, int tile_col, int mi_row, + TokenExtra **tp) { const TokenInfo *token_info = &cpi->token_info; if (!is_token_info_allocated(token_info)) return; @@ -1396,10 +1395,10 @@ static AOM_INLINE void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info, } // Populate the token count after encoding an SB row. 
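get_token_start() above and populate_token_count() below bracket each SB row's encoding. Stripped of the cpi plumbing, the bookkeeping amounts to the following (field names follow the tplist entries seen earlier; TokenExtra is the tree's token type), the producer-side counterpart of the consumer walk sketched before:

/* Record where a row's tokens begin, then how many were produced, so
 * the bitstream packer can later walk [start, start + count). */
typedef struct {
  TokenExtra *start;
  unsigned int count;
} TokenRow;

static void begin_sb_row(TokenRow *row, TokenExtra *tok) {
  row->start = tok;
}

static void end_sb_row(TokenRow *row, const TokenExtra *tok_after) {
  row->count = (unsigned int)(tok_after - row->start);
}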
-static AOM_INLINE void populate_token_count(AV1_COMP *cpi, - const TileInfo *tile_info, - int tile_row, int tile_col, - int mi_row, TokenExtra *tok) { +static inline void populate_token_count(AV1_COMP *cpi, + const TileInfo *tile_info, int tile_row, + int tile_col, int mi_row, + TokenExtra *tok) { const TokenInfo *token_info = &cpi->token_info; if (!is_token_info_allocated(token_info)) return; @@ -1483,7 +1482,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, * * \param[in] cpi Top-level encoder structure */ -static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { +static inline void encode_tiles(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; @@ -1523,7 +1522,7 @@ static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { } // Set the relative distance of a reference frame w.r.t. current frame -static AOM_INLINE void set_rel_frame_dist( +static inline void set_rel_frame_dist( const AV1_COMMON *const cm, RefFrameDistanceInfo *const ref_frame_dist_info, const int ref_frame_flags) { MV_REFERENCE_FRAME ref_frame; @@ -1615,7 +1614,7 @@ static int check_skip_mode_enabled(AV1_COMP *const cpi) { return 1; } -static AOM_INLINE void set_default_interp_skip_flags( +static inline void set_default_interp_skip_flags( const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) { const int num_planes = av1_num_planes(cm); interp_search_flags->default_interp_skip_flags = @@ -1623,7 +1622,7 @@ static AOM_INLINE void set_default_interp_skip_flags( : INTERP_SKIP_LUMA_SKIP_CHROMA; } -static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) { +static inline void setup_prune_ref_frame_mask(AV1_COMP *cpi) { if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp || cpi->sf.inter_sf.disable_onesided_comp) && cpi->all_one_sided_refs) { @@ -1758,7 +1757,7 @@ static void free_block_hash_buffers(uint32_t *block_hash_values[2][2], * * \ingroup high_level_algo */ -static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { +static inline void encode_frame_internal(AV1_COMP *cpi) { ThreadData *const td = &cpi->td; MACROBLOCK *const x = &td->mb; AV1_COMMON *const cm = &cpi->common; diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c index 4201728f63..f66cdcc135 100644 --- a/av1/encoder/encodeframe_utils.c +++ b/av1/encoder/encodeframe_utils.c @@ -217,9 +217,9 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, } #endif // !CONFIG_REALTIME_ONLY -static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts, - const MACROBLOCKD *xd, - const MB_MODE_INFO *mbmi) { +static inline void update_filter_type_count(FRAME_COUNTS *counts, + const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { int dir; for (dir = 0; dir < 2; ++dir) { const int ctx = av1_get_pred_context_switchable_interp(xd, dir); diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h index 477ed6f25f..6b6efac297 100644 --- a/av1/encoder/encodeframe_utils.h +++ b/av1/encoder/encodeframe_utils.h @@ -243,14 +243,14 @@ typedef struct { #endif // CONFIG_COLLECT_PARTITION_STATS } PartitionSearchState; -static AOM_INLINE void av1_disable_square_split_partition( +static inline void av1_disable_square_split_partition( PartitionSearchState *part_state) { part_state->do_square_split = 0; } // Disables all possible rectangular splits. This includes PARTITION_AB4 as they // depend on the corresponding partition_rect_allowed. 
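Taken together, the partition-state helpers in this header compose into the two common restrictions; a short usage sketch:

/* Usage sketch for the helpers above and below. */
static void keep_only_partition_none(PartitionSearchState *part_state) {
  /* partition_none_allowed stays as-is; every split option is dropped. */
  av1_disable_all_splits(part_state);
}

static void force_recursive_split(PartitionSearchState *part_state) {
  /* Opposite restriction: only square splits remain; PARTITION_NONE
   * is ruled out. */
  av1_set_square_split_only(part_state);
}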
-static AOM_INLINE void av1_disable_rect_partitions( +static inline void av1_disable_rect_partitions( PartitionSearchState *part_state) { part_state->do_rectangular_split = 0; part_state->partition_rect_allowed[HORZ] = 0; @@ -258,25 +258,23 @@ static AOM_INLINE void av1_disable_rect_partitions( } // Disables all possible splits so that only PARTITION_NONE *might* be allowed. -static AOM_INLINE void av1_disable_all_splits( - PartitionSearchState *part_state) { +static inline void av1_disable_all_splits(PartitionSearchState *part_state) { av1_disable_square_split_partition(part_state); av1_disable_rect_partitions(part_state); } -static AOM_INLINE void av1_set_square_split_only( - PartitionSearchState *part_state) { +static inline void av1_set_square_split_only(PartitionSearchState *part_state) { part_state->partition_none_allowed = 0; part_state->do_square_split = 1; av1_disable_rect_partitions(part_state); } -static AOM_INLINE bool av1_blk_has_rows_and_cols( +static inline bool av1_blk_has_rows_and_cols( const PartitionBlkParams *blk_params) { return blk_params->has_rows && blk_params->has_cols; } -static AOM_INLINE bool av1_is_whole_blk_in_frame( +static inline bool av1_is_whole_blk_in_frame( const PartitionBlkParams *blk_params, const CommonModeInfoParams *mi_params) { const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; @@ -285,9 +283,9 @@ static AOM_INLINE bool av1_is_whole_blk_in_frame( mi_col + mi_size_wide[bsize] <= mi_params->mi_cols; } -static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd, - const MB_MODE_INFO *mbmi, - int dual_filter) { +static inline void update_filter_type_cdf(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + int dual_filter) { for (int dir = 0; dir < 2; ++dir) { if (dir && !dual_filter) break; const int ctx = av1_get_pred_context_switchable_interp(xd, dir); @@ -297,8 +295,8 @@ static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd, } } -static AOM_INLINE int set_rdmult(const AV1_COMP *const cpi, - const MACROBLOCK *const x, int segment_id) { +static inline int set_rdmult(const AV1_COMP *const cpi, + const MACROBLOCK *const x, int segment_id) { const AV1_COMMON *const cm = &cpi->common; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const CommonQuantParams *quant_params = &cm->quant_params; @@ -322,13 +320,13 @@ static AOM_INLINE int set_rdmult(const AV1_COMP *const cpi, cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi)); } -static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) { +static inline int do_split_check(BLOCK_SIZE bsize) { return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32); } #if !CONFIG_REALTIME_ONLY -static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, - int frm) { +static inline const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, + int frm) { assert(frm >= 0); if (frm < 0 || p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) { @@ -432,8 +430,7 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes); -static AOM_INLINE void av1_dealloc_mb_data(struct macroblock *mb, - int num_planes) { +static inline void av1_dealloc_mb_data(struct macroblock *mb, int num_planes) { aom_free(mb->txfm_search_info.mb_rd_record); mb->txfm_search_info.mb_rd_record = NULL; @@ -452,8 +449,8 @@ static AOM_INLINE void av1_dealloc_mb_data(struct macroblock *mb, mb->dqcoeff_buf = NULL; } -static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi, - struct 
macroblock *mb) { +static inline void allocate_winner_mode_stats(const AV1_COMP *cpi, + struct macroblock *mb) { const SPEED_FEATURES *sf = &cpi->sf; // The winner_mode_stats buffer is not required in these cases. if (is_stat_generation_stage(cpi) || @@ -471,8 +468,8 @@ static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi, void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb); -static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi, - struct macroblock *mb) { +static inline void av1_alloc_mb_data(const AV1_COMP *cpi, + struct macroblock *mb) { const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *sf = &cpi->sf; if (!sf->rt_sf.use_nonrd_pick_mode) { @@ -504,7 +501,7 @@ static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi, // This function will compute the number of reference frames to be disabled // based on the selective_ref_frame speed feature. -static AOM_INLINE unsigned int get_num_refs_to_disable( +static inline unsigned int get_num_refs_to_disable( const AV1_COMP *cpi, const int *ref_frame_flags, const unsigned int *ref_display_order_hint, unsigned int cur_frame_display_index) { @@ -553,7 +550,7 @@ static inline int get_max_allowed_ref_frames( // Enforce the number of references for each arbitrary frame based on user // options and speed. -static AOM_INLINE void enforce_max_ref_frames( +static inline void enforce_max_ref_frames( AV1_COMP *cpi, int *ref_frame_flags, const unsigned int *ref_display_order_hint, unsigned int cur_frame_display_index) { diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index f845f395cc..71c6e27bfa 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -3483,7 +3483,7 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, // Conditions to disable cdf_update mode in selective mode for real-time. // Handle case for layers, scene change, and resizing. -static AOM_INLINE int selective_disable_cdf_rtc(const AV1_COMP *cpi) { +static inline int selective_disable_cdf_rtc(const AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; // For single layer. @@ -4448,7 +4448,7 @@ void print_internal_stats(AV1_PRIMARY *ppi) { } #endif // CONFIG_INTERNAL_STATS -static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) { +static inline void update_keyframe_counters(AV1_COMP *cpi) { if (cpi->common.show_frame && cpi->rc.frames_to_key) { #if !CONFIG_REALTIME_ONLY FIRSTPASS_INFO *firstpass_info = &cpi->ppi->twopass.firstpass_info; @@ -4469,7 +4469,7 @@ static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) { } } -static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) { +static inline void update_frames_till_gf_update(AV1_COMP *cpi) { // TODO(weitinglin): Updating this counter for is_frame_droppable // is a work-around to handle the condition when a frame is dropped. // We should fix the cpi->common.show_frame flag @@ -4482,7 +4482,7 @@ static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) { } } -static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) { +static inline void update_gf_group_index(AV1_COMP *cpi) { // Increment the gf group index ready for the next frame.
if (is_one_pass_rt_params(cpi) && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { @@ -4553,7 +4553,7 @@ static void update_end_of_frame_stats(AV1_COMP *cpi) { } // Updates frame level stats related to global motion -static AOM_INLINE void update_gm_stats(AV1_COMP *cpi) { +static inline void update_gm_stats(AV1_COMP *cpi) { FRAME_UPDATE_TYPE update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; int i, is_gm_present = 0; diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h index ef2d53ad8f..a50188c8ff 100644 --- a/av1/encoder/encoder.h +++ b/av1/encoder/encoder.h @@ -3946,7 +3946,7 @@ static inline void init_ref_map_pair( } #if CONFIG_FPMT_TEST -static AOM_INLINE void calc_frame_data_update_flag( +static inline void calc_frame_data_update_flag( GF_GROUP *const gf_group, int gf_frame_index, bool *const do_frame_data_update) { *do_frame_data_update = true; @@ -4050,7 +4050,7 @@ static inline int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) { return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf; } -static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) { +static inline int can_disable_altref(const GFConfig *gf_cfg) { return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) && (gf_cfg->gf_min_pyr_height == 0); } @@ -4080,7 +4080,7 @@ static inline int is_stat_consumption_stage(const AV1_COMP *const cpi) { } // Decide whether 'dv_costs' need to be allocated/stored during the encoding. -static AOM_INLINE bool av1_need_dv_costs(const AV1_COMP *const cpi) { +static inline bool av1_need_dv_costs(const AV1_COMP *const cpi) { return !cpi->sf.rt_sf.use_nonrd_pick_mode && av1_allow_intrabc(&cpi->common) && !is_stat_generation_stage(cpi); } @@ -4324,7 +4324,7 @@ static inline int av1_pixels_to_mi(int pixels) { return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2; } -static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) { +static inline int is_psnr_calc_enabled(const AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) && diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h index 23e07db755..eb9e842019 100644 --- a/av1/encoder/encoder_alloc.h +++ b/av1/encoder/encoder_alloc.h @@ -25,14 +25,14 @@ extern "C" { #endif -static AOM_INLINE void dealloc_context_buffers_ext( +static inline void dealloc_context_buffers_ext( MBMIExtFrameBufferInfo *mbmi_ext_info) { aom_free(mbmi_ext_info->frame_base); mbmi_ext_info->frame_base = NULL; mbmi_ext_info->alloc_size = 0; } -static AOM_INLINE void alloc_context_buffers_ext( +static inline void alloc_context_buffers_ext( AV1_COMMON *cm, MBMIExtFrameBufferInfo *mbmi_ext_info) { const CommonModeInfoParams *const mi_params = &cm->mi_params; @@ -55,7 +55,7 @@ static AOM_INLINE void alloc_context_buffers_ext( mbmi_ext_info->stride = mi_alloc_cols; } -static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) { +static inline void alloc_compressor_data(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CommonModeInfoParams *const mi_params = &cm->mi_params; @@ -90,7 +90,7 @@ static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) { // Allocate mbmi buffers which are used to store mode information at block // level. 
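dealloc_context_buffers_ext() above and the release_* helpers below all follow the same free-then-clear idiom. In minimal form (a toy struct for illustration; aom_free() is the tree's NULL-safe free wrapper, and av1_zero() plays the role of the memset):

#include <string.h>

typedef struct {
  int *wsrc;
  int *mask;
} ToyBuffers;

static void release_toy_buffers(ToyBuffers *bufs) {
  aom_free(bufs->wsrc);
  aom_free(bufs->mask);
  /* Clearing the struct leaves every pointer NULL on the platforms
   * libaom targets, so a second release call is harmless. */
  memset(bufs, 0, sizeof(*bufs));
}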
-static AOM_INLINE void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) { +static inline void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; if (av1_alloc_context_buffers(cm, cm->width, cm->height, cpi->sf.part_sf.default_min_partition_size)) { @@ -102,7 +102,7 @@ static AOM_INLINE void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) { alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); } -static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) { +static inline void realloc_segmentation_maps(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; CommonModeInfoParams *const mi_params = &cm->mi_params; @@ -123,8 +123,8 @@ static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) { aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1)); } -static AOM_INLINE void alloc_obmc_buffers( - OBMCBuffer *obmc_buffer, struct aom_internal_error_info *error) { +static inline void alloc_obmc_buffers(OBMCBuffer *obmc_buffer, + struct aom_internal_error_info *error) { AOM_CHECK_MEM_ERROR( error, obmc_buffer->wsrc, (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc))); @@ -141,7 +141,7 @@ static AOM_INLINE void alloc_obmc_buffers( 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred))); } -static AOM_INLINE void release_obmc_buffers(OBMCBuffer *obmc_buffer) { +static inline void release_obmc_buffers(OBMCBuffer *obmc_buffer) { aom_free(obmc_buffer->mask); aom_free(obmc_buffer->above_pred); aom_free(obmc_buffer->left_pred); @@ -153,7 +153,7 @@ static AOM_INLINE void release_obmc_buffers(OBMCBuffer *obmc_buffer) { obmc_buffer->wsrc = NULL; } -static AOM_INLINE void alloc_compound_type_rd_buffers( +static inline void alloc_compound_type_rd_buffers( struct aom_internal_error_info *error, CompoundTypeRdBuffers *const bufs) { AOM_CHECK_MEM_ERROR( error, bufs->pred0, @@ -172,7 +172,7 @@ static AOM_INLINE void alloc_compound_type_rd_buffers( sizeof(*bufs->tmp_best_mask_buf))); } -static AOM_INLINE void release_compound_type_rd_buffers( +static inline void release_compound_type_rd_buffers( CompoundTypeRdBuffers *const bufs) { aom_free(bufs->pred0); aom_free(bufs->pred1); @@ -182,7 +182,7 @@ static AOM_INLINE void release_compound_type_rd_buffers( av1_zero(*bufs); // Set all pointers to NULL for safety. 
} -static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { +static inline void dealloc_compressor_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; TokenInfo *token_info = &cpi->token_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; @@ -376,7 +376,7 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { cpi->mb_delta_q = NULL; } -static AOM_INLINE void allocate_gradient_info_for_hog(AV1_COMP *cpi) { +static inline void allocate_gradient_info_for_hog(AV1_COMP *cpi) { if (!is_gradient_caching_for_hog_enabled(cpi)) return; PixelLevelGradientInfo *pixel_gradient_info = cpi->td.pixel_gradient_info; @@ -392,7 +392,7 @@ static AOM_INLINE void allocate_gradient_info_for_hog(AV1_COMP *cpi) { cpi->td.mb.pixel_gradient_info = pixel_gradient_info; } -static AOM_INLINE void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) { +static inline void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) { if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return; Block4x4VarInfo *source_variance_info = @@ -409,7 +409,7 @@ static AOM_INLINE void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) { cpi->td.mb.src_var_info_of_4x4_sub_blocks = source_variance_info; } -static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) { +static inline void variance_partition_alloc(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4; if (cpi->td.vt64x64) { @@ -425,8 +425,9 @@ static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) { } } -static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source( - AV1_COMP *cpi, int scaled_width, int scaled_height) { +static inline YV12_BUFFER_CONFIG *realloc_and_scale_source(AV1_COMP *cpi, + int scaled_width, + int scaled_height) { AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); @@ -453,7 +454,7 @@ static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source( } // Deallocate allocated thread_data. -static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) { +static inline void free_thread_data(AV1_PRIMARY *ppi) { PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; const int num_tf_workers = AOMMIN(p_mt_info->num_mod_workers[MOD_TF], p_mt_info->num_workers); diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h index 61c4b0a122..a4131cb7c9 100644 --- a/av1/encoder/encoder_utils.h +++ b/av1/encoder/encoder_utils.h @@ -39,7 +39,7 @@ extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES] // Mark all inactive blocks as active. Other segmentation features may be set // so memset cannot be used, instead only inactive blocks should be reset. -static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) { +static inline void suppress_active_map(AV1_COMP *cpi) { unsigned char *const seg_map = cpi->enc_seg.map; int i; const int num_mis = @@ -52,7 +52,7 @@ static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) { // Returns 'size' in the number of Mode Info (MI) units. 'size' is either the // width or height. -static AOM_INLINE int size_in_mi(int size) { +static inline int size_in_mi(int size) { // Ensure that the decoded width and height are both multiples of // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if // subsampling is used). 
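A worked example of the size_in_mi() arithmetic in the hunk that follows, assuming the tree's 4-pixel MI unit (MI_SIZE_LOG2 == 2) and the standard ALIGN_POWER_OF_TWO macro:

#include <stdio.h>

#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  const int width = 1921;                           /* an odd luma width */
  const int aligned = ALIGN_POWER_OF_TWO(width, 3); /* 1928: next multiple of 8 */
  printf("mi_cols = %d\n", aligned >> 2);           /* 482 four-pixel MI columns */
  return 0;
}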
@@ -62,8 +62,8 @@ static AOM_INLINE int size_in_mi(int size) { return aligned_size >> MI_SIZE_LOG2; } -static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width, - int height) { +static inline void set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height) { mi_params->mi_cols = size_in_mi(width); mi_params->mi_rows = size_in_mi(height); mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); @@ -80,7 +80,7 @@ static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width, mi_size_high[mi_params->mi_alloc_bsize]); } -static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) { +static inline void enc_free_mi(CommonModeInfoParams *mi_params) { aom_free(mi_params->mi_alloc); mi_params->mi_alloc = NULL; mi_params->mi_alloc_size = 0; @@ -91,24 +91,23 @@ static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) { mi_params->tx_type_map = NULL; } -static AOM_INLINE void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width, - int height, - BLOCK_SIZE min_partition_size) { +static inline void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height, BLOCK_SIZE min_partition_size) { mi_params->mi_alloc_bsize = min_partition_size; set_mb_mi(mi_params, width, height); } -static AOM_INLINE void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, - int width, int height, - BLOCK_SIZE min_partition_size) { +static inline void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, + int width, int height, + BLOCK_SIZE min_partition_size) { (void)min_partition_size; mi_params->mi_alloc_bsize = BLOCK_16X16; set_mb_mi(mi_params, width, height); } -static AOM_INLINE void enc_setup_mi(CommonModeInfoParams *mi_params) { +static inline void enc_setup_mi(CommonModeInfoParams *mi_params) { const int mi_grid_size = mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); memset(mi_params->mi_alloc, 0, @@ -119,7 +118,7 @@ static AOM_INLINE void enc_setup_mi(CommonModeInfoParams *mi_params) { mi_grid_size * sizeof(*mi_params->tx_type_map)); } -static AOM_INLINE void init_buffer_indices( +static inline void init_buffer_indices( ForceIntegerMVInfo *const force_intpel_info, int *const remapped_ref_idx) { int fb_idx; for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) @@ -570,7 +569,7 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) #endif -static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) { +static inline void highbd_set_var_fns(AV1_PRIMARY *const ppi) { SequenceHeader *const seq_params = &ppi->seq_params; if (seq_params->use_highbitdepth) { switch (seq_params->bit_depth) { @@ -885,7 +884,7 @@ static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) { } #endif // CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) { +static inline void copy_frame_prob_info(AV1_COMP *cpi) { FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs; if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { av1_copy(frame_probs->tx_type_probs, default_tx_type_probs); @@ -941,8 +940,8 @@ static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) { #endif } -static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst, - const CdefInfo *const src) { +static inline void restore_cdef_coding_context(CdefInfo *const dst, + const CdefInfo *const src) { dst->cdef_bits = src->cdef_bits; dst->cdef_damping = src->cdef_damping; av1_copy(dst->cdef_strengths, src->cdef_strengths); @@ -953,7 +952,7 @@ static AOM_INLINE void restore_cdef_coding_context(CdefInfo 
*const dst, // Coding context that only needs to be restored when recode loop includes // filtering (deblocking, CDEF, superres post-encode upscale and/or loop // restoration). -static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) { +static inline void restore_extra_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; cm->lf = cc->lf; @@ -962,8 +961,8 @@ static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) { cpi->ppi->mv_stats = cc->mv_stats; } -static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { +static inline int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { return a->y_height == b->y_height && a->y_width == b->y_width && a->uv_height == b->uv_height && a->uv_width == b->uv_width && a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && @@ -972,20 +971,19 @@ static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, (b->flags & YV12_FLAG_HIGHBITDEPTH); } -static AOM_INLINE int update_entropy(bool *ext_refresh_frame_context, - bool *ext_refresh_frame_context_pending, - bool update) { +static inline int update_entropy(bool *ext_refresh_frame_context, + bool *ext_refresh_frame_context_pending, + bool update) { *ext_refresh_frame_context = update; *ext_refresh_frame_context_pending = 1; return 0; } #if !CONFIG_REALTIME_ONLY -static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor, - double max_factor, - int prior_boost, - int tpl_boost, - int frames_to_key) { +static inline int combine_prior_with_tpl_boost(double min_factor, + double max_factor, + int prior_boost, int tpl_boost, + int frames_to_key) { double factor = sqrt((double)frames_to_key); double range = max_factor - min_factor; factor = AOMMIN(factor, max_factor); @@ -997,7 +995,7 @@ static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor, } #endif -static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) { +static inline void set_size_independent_vars(AV1_COMP *cpi) { int i; AV1_COMMON *const cm = &cpi->common; FeatureFlags *const features = &cm->features; @@ -1013,7 +1011,7 @@ static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) { features->allow_warped_motion, cpi->oxcf.motion_mode_cfg.enable_obmc); } -static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) { +static inline void release_scaled_references(AV1_COMP *cpi) { // Scaled references should only need to be released under certain conditions: // if the reference will be updated, or if the scaled reference has the same // resolution. For now only apply this to Golden for non-svc RTC mode. @@ -1037,12 +1035,12 @@ static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) { } } -static AOM_INLINE void restore_all_coding_context(AV1_COMP *cpi) { +static inline void restore_all_coding_context(AV1_COMP *cpi) { restore_extra_coding_context(cpi); if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi); } -static AOM_INLINE int reduce_num_ref_buffers(const AV1_COMP *cpi) { +static inline int reduce_num_ref_buffers(const AV1_COMP *cpi) { const SequenceHeader *const seq_params = cpi->common.seq_params; return is_one_pass_rt_params(cpi) && use_rtc_reference_structure_one_layer(cpi) && @@ -1051,7 +1049,7 @@ static AOM_INLINE int reduce_num_ref_buffers(const AV1_COMP *cpi) { } // Refresh reference frame buffers according to refresh_frame_flags. 
-static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) { +static inline void refresh_reference_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; // All buffers are refreshed for shown keyframes and S-frames. // In case of RT, golden frame refreshes the 6th slot and other reference @@ -1114,8 +1112,8 @@ void av1_save_all_coding_context(AV1_COMP *cpi); void av1_dump_filtered_recon_frames(AV1_COMP *cpi); #endif -static AOM_INLINE int av1_get_enc_border_size(bool resize, bool all_intra, - BLOCK_SIZE sb_size) { +static inline int av1_get_enc_border_size(bool resize, bool all_intra, + BLOCK_SIZE sb_size) { // For allintra encoding mode, inter-frame motion search is not applicable and // the intraBC motion vectors are restricted within the tile boundaries. Hence // a smaller frame border size (AOM_ENC_ALLINTRA_BORDER) is used in this case. @@ -1128,7 +1126,7 @@ static AOM_INLINE int av1_get_enc_border_size(bool resize, bool all_intra, return block_size_wide[sb_size] + 32; } -static AOM_INLINE bool av1_is_resize_needed(const AV1EncoderConfig *oxcf) { +static inline bool av1_is_resize_needed(const AV1EncoderConfig *oxcf) { const ResizeCfg *resize_cfg = &oxcf->resize_cfg; const SuperResCfg *superres_cfg = &oxcf->superres_cfg; return resize_cfg->resize_mode || superres_cfg->superres_mode; diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c index 9b4b178c90..6d3171be3d 100644 --- a/av1/encoder/ethread.c +++ b/av1/encoder/ethread.c @@ -37,7 +37,7 @@ #include "av1/encoder/temporal_filter.h" #include "av1/encoder/tpl_model.h" -static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { +static inline void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { td->rd_counts.compound_ref_used_flag |= td_t->rd_counts.compound_ref_used_flag; td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; @@ -63,7 +63,7 @@ static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { td->rd_counts.newmv_or_intra_blocks += td_t->rd_counts.newmv_or_intra_blocks; } -static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { +static inline void update_delta_lf_for_row_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->td.mb.e_mbd; const int mib_size = cm->seq_params->mib_size; @@ -233,7 +233,7 @@ void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) { } } -static AOM_INLINE int get_sb_rows_in_frame(AV1_COMMON *cm) { +static inline int get_sb_rows_in_frame(AV1_COMMON *cm) { return CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); } @@ -302,8 +302,8 @@ void av1_row_mt_mem_dealloc(AV1_COMP *cpi) { enc_row_mt->allocated_sb_rows = 0; } -static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id, - int num_tiles, int num_workers) { +static inline void assign_tile_to_thread(int *thread_id_to_tile_id, + int num_tiles, int num_workers) { int tile_id = 0; int i; @@ -313,8 +313,8 @@ static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id, } } -static AOM_INLINE int get_next_job(TileDataEnc *const tile_data, - int *current_mi_row, int mib_size) { +static inline int get_next_job(TileDataEnc *const tile_data, + int *current_mi_row, int mib_size) { AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; const int mi_row_end = tile_data->tile_info.mi_row_end; @@ -327,7 +327,7 @@ static AOM_INLINE int get_next_job(TileDataEnc *const tile_data, return 0; } -static AOM_INLINE void switch_tile_and_get_next_job( +static inline void 
switch_tile_and_get_next_job( AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id, int *current_mi_row, int *end_of_frame, int is_firstpass, const BLOCK_SIZE fp_block_size) { @@ -1124,8 +1124,8 @@ void av1_terminate_workers(AV1_PRIMARY *ppi) { // This function returns 1 if frame parallel encode is supported for // the current configuration. Returns 0 otherwise. -static AOM_INLINE int is_fpmt_config(const AV1_PRIMARY *ppi, - const AV1EncoderConfig *oxcf) { +static inline int is_fpmt_config(const AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf) { // FPMT is enabled for AOM_Q and AOM_VBR. // TODO(Tarun): Test and enable resize config. if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) { @@ -1196,7 +1196,7 @@ int av1_check_fpmt_config(AV1_PRIMARY *const ppi, #define MAX_THREADS 100 // Computes the max number of enc workers possible for each resolution. -static AOM_INLINE int compute_max_num_enc_workers( +static inline int compute_max_num_enc_workers( CommonModeInfoParams *const mi_params, int mib_size_log2) { int num_sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2); int num_sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2); @@ -1253,25 +1253,26 @@ int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) { } // Computes the number of workers to process each of the parallel frames. -static AOM_INLINE int compute_num_workers_per_frame( +static inline int compute_num_workers_per_frame( const int num_workers, const int parallel_frame_count) { // Number of level 2 workers per frame context (floor division). int workers_per_frame = (num_workers / parallel_frame_count); return workers_per_frame; } -static AOM_INLINE void restore_workers_after_fpmt( - AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared); +static inline void restore_workers_after_fpmt(AV1_PRIMARY *ppi, + int parallel_frame_count, + int num_fpmt_workers_prepared); // Prepare level 1 workers. This function is only called for // parallel_frame_count > 1. This function populates the mt_info structure of // frame level contexts appropriately by dividing the total number of available // workers amongst the frames as level 2 workers. It also populates the hook and // data members of level 1 workers. -static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi, - AV1_COMP_DATA *first_cpi_data, - AVxWorkerHook hook, - int parallel_frame_count) { +static inline void prepare_fpmt_workers(AV1_PRIMARY *ppi, + AV1_COMP_DATA *first_cpi_data, + AVxWorkerHook hook, + int parallel_frame_count) { assert(parallel_frame_count <= ppi->num_fp_contexts && parallel_frame_count > 1); @@ -1358,7 +1359,7 @@ static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi, } // Launch level 1 workers to perform frame parallel encode. -static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) { +static inline void launch_fpmt_workers(AV1_PRIMARY *ppi) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int num_workers = ppi->p_mt_info.p_num_workers; @@ -1372,8 +1373,9 @@ static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) { } // Restore worker states after parallel encode. 
-static AOM_INLINE void restore_workers_after_fpmt( - AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared) { +static inline void restore_workers_after_fpmt(AV1_PRIMARY *ppi, + int parallel_frame_count, + int num_fpmt_workers_prepared) { assert(parallel_frame_count <= ppi->num_fp_contexts && parallel_frame_count > 1); (void)parallel_frame_count; @@ -1413,8 +1415,8 @@ static AOM_INLINE void restore_workers_after_fpmt( } // Synchronize level 1 workers. -static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi, - int frames_in_parallel_set) { +static inline void sync_fpmt_workers(AV1_PRIMARY *ppi, + int frames_in_parallel_set) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int num_workers = ppi->p_mt_info.p_num_workers; int had_error = 0; @@ -1468,8 +1470,8 @@ void av1_compress_parallel_frames(AV1_PRIMARY *const ppi, ref_buffers_used_map); } -static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info, - int num_workers) { +static inline void launch_workers(MultiThreadInfo *const mt_info, + int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; @@ -1481,8 +1483,8 @@ static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info, } } -static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info, - AV1_COMMON *const cm, int num_workers) { +static inline void sync_enc_workers(MultiThreadInfo *const mt_info, + AV1_COMMON *const cm, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); const AVxWorker *const worker_main = &mt_info->workers[0]; int had_error = worker_main->had_error; @@ -1511,8 +1513,8 @@ static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info, xd->error_info = cm->error; } -static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi, - int num_workers) { +static inline void accumulate_counters_enc_workers(AV1_COMP *cpi, + int num_workers) { for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &cpi->mt_info.workers[i]; EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; @@ -1550,8 +1552,8 @@ static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi, } } -static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, - int num_workers) { +static inline void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { MultiThreadInfo *const mt_info = &cpi->mt_info; AV1_COMMON *const cm = &cpi->common; for (int i = num_workers - 1; i >= 0; i--) { @@ -1655,8 +1657,8 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, } #if !CONFIG_REALTIME_ONLY -static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, - int num_workers) { +static inline void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; for (int i = num_workers - 1; i >= 0; i--) { @@ -1685,8 +1687,8 @@ static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, #endif // Computes the number of workers for row multi-threading of encoding stage -static AOM_INLINE int compute_num_enc_row_mt_workers(const AV1_COMMON *cm, - int max_threads) { +static inline int compute_num_enc_row_mt_workers(const AV1_COMMON *cm, + int max_threads) { TileInfo tile_info; const int tile_cols = cm->tiles.cols; const int tile_rows = 
cm->tiles.rows; @@ -1704,8 +1706,8 @@ static AOM_INLINE int compute_num_enc_row_mt_workers(const AV1_COMMON *cm, } // Computes the number of workers for tile multi-threading of encoding stage -static AOM_INLINE int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm, - int max_threads) { +static inline int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm, + int max_threads) { const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; return AOMMIN(max_threads, tile_cols * tile_rows); @@ -1764,9 +1766,9 @@ void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, // Computes the maximum number of sb rows and sb_cols across tiles which are // used to allocate memory for multi-threaded encoding with row-mt=1. -static AOM_INLINE void compute_max_sb_rows_cols(const AV1_COMMON *cm, - int *max_sb_rows_in_tile, - int *max_sb_cols_in_tile) { +static inline void compute_max_sb_rows_cols(const AV1_COMMON *cm, + int *max_sb_rows_in_tile, + int *max_sb_cols_in_tile) { const int tile_rows = cm->tiles.rows; const int mib_size_log2 = cm->seq_params->mib_size_log2; const int num_mi_rows = cm->mi_params.mi_rows; @@ -1820,8 +1822,8 @@ int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) { // Computes the maximum number of mb_rows for row multi-threading of firstpass // stage -static AOM_INLINE int fp_compute_max_mb_rows(const AV1_COMMON *cm, - BLOCK_SIZE fp_block_size) { +static inline int fp_compute_max_mb_rows(const AV1_COMMON *cm, + BLOCK_SIZE fp_block_size) { const int tile_rows = cm->tiles.rows; const int unit_height_log2 = mi_size_high_log2[fp_block_size]; const int mib_size_log2 = cm->seq_params->mib_size_log2; @@ -2129,7 +2131,7 @@ void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r, #endif // CONFIG_MULTITHREAD } -static AOM_INLINE void set_mode_estimation_done(AV1_COMP *cpi) { +static inline void set_mode_estimation_done(AV1_COMP *cpi) { const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; TplParams *const tpl_data = &cpi->ppi->tpl_data; const BLOCK_SIZE bsize = @@ -2260,8 +2262,8 @@ static void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm, // Each worker is prepared by assigning the hook function and individual thread // data. -static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook, - int num_workers) { +static inline void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { MultiThreadInfo *mt_info = &cpi->mt_info; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *worker = &mt_info->workers[i]; @@ -2366,8 +2368,8 @@ void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) { // Checks if a job is available. If job is available, // populates next_tf_row and returns 1, else returns 0. -static AOM_INLINE int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync, - int *current_mb_row, int mb_rows) { +static inline int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync, + int *current_mb_row, int mb_rows) { int do_next_row = 0; #if CONFIG_MULTITHREAD pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_; @@ -2519,8 +2521,7 @@ void av1_tf_do_filtering_mt(AV1_COMP *cpi) { // Checks if a job is available in the current direction. If a job is available, // frame_idx will be populated and returns 1, else returns 0. 
-static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, - int cur_dir) { +static inline int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, int cur_dir) { GlobalMotionInfo *gm_info = &cpi->gm_info; GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; @@ -2537,8 +2538,8 @@ static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, // Switches the current direction and calls the function get_next_gm_job() if // the speed feature 'prune_ref_frame_for_gm_search' is not set. -static AOM_INLINE void switch_direction(AV1_COMP *cpi, int *frame_idx, - int *cur_dir) { +static inline void switch_direction(AV1_COMP *cpi, int *frame_idx, + int *cur_dir) { if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return; // Switch the direction and get next job *cur_dir = !(*cur_dir); @@ -2629,8 +2630,8 @@ static int gm_mt_worker_hook(void *arg1, void *unused) { } // Assigns global motion hook function and thread data to each worker. -static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook, - int num_workers) { +static inline void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { MultiThreadInfo *mt_info = &cpi->mt_info; mt_info->gm_sync.gm_mt_exit = false; for (int i = num_workers - 1; i >= 0; i--) { @@ -2658,8 +2659,8 @@ static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook, } // Assigns available threads to past/future direction. -static AOM_INLINE void assign_thread_to_dir(int8_t *thread_id_to_dir, - int num_workers) { +static inline void assign_thread_to_dir(int8_t *thread_id_to_dir, + int num_workers) { int8_t frame_dir_idx = 0; for (int i = 0; i < num_workers; i++) { @@ -2669,7 +2670,7 @@ static AOM_INLINE void assign_thread_to_dir(int8_t *thread_id_to_dir, } // Computes number of workers for global motion multi-threading. -static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) { +static inline int compute_gm_workers(const AV1_COMP *cpi) { int total_refs = cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1]; int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search @@ -2680,7 +2681,7 @@ static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) { } // Frees the memory allocated for each worker in global motion multi-threading. 
-static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) { +static inline void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) { MultiThreadInfo *mt_info = &cpi->mt_info; for (int j = 0; j < num_workers; j++) { EncWorkerData *thread_data = &mt_info->tile_thr_data[j]; @@ -2705,7 +2706,7 @@ void av1_global_motion_estimation_mt(AV1_COMP *cpi) { } #endif // !CONFIG_REALTIME_ONLY -static AOM_INLINE int get_next_job_allintra( +static inline int get_next_job_allintra( AV1EncRowMultiThreadSync *const row_mt_sync, const int mi_row_end, int *current_mi_row, int mib_size) { if (row_mt_sync->next_mi_row < mi_row_end) { @@ -2717,9 +2718,9 @@ static AOM_INLINE int get_next_job_allintra( return 0; } -static AOM_INLINE void prepare_wiener_var_workers(AV1_COMP *const cpi, - AVxWorkerHook hook, - const int num_workers) { +static inline void prepare_wiener_var_workers(AV1_COMP *const cpi, + AVxWorkerHook hook, + const int num_workers) { MultiThreadInfo *const mt_info = &cpi->mt_info; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; @@ -2898,7 +2899,7 @@ static int compare_tile_order(const void *a, const void *b) { } // Get next tile index to be processed for pack bitstream -static AOM_INLINE int get_next_pack_bs_tile_idx( +static inline int get_next_pack_bs_tile_idx( AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) { assert(pack_bs_sync->next_job_idx <= num_tiles); if (pack_bs_sync->next_job_idx == num_tiles) return -1; @@ -2909,11 +2910,10 @@ static AOM_INLINE int get_next_pack_bs_tile_idx( // Calculates bitstream chunk size based on total buffer size and tile or tile // group size. -static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size, - const int frame_or_tg_size, - size_t *remain_buf_size, - size_t max_buf_size, - int is_last_chunk) { +static inline size_t get_bs_chunk_size(int tg_or_tile_size, + const int frame_or_tg_size, + size_t *remain_buf_size, + size_t max_buf_size, int is_last_chunk) { size_t this_chunk_size; assert(*remain_buf_size > 0); if (is_last_chunk) { @@ -3260,7 +3260,7 @@ static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) { } // Initializes cdef_sync parameters. -static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) { +static inline void cdef_reset_job_info(AV1CdefSync *cdef_sync) { #if CONFIG_MULTITHREAD if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); #endif // CONFIG_MULTITHREAD @@ -3272,11 +3272,11 @@ static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) { // Checks if a job is available. If job is available, // populates next job information and returns 1, else returns 0. -static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync, - CdefSearchCtx *cdef_search_ctx, - volatile int *cur_fbr, - volatile int *cur_fbc, - volatile int *sb_count) { +static inline int cdef_get_next_job(AV1CdefSync *cdef_sync, + CdefSearchCtx *cdef_search_ctx, + volatile int *cur_fbr, + volatile int *cur_fbc, + volatile int *sb_count) { #if CONFIG_MULTITHREAD pthread_mutex_lock(cdef_sync->mutex_); #endif // CONFIG_MULTITHREAD @@ -3371,7 +3371,7 @@ void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) { } // Computes num_workers for temporal filter multi-threading. -static AOM_INLINE int compute_num_tf_workers(const AV1_COMP *cpi) { +static inline int compute_num_tf_workers(const AV1_COMP *cpi) { // For single-pass encode, using no. of workers as per tf block size was not // found to improve speed. 
Hence the thread assignment for single-pass encode // is kept based on compute_num_enc_workers(). @@ -3388,33 +3388,33 @@ static AOM_INLINE int compute_num_tf_workers(const AV1_COMP *cpi) { } // Computes num_workers for tpl multi-threading. -static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) { +static inline int compute_num_tpl_workers(AV1_COMP *cpi) { return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for loop filter multi-threading. -static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) { +static inline int compute_num_lf_workers(AV1_COMP *cpi) { return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for cdef multi-threading. -static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) { +static inline int compute_num_cdef_workers(AV1_COMP *cpi) { return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for loop-restoration multi-threading. -static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) { +static inline int compute_num_lr_workers(AV1_COMP *cpi) { return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for pack bitstream multi-threading. -static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) { +static inline int compute_num_pack_bs_workers(AV1_COMP *cpi) { if (cpi->oxcf.max_threads <= 1) return 1; return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads); } // Computes num_workers for all intra multi-threading. -static AOM_INLINE int compute_num_ai_workers(AV1_COMP *cpi) { +static inline int compute_num_ai_workers(AV1_COMP *cpi) { if (cpi->oxcf.max_threads <= 1) return 1; // The multi-threading implementation of deltaq-mode = 3 in allintra // mode is based on row multi threading. 
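The compute_num_*_workers() helpers above all reduce to the same arithmetic: split the frame's mode-info rows into superblock rows with CEIL_POWER_OF_TWO, then clamp by max_threads. A minimal standalone sketch of that division; sb_rows_for() and the 1080p numbers are illustrative assumptions, not names from the tree:

#include <stdio.h>

/* Mirrors CEIL_POWER_OF_TWO(mi_rows, mib_size_log2): the number of
 * superblock rows is ceil(mi_rows / 2^mib_size_log2). sb_rows_for() is a
 * hypothetical name used only for this sketch. */
static int sb_rows_for(int mi_rows, int mib_size_log2) {
  return (mi_rows + (1 << mib_size_log2) - 1) >> mib_size_log2;
}

int main(void) {
  /* 1080 luma rows -> 270 rows of 4x4 mode-info units; a 64x64 superblock
   * spans 16 of them (mib_size_log2 == 4), so ceil(270 / 16) == 17. */
  printf("sb rows: %d\n", sb_rows_for(270, 4));
  return 0;
}

For a single-tile frame, row multi-threading can keep at most one worker busy per superblock row, so worker counts beyond this value buy nothing.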
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c index c497eeba60..416ee42526 100644 --- a/av1/encoder/firstpass.c +++ b/av1/encoder/firstpass.c @@ -59,8 +59,8 @@ #define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1 -static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats, - struct aom_codec_pkt_list *pktlist) { +static inline void output_stats(FIRSTPASS_STATS *stats, + struct aom_codec_pkt_list *pktlist) { struct aom_codec_cx_pkt pkt; pkt.kind = AOM_CODEC_STATS_PKT; pkt.data.twopass_stats.buf = stats; @@ -261,9 +261,8 @@ static int get_search_range(int width, int height) { return sr; } -static AOM_INLINE const search_site_config * -av1_get_first_pass_search_site_config(const AV1_COMP *cpi, MACROBLOCK *x, - SEARCH_METHODS search_method) { +static inline const search_site_config *av1_get_first_pass_search_site_config( + const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) { const int ref_stride = x->e_mbd.plane[0].pre[0].stride; // For AVIF applications, even the source frames can have changing resolution, @@ -290,10 +289,10 @@ av1_get_first_pass_search_site_config(const AV1_COMP *cpi, MACROBLOCK *x, return x->search_site_cfg_buf; } -static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, - const MV *ref_mv, - FULLPEL_MV *best_mv, - int *best_motion_err) { +static inline void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, + const MV *ref_mv, + FULLPEL_MV *best_mv, + int *best_motion_err) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); @@ -391,7 +390,7 @@ static double raw_motion_error_stdev(int *raw_motion_err_list, return raw_err_stdev; } -static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) { +static inline int calc_wavelet_energy(const AV1EncoderConfig *oxcf) { return oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL; } typedef struct intra_pred_block_pass1_args { diff --git a/av1/encoder/global_motion_facade.c b/av1/encoder/global_motion_facade.c index d085a35a3b..73a4e3c17f 100644 --- a/av1/encoder/global_motion_facade.c +++ b/av1/encoder/global_motion_facade.c @@ -76,7 +76,7 @@ static int gm_get_params_cost(const WarpedMotionParams *gm, // For the given reference frame, computes the global motion parameters for // different motion models and finds the best. -static AOM_INLINE void compute_global_motion_for_ref_frame( +static inline void compute_global_motion_for_ref_frame( AV1_COMP *cpi, struct aom_internal_error_info *error_info, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w, @@ -193,7 +193,7 @@ void av1_compute_gm_for_valid_ref_frames( } // Loops over valid reference frames and computes global motion estimation. -static AOM_INLINE void compute_global_motion_for_references( +static inline void compute_global_motion_for_references( AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames, MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w, @@ -270,7 +270,7 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) { // Populates valid reference frames in past/future directions in // 'reference_frames' and their count in 'num_ref_frames'. 
-static AOM_INLINE void update_valid_ref_frames_for_gm( +static inline void update_valid_ref_frames_for_gm( AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1], int *num_ref_frames) { @@ -340,7 +340,7 @@ static AOM_INLINE void update_valid_ref_frames_for_gm( } // Initializes parameters used for computing global motion. -static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) { +static inline void setup_global_motion_info_params(AV1_COMP *cpi) { GlobalMotionInfo *const gm_info = &cpi->gm_info; YV12_BUFFER_CONFIG *source = cpi->source; @@ -381,7 +381,7 @@ static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) { } // Computes global motion w.r.t. valid reference frames. -static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) { +static inline void global_motion_estimation(AV1_COMP *cpi) { GlobalMotionInfo *const gm_info = &cpi->gm_info; GlobalMotionData *gm_data = &cpi->td.gm_data; diff --git a/av1/encoder/global_motion_facade.h b/av1/encoder/global_motion_facade.h index 02dd0f5250..3e22f3bcba 100644 --- a/av1/encoder/global_motion_facade.h +++ b/av1/encoder/global_motion_facade.h @@ -19,7 +19,7 @@ struct yv12_buffer_config; struct AV1_COMP; // Allocates memory for members of GlobalMotionData. -static AOM_INLINE void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) { +static inline void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) { AV1_COMMON *cm = &cpi->common; GlobalMotionInfo *gm_info = &cpi->gm_info; @@ -36,7 +36,7 @@ static AOM_INLINE void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) { } // Deallocates the memory allocated for members of GlobalMotionData. -static AOM_INLINE void gm_dealloc_data(GlobalMotionData *gm_data) { +static inline void gm_dealloc_data(GlobalMotionData *gm_data) { aom_free(gm_data->segment_map); gm_data->segment_map = NULL; for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) { diff --git a/av1/encoder/gop_structure.c b/av1/encoder/gop_structure.c index 344c990005..d4b74a648d 100644 --- a/av1/encoder/gop_structure.c +++ b/av1/encoder/gop_structure.c @@ -59,7 +59,7 @@ static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index, } // Sets the GF_GROUP params for LF_UPDATE frames. -static AOM_INLINE void set_params_for_leaf_frames( +static inline void set_params_for_leaf_frames( const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, @@ -93,7 +93,7 @@ static AOM_INLINE void set_params_for_leaf_frames( } // Sets the GF_GROUP params for INTNL_OVERLAY_UPDATE frames. -static AOM_INLINE void set_params_for_intnl_overlay_frames( +static inline void set_params_for_intnl_overlay_frames( GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, int *first_frame_index, int *cur_disp_index, int layer_depth) { gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; @@ -111,7 +111,7 @@ static AOM_INLINE void set_params_for_intnl_overlay_frames( } // Sets the GF_GROUP params for INTNL_ARF_UPDATE frames. -static AOM_INLINE void set_params_for_internal_arfs( +static inline void set_params_for_internal_arfs( const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, @@ -279,10 +279,9 @@ typedef struct { } FRAME_REORDER_INFO; // Updates the stats required to configure the GF_GROUP. 
-static AOM_INLINE void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats, - int arf_frame_index, - int display_idx, int start, - int end) { +static inline void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats, + int arf_frame_index, int display_idx, + int start, int end) { arf_frame_stats[arf_frame_index].start = start; arf_frame_stats[arf_frame_index].end = end; arf_frame_stats[arf_frame_index].display_index = display_idx; @@ -290,7 +289,7 @@ static AOM_INLINE void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats, // Sets GF_GROUP params for INTNL_ARF_UPDATE frames. Also populates // doh_gf_index_map and arf_frame_stats. -static AOM_INLINE void set_params_for_internal_arfs_in_gf14( +static inline void set_params_for_internal_arfs_in_gf14( GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, int *cur_frame_idx, int *cur_disp_idx, int *frame_ind, int *count_arf_frames, int *doh_gf_index_map, int start, int end, @@ -333,7 +332,7 @@ static AOM_INLINE void set_params_for_internal_arfs_in_gf14( // Sets GF_GROUP params for all INTNL_ARF_UPDATE frames in the given layer // depth. -static AOM_INLINE void set_params_for_cur_layer_frames( +static inline void set_params_for_cur_layer_frames( GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, int *cur_frame_idx, int *cur_disp_idx, int *frame_ind, int *count_arf_frames, int *doh_gf_index_map, int num_dir, int node_start, @@ -373,7 +372,7 @@ static AOM_INLINE void set_params_for_cur_layer_frames( // Configures multi-layers of the GF_GROUP when consecutive encode of frames in // the same layer depth is enabled. -static AOM_INLINE void set_multi_layer_params_for_gf14( +static inline void set_multi_layer_params_for_gf14( const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c index 3e9ae6bac3..f5810de9c6 100644 --- a/av1/encoder/intra_mode_search.c +++ b/av1/encoder/intra_mode_search.c @@ -399,7 +399,7 @@ void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi, } } -static AOM_INLINE int get_model_rd_index_for_pruning( +static inline int get_model_rd_index_for_pruning( const MACROBLOCK *const x, const INTRA_MODE_SPEED_FEATURES *const intra_sf) { const int top_intra_model_count_allowed = @@ -641,8 +641,8 @@ static int cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x, return est_best_cfl_idx; } -static AOM_INLINE void set_invalid_cfl_parameters( - uint8_t *best_cfl_alpha_idx, int8_t *best_cfl_alpha_signs) { +static inline void set_invalid_cfl_parameters(uint8_t *best_cfl_alpha_idx, + int8_t *best_cfl_alpha_signs) { *best_cfl_alpha_idx = 0; *best_cfl_alpha_signs = 0; } @@ -1160,13 +1160,12 @@ void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x, * * \return Returns whether the current mode is an improvement over best_rd. 
*/ -static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, const int *bmode_costs, - int64_t *best_rd, int *rate, - int *rate_tokenonly, int64_t *distortion, - uint8_t *skippable, - MB_MODE_INFO *best_mbmi, - PICK_MODE_CONTEXT *ctx) { +static inline int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, const int *bmode_costs, + int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, + uint8_t *skippable, MB_MODE_INFO *best_mbmi, + PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; RD_STATS rd_stats; @@ -1436,7 +1435,7 @@ int av1_search_intra_uv_modes_in_interframe( // Checks if odd delta angles can be pruned based on rdcosts of even delta // angles of the corresponding directional mode. -static AOM_INLINE int prune_luma_odd_delta_angles_using_rd_cost( +static inline int prune_luma_odd_delta_angles_using_rd_cost( const MB_MODE_INFO *const mbmi, const int64_t *const intra_modes_rd_cost, int64_t best_rd, int prune_luma_odd_delta_angles_in_intra) { const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y]; diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h index c040797940..c03246a555 100644 --- a/av1/encoder/intra_mode_search.h +++ b/av1/encoder/intra_mode_search.h @@ -286,7 +286,7 @@ void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, /*! \brief Initializes the \ref IntraModeSearchState struct. */ -static AOM_INLINE void init_intra_mode_search_state( +static inline void init_intra_mode_search_state( IntraModeSearchState *intra_search_state) { memset(intra_search_state, 0, sizeof(*intra_search_state)); intra_search_state->rate_uv_intra = INT_MAX; diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h index 7940757213..720aec2a14 100644 --- a/av1/encoder/intra_mode_search_utils.h +++ b/av1/encoder/intra_mode_search_utils.h @@ -103,7 +103,7 @@ static const NN_CONFIG av1_intra_hog_model_nnconfig = { }; #define FIX_PREC_BITS (16) -static AOM_INLINE int get_hist_bin_idx(int dx, int dy) { +static inline int get_hist_bin_idx(int dx, int dy) { const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx; // Find index by bisection @@ -142,12 +142,12 @@ static AOM_INLINE int get_hist_bin_idx(int dx, int dy) { #undef FIX_PREC_BITS // Normalizes the hog data. -static AOM_INLINE void normalize_hog(float total, float *hist) { +static inline void normalize_hog(float total, float *hist) { for (int i = 0; i < BINS; ++i) hist[i] /= total; } -static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride, - int rows, int cols, float *hist) { +static inline void lowbd_generate_hog(const uint8_t *src, int stride, int rows, + int cols, float *hist) { float total = 0.1f; src += stride; for (int r = 1; r < rows - 1; ++r) { @@ -182,9 +182,9 @@ static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride, // Computes and stores pixel level gradient information of a given superblock // for LBD encode. 
-static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x, - BLOCK_SIZE sb_size, - PLANE_TYPE plane) { +static inline void lowbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { PixelLevelGradientInfo *const grad_info_sb = x->pixel_gradient_info + plane * MAX_SB_SQUARE; const uint8_t *src = x->plane[plane].src.buf; @@ -216,8 +216,8 @@ static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x, } #if CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride, - int rows, int cols, float *hist) { +static inline void highbd_generate_hog(const uint8_t *src8, int stride, + int rows, int cols, float *hist) { float total = 0.1f; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); src += stride; @@ -253,9 +253,9 @@ static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride, // Computes and stores pixel level gradient information of a given superblock // for HBD encode. -static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x, - BLOCK_SIZE sb_size, - PLANE_TYPE plane) { +static inline void highbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { PixelLevelGradientInfo *const grad_info_sb = x->pixel_gradient_info + plane * MAX_SB_SQUARE; const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf); @@ -287,8 +287,8 @@ static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x, } #endif // CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows, - int cols, float *hist, int highbd) { +static inline void generate_hog(const uint8_t *src8, int stride, int rows, + int cols, float *hist, int highbd) { #if CONFIG_AV1_HIGHBITDEPTH if (highbd) { highbd_generate_hog(src8, stride, rows, cols, hist); @@ -300,9 +300,9 @@ static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows, lowbd_generate_hog(src8, stride, rows, cols, hist); } -static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x, - BLOCK_SIZE sb_size, - PLANE_TYPE plane) { +static inline void compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(&x->e_mbd)) { highbd_compute_gradient_info_sb(x, sb_size, plane); @@ -324,7 +324,7 @@ static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x, // (1) Inter frames (due to early intra gating) // (2) When partition_search_type is not SEARCH_PARTITION // Hence, gradient data is computed at block level in such cases. -static AOM_INLINE bool is_gradient_caching_for_hog_enabled( +static inline bool is_gradient_caching_for_hog_enabled( const AV1_COMP *const cpi) { const SPEED_FEATURES *const sf = &cpi->sf; return frame_is_intra_only(&cpi->common) && !sf->rt_sf.use_nonrd_pick_mode && @@ -336,9 +336,9 @@ static AOM_INLINE bool is_gradient_caching_for_hog_enabled( // Function to generate pixel level gradient information for a given superblock. // Sets the flags 'is_sb_gradient_cached' for the specific plane-type if // gradient info is generated for the same. -static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE sb_size, int mi_row, - int mi_col) { +static inline void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE sb_size, int mi_row, + int mi_col) { // Initialise flags related to hog data caching. 
x->is_sb_gradient_cached[PLANE_TYPE_Y] = false; x->is_sb_gradient_cached[PLANE_TYPE_UV] = false; @@ -361,11 +361,11 @@ static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x, // Reuses the pixel level gradient data generated at superblock level for block // level histogram computation. -static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x, - int rows, int cols, - BLOCK_SIZE sb_size, - PLANE_TYPE plane, - float *hist) { +static inline void generate_hog_using_gradient_cache(const MACROBLOCK *x, + int rows, int cols, + BLOCK_SIZE sb_size, + PLANE_TYPE plane, + float *hist) { float total = 0.1f; const int ss_x = x->e_mbd.plane[plane].subsampling_x; const int ss_y = x->e_mbd.plane[plane].subsampling_y; @@ -434,7 +434,7 @@ static inline void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, } } -static AOM_INLINE void prune_intra_mode_with_hog( +static inline void prune_intra_mode_with_hog( const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th, uint8_t *directional_mode_skip_mask, int is_chroma) { const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y; @@ -457,7 +457,7 @@ int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf, const int stride, const int is_hbd); // Returns whether caching of source variance for 4x4 sub-blocks is allowed. -static AOM_INLINE bool is_src_var_for_4x4_sub_blocks_caching_enabled( +static inline bool is_src_var_for_4x4_sub_blocks_caching_enabled( const AV1_COMP *const cpi) { const SPEED_FEATURES *const sf = &cpi->sf; if (cpi->oxcf.mode != ALLINTRA) return false; @@ -473,7 +473,7 @@ static AOM_INLINE bool is_src_var_for_4x4_sub_blocks_caching_enabled( // Initialize the members of Block4x4VarInfo structure to -1 at the start // of every superblock. -static AOM_INLINE void init_src_var_info_of_4x4_sub_blocks( +static inline void init_src_var_info_of_4x4_sub_blocks( const AV1_COMP *const cpi, Block4x4VarInfo *src_var_info_of_4x4_sub_blocks, const BLOCK_SIZE sb_size) { if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return; @@ -486,7 +486,7 @@ static AOM_INLINE void init_src_var_info_of_4x4_sub_blocks( } // Returns the cost needed to send a uniformly distributed r.v. 
-static AOM_INLINE int write_uniform_cost(int n, int v) { +static inline int write_uniform_cost(int n, int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; if (l == 0) return 0; @@ -501,11 +501,11 @@ static AOM_INLINE int write_uniform_cost(int n, int v) { * * \callergraph */ -static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi, - const MACROBLOCK *x, - const MB_MODE_INFO *mbmi, - BLOCK_SIZE bsize, int mode_cost, - int discount_color_cost) { +static inline int intra_mode_info_cost_y(const AV1_COMP *cpi, + const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int mode_cost, + int discount_color_cost) { int total_rate = mode_cost; const ModeCosts *mode_costs = &x->mode_costs; const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0; @@ -569,10 +569,10 @@ static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi, * * \callergraph */ -static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi, - const MACROBLOCK *x, - const MB_MODE_INFO *mbmi, - BLOCK_SIZE bsize, int mode_cost) { +static inline int intra_mode_info_cost_uv(const AV1_COMP *cpi, + const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int mode_cost) { int total_rate = mode_cost; const ModeCosts *mode_costs = &x->mode_costs; const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0; @@ -666,9 +666,9 @@ static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x, * * \return Returns 1 if the given mode is pruned; 0 otherwise. */ -static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi, - MACROBLOCK *x, BLOCK_SIZE bsize, - int64_t *best_model_rd) { +static inline int model_intra_yrd_and_prune(const AV1_COMP *const cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + int64_t *best_model_rd) { const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); const int plane = 0; const AV1_COMMON *cm = &cpi->common; diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c index cfe1dddebf..a66a1e4cf8 100644 --- a/av1/encoder/mcomp.c +++ b/av1/encoder/mcomp.c @@ -854,13 +854,13 @@ static AOM_FORCE_INLINE void calc_int_sad_list( // If the current sad is lower than the current best sad. // Returns: // Whether the input sad (mv) is better than the current best. -static AOM_INLINE int update_mvs_and_sad(const unsigned int this_sad, - const FULLPEL_MV *mv, - const MV_COST_PARAMS *mv_cost_params, - unsigned int *best_sad, - unsigned int *raw_best_sad, - FULLPEL_MV *best_mv, - FULLPEL_MV *second_best_mv) { +static inline int update_mvs_and_sad(const unsigned int this_sad, + const FULLPEL_MV *mv, + const MV_COST_PARAMS *mv_cost_params, + unsigned int *best_sad, + unsigned int *raw_best_sad, + FULLPEL_MV *best_mv, + FULLPEL_MV *second_best_mv) { if (this_sad >= *best_sad) return 0; // Add the motion vector cost. @@ -877,7 +877,7 @@ static AOM_INLINE int update_mvs_and_sad(const unsigned int this_sad, // Calculate sad4 and update the bestmv information // in FAST_DIAMOND search method. 
-static AOM_INLINE void calc_sad4_update_bestmv( +static inline void calc_sad4_update_bestmv( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, const FULLPEL_MV center_mv, const uint8_t *center_address, @@ -914,7 +914,7 @@ static AOM_INLINE void calc_sad4_update_bestmv( } } -static AOM_INLINE void calc_sad3_update_bestmv( +static inline void calc_sad3_update_bestmv( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad, @@ -950,7 +950,7 @@ static AOM_INLINE void calc_sad3_update_bestmv( // Calculate sad and update the bestmv information // in FAST_DIAMOND search method. -static AOM_INLINE void calc_sad_update_bestmv( +static inline void calc_sad_update_bestmv( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, const FULLPEL_MV center_mv, const uint8_t *center_address, @@ -976,7 +976,7 @@ static AOM_INLINE void calc_sad_update_bestmv( } } -static AOM_INLINE void calc_sad_update_bestmv_with_indices( +static inline void calc_sad_update_bestmv_with_indices( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, const FULLPEL_MV center_mv, const uint8_t *center_address, @@ -2966,8 +2966,8 @@ static inline int is_cost_list_wellbehaved(const int *cost_list) { // x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0), // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). // The code below is an integerized version of that. -static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic, - int bits) { +static inline void get_cost_surf_min(const int *cost_list, int *ir, int *ic, + int bits) { *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), (cost_list[1] - 2 * cost_list[0] + cost_list[3])); *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), @@ -2989,7 +2989,7 @@ static inline int check_repeated_mv_and_update(int_mv *last_mv_search_list, return 0; } -static AOM_INLINE int setup_center_error_facade( +static inline int setup_center_error_facade( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion, diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h index e91e15c0b6..6c3e9fbee5 100644 --- a/av1/encoder/mcomp.h +++ b/av1/encoder/mcomp.h @@ -203,7 +203,7 @@ static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = { }; // Reinitialize the search site config. 
-static AOM_INLINE void av1_refresh_search_site_config( +static inline void av1_refresh_search_site_config( search_site_config *ss_cfg_buf, SEARCH_METHODS search_method, const int ref_stride) { const int level = diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h index d96e5ec129..08b81dadfd 100644 --- a/av1/encoder/model_rd.h +++ b/av1/encoder/model_rd.h @@ -67,8 +67,8 @@ static int64_t calculate_sse(MACROBLOCKD *const xd, return sse; } -static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd, - int plane, const BLOCK_SIZE bsize) { +static inline int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd, + int plane, const BLOCK_SIZE bsize) { struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); @@ -82,11 +82,11 @@ static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd, return sse; } -static AOM_INLINE void model_rd_from_sse(const AV1_COMP *const cpi, - const MACROBLOCK *const x, - BLOCK_SIZE plane_bsize, int plane, - int64_t sse, int num_samples, - int *rate, int64_t *dist) { +static inline void model_rd_from_sse(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist) { (void)num_samples; const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; @@ -114,11 +114,11 @@ static AOM_INLINE void model_rd_from_sse(const AV1_COMP *const cpi, // Fits a curve for rate and distortion using as feature: // log2(sse_norm/qstep^2) -static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi, - const MACROBLOCK *const x, - BLOCK_SIZE plane_bsize, int plane, - int64_t sse, int num_samples, - int *rate, int64_t *dist) { +static inline void model_rd_with_curvfit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, + int *rate, int64_t *dist) { (void)cpi; (void)plane_bsize; const MACROBLOCKD *const xd = &x->e_mbd; @@ -155,11 +155,13 @@ static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi, if (dist) *dist = dist_i; } -static AOM_INLINE void model_rd_for_sb( - const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, - int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, - uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, - int64_t *plane_sse, int64_t *plane_dist) { +static inline void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, + int *out_rate_sum, int64_t *out_dist_sum, + uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, + int64_t *plane_dist) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
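The divide-by-8 described in the comment above is a plain right shift applied to the dequantizer step before it reaches the modeling function. A minimal sketch of that normalization, assuming 8-bit depth (a shift of 3; deeper bit depths shift further) and an invented dequantizer value:

#include <stdio.h>

int main(void) {
  /* Transform coefficients carry an extra x8 scale relative to an
   * orthogonal transform, so the dequantizer step is also x8 scaled. */
  const int dequant_qtx = 88;  /* invented example value, x8 scaled */
  const int dequant_shift = 3; /* assumes 8-bit input */
  const int effective_qstep = dequant_qtx >> dequant_shift; /* 88 / 8 == 11 */
  printf("effective qstep: %d\n", effective_qstep);
  return 0;
}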
@@ -207,7 +209,7 @@ static AOM_INLINE void model_rd_for_sb( *out_dist_sum = dist_sum; } -static AOM_INLINE void model_rd_for_sb_with_curvfit( +static inline void model_rd_for_sb_with_curvfit( const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c index b03ed34c32..c45600291a 100644 --- a/av1/encoder/motion_search_facade.c +++ b/av1/encoder/motion_search_facade.c @@ -842,10 +842,9 @@ int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, return bestsme; } -static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi, - MACROBLOCK *x, BLOCK_SIZE bsize, - const MV *other_mv, int ref_idx, - uint8_t *second_pred) { +static inline void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, const MV *other_mv, + int ref_idx, uint8_t *second_pred) { const AV1_COMMON *const cm = &cpi->common; const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; @@ -904,7 +903,7 @@ int av1_compound_single_motion_search_interinter( mask, mask_stride, rate_mv, ref_idx); } -static AOM_INLINE void do_masked_motion_search_indexed( +static inline void do_masked_motion_search_indexed( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize, int_mv *tmp_mv, int *rate_mv, int which) { diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h index a4fe262474..8cf853c29f 100644 --- a/av1/encoder/motion_search_facade.h +++ b/av1/encoder/motion_search_facade.h @@ -68,7 +68,7 @@ int_mv av1_simple_motion_search_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, int num_planes, int use_subpixel, unsigned int *sse, unsigned int *var); -static AOM_INLINE const search_site_config *av1_get_search_site_config( +static inline const search_site_config *av1_get_search_site_config( const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) { const int ref_stride = x->e_mbd.plane[0].pre[0].stride; @@ -97,8 +97,8 @@ static AOM_INLINE const search_site_config *av1_get_search_site_config( return x->search_site_cfg_buf; } -static AOM_INLINE SEARCH_METHODS -av1_get_faster_search_method(SEARCH_METHODS search_method) { +static inline SEARCH_METHODS av1_get_faster_search_method( + SEARCH_METHODS search_method) { // Note on search method's accuracy: // 1. NSTEP // 2. 
DIAMOND @@ -121,7 +121,7 @@ av1_get_faster_search_method(SEARCH_METHODS search_method) { } } -static AOM_INLINE SEARCH_METHODS av1_get_default_mv_search_method( +static inline SEARCH_METHODS av1_get_default_mv_search_method( const MACROBLOCK *x, const MV_SPEED_FEATURES *mv_sf, BLOCK_SIZE bsize) { SEARCH_METHODS search_method = mv_sf->search_method; const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method; diff --git a/av1/encoder/mv_prec.c b/av1/encoder/mv_prec.c index e9aeb07785..748bb907fe 100644 --- a/av1/encoder/mv_prec.c +++ b/av1/encoder/mv_prec.c @@ -18,7 +18,7 @@ #include "av1/encoder/mv_prec.h" #if !CONFIG_REALTIME_ONLY -static AOM_INLINE int_mv get_ref_mv_for_mv_stats( +static inline int_mv get_ref_mv_for_mv_stats( const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, int ref_idx) { int ref_mv_idx = mbmi->ref_mv_idx; @@ -43,7 +43,7 @@ static AOM_INLINE int_mv get_ref_mv_for_mv_stats( : mbmi_ext_frame->global_mvs[ref_frame_type]; } -static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) { +static inline int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) { const aom_cdf_prob cur_cdf = AOM_ICDF(cdf[symbol]); const aom_cdf_prob prev_cdf = symbol ? AOM_ICDF(cdf[symbol - 1]) : 0; const aom_cdf_prob p15 = AOMMAX(cur_cdf - prev_cdf, EC_MIN_PROB); @@ -51,9 +51,9 @@ static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) { return av1_cost_symbol(p15); } -static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val, - int comp_idx, const AV1_COMP *cpi, - int *rates) { +static inline int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val, + int comp_idx, const AV1_COMP *cpi, + int *rates) { assert(comp_val != 0 && "mv component should not have zero value!"); const int sign = comp_val < 0; const int mag = sign ? 
-comp_val : comp_val; @@ -119,8 +119,8 @@ static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val, return total_rate; } -static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv, - const MV *cur_mv, const AV1_COMP *cpi) { +static inline void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv, + const MV *cur_mv, const AV1_COMP *cpi) { const MACROBLOCK *const x = &cpi->td.mb; const MACROBLOCKD *const xd = &x->e_mbd; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; @@ -170,9 +170,8 @@ static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv, } } -static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats, - const AV1_COMP *cpi, int mi_row, - int mi_col) { +static inline void collect_mv_stats_b(MV_STATS *mv_stats, const AV1_COMP *cpi, + int mi_row, int mi_col) { const AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; @@ -260,9 +259,9 @@ static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats, } // Split block -static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats, - const AV1_COMP *cpi, int mi_row, - int mi_col, BLOCK_SIZE bsize) { +static inline void collect_mv_stats_sb(MV_STATS *mv_stats, const AV1_COMP *cpi, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); const AV1_COMMON *cm = &cpi->common; @@ -328,9 +327,9 @@ static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats, } } -static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats, - const AV1_COMP *cpi, - const TileInfo *tile_info) { +static inline void collect_mv_stats_tile(MV_STATS *mv_stats, + const AV1_COMP *cpi, + const TileInfo *tile_info) { const AV1_COMMON *cm = &cpi->common; const int mi_row_start = tile_info->mi_row_start; const int mi_row_end = tile_info->mi_row_end; @@ -368,8 +367,8 @@ void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) { mv_stats->valid = 1; } -static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats, - int current_q) { +static inline int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats, + int current_q) { const AV1_COMMON *cm = &cpi->common; const int order_hint = cpi->common.current_frame.order_hint; const int order_diff = order_hint - mv_stats->order; diff --git a/av1/encoder/mv_prec.h b/av1/encoder/mv_prec.h index 2d022d59f3..5fd03154e9 100644 --- a/av1/encoder/mv_prec.h +++ b/av1/encoder/mv_prec.h @@ -20,7 +20,7 @@ #if !CONFIG_REALTIME_ONLY void av1_collect_mv_stats(AV1_COMP *cpi, int current_q); -static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { +static inline int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { const int gf_group_index = cpi->gf_frame_index; const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index]; return !frame_is_intra_only(&cpi->common) && @@ -29,9 +29,9 @@ static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { } #endif // !CONFIG_REALTIME_ONLY -static AOM_INLINE void av1_set_high_precision_mv( - AV1_COMP *cpi, int allow_high_precision_mv, - int cur_frame_force_integer_mv) { +static inline void av1_set_high_precision_mv(AV1_COMP *cpi, + int allow_high_precision_mv, + int cur_frame_force_integer_mv) { MvCosts *const mv_costs = cpi->td.mb.mv_costs; // Avoid accessing 'mv_costs' when it is not allocated. 
if (mv_costs == NULL) return; diff --git a/av1/encoder/nonrd_opt.c b/av1/encoder/nonrd_opt.c index 909c67ca38..5cb437d122 100644 --- a/av1/encoder/nonrd_opt.c +++ b/av1/encoder/nonrd_opt.c @@ -331,9 +331,8 @@ void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable, // av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the // potential room of improvement for writing AVX2 optimization is only 3% * 8% = // 0.24% of total encoding time. -static AOM_INLINE void scale_square_buf_vals(int16_t *dst, int tx_width, - const int16_t *src, - int src_stride) { +static inline void scale_square_buf_vals(int16_t *dst, int tx_width, + const int16_t *src, int src_stride) { #define DO_SCALING \ do { \ for (int idy = 0; idy < tx_width; ++idy) { \ diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index a856c95941..db5c56c759 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c @@ -1265,9 +1265,9 @@ static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc, } #if !CONFIG_REALTIME_ONLY -static AOM_INLINE int is_warped_mode_allowed(const AV1_COMP *cpi, - MACROBLOCK *const x, - const MB_MODE_INFO *mbmi) { +static inline int is_warped_mode_allowed(const AV1_COMP *cpi, + MACROBLOCK *const x, + const MB_MODE_INFO *mbmi) { const FeatureFlags *const features = &cpi->common.features; const MACROBLOCKD *xd = &x->e_mbd; @@ -1470,9 +1470,8 @@ static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc, #if COLLECT_NONRD_PICK_MODE_STAT -static AOM_INLINE void print_stage_time(const char *stage_name, - int64_t stage_time, - int64_t total_time) { +static inline void print_stage_time(const char *stage_name, int64_t stage_time, + int64_t total_time) { printf(" %s: %ld (%f%%)\n", stage_name, stage_time, 100 * stage_time / (float)total_time); } @@ -1714,7 +1713,7 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, #endif // CONFIG_INTERNAL_STATS } -static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) { +static inline int is_same_gf_and_last_scale(AV1_COMMON *cm) { struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME); struct scale_factors *const sf_golden = get_ref_scale_factors(cm, GOLDEN_FRAME); @@ -1722,12 +1721,12 @@ static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) { (sf_last->y_scale_fp == sf_golden->y_scale_fp)); } -static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x, - MB_MODE_INFO *mi, int mi_row, - int mi_col, BLOCK_SIZE bsize, - int gf_temporal_ref, - int use_ref_frame[], - int *force_skip_low_temp_var) { +static inline void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x, + MB_MODE_INFO *mi, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int gf_temporal_ref, + int use_ref_frame[], + int *force_skip_low_temp_var) { AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); @@ -1846,9 +1845,11 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x, assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame); } -static AOM_INLINE int is_filter_search_enabled_blk( - AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize, - int segment_id, int cb_pred_filter_search, InterpFilter *filt_select) { +static inline int is_filter_search_enabled_blk(AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col, + BLOCK_SIZE bsize, int segment_id, + int cb_pred_filter_search, 
+ InterpFilter *filt_select) { const AV1_COMMON *const cm = &cpi->common; // filt search disabled if (!cpi->sf.rt_sf.use_nonrd_filter_search) return 0; @@ -1886,11 +1887,13 @@ static AOM_INLINE int is_filter_search_enabled_blk( return enable_interp_search; } -static AOM_INLINE int skip_mode_by_threshold( - PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv, - int frames_since_golden, const int *const rd_threshes, - const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip, - int extra_shift) { +static inline int skip_mode_by_threshold(PREDICTION_MODE mode, + MV_REFERENCE_FRAME ref_frame, + int_mv mv, int frames_since_golden, + const int *const rd_threshes, + const int *const rd_thresh_freq_fact, + int64_t best_cost, int best_skip, + int extra_shift) { int skip_this_mode = 0; const THR_MODES mode_index = mode_idx[ref_frame][INTER_OFFSET(mode)]; int64_t mode_rd_thresh = @@ -1912,7 +1915,7 @@ static AOM_INLINE int skip_mode_by_threshold( return skip_this_mode; } -static AOM_INLINE int skip_mode_by_low_temp( +static inline int skip_mode_by_low_temp( PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) { // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var @@ -1929,7 +1932,7 @@ static AOM_INLINE int skip_mode_by_low_temp( return 0; } -static AOM_INLINE int skip_mode_by_bsize_and_ref_frame( +static inline int skip_mode_by_bsize_and_ref_frame( PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, int extra_prune, unsigned int sse_zeromv_norm, int more_prune, int skip_nearmv) { @@ -2147,7 +2150,7 @@ static AOM_FORCE_INLINE void fill_single_inter_mode_costs( } } -static AOM_INLINE bool is_globalmv_better( +static inline bool is_globalmv_better( PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, int rate_mv, const ModeCosts *mode_costs, const int (*single_inter_mode_costs)[REF_FRAMES], @@ -2166,7 +2169,7 @@ static AOM_INLINE bool is_globalmv_better( // Set up the mv/ref_frames etc based on the comp_index. Returns 1 if it // succeeds, 0 if it fails. 
-static AOM_INLINE int setup_compound_params_from_comp_idx( +static inline int setup_compound_params_from_comp_idx( const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[8][MAX_MB_PLANE], PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], @@ -2223,7 +2226,7 @@ static AOM_INLINE int setup_compound_params_from_comp_idx( return 1; } -static AOM_INLINE bool previous_mode_performed_poorly( +static inline bool previous_mode_performed_poorly( PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, const unsigned int (*vars)[REF_FRAMES], const int64_t (*uv_dist)[REF_FRAMES]) { @@ -2244,7 +2247,7 @@ static AOM_INLINE bool previous_mode_performed_poorly( return var_bad; } -static AOM_INLINE bool prune_compoundmode_with_singlemode_var( +static inline bool prune_compoundmode_with_singlemode_var( PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame, MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES], const uint8_t (*mode_checked)[REF_FRAMES], @@ -3114,12 +3117,11 @@ static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd( } } -static AOM_INLINE bool enable_palette(AV1_COMP *cpi, bool is_mode_intra, - BLOCK_SIZE bsize, - unsigned int source_variance, - int force_zeromv_skip, - int skip_idtx_palette, - int force_palette_test) { +static inline bool enable_palette(AV1_COMP *cpi, bool is_mode_intra, + BLOCK_SIZE bsize, + unsigned int source_variance, + int force_zeromv_skip, int skip_idtx_palette, + int force_palette_test) { if (!cpi->oxcf.tool_cfg.enable_palette) return false; if (!av1_allow_palette(cpi->common.features.allow_screen_content_tools, bsize)) { diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c index 1bc706717d..a3e3fbf860 100644 --- a/av1/encoder/palette.c +++ b/av1/encoder/palette.c @@ -164,9 +164,9 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, // Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x // new_height'. Extra rows and columns are filled in by copying last valid // row/column. -static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map, - int orig_width, int orig_height, - int new_width, int new_height) { +static inline void extend_palette_color_map(uint8_t *const color_map, + int orig_width, int orig_height, + int new_width, int new_height) { int j; assert(new_width >= orig_width); assert(new_height >= orig_height); @@ -187,10 +187,9 @@ static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map, // Bias toward using colors in the cache. // TODO(huisu): Try other schemes to improve compression. -static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache, - int n_cache, int n_colors, - int stride, int16_t *centroids, - int bit_depth) { +static inline void optimize_palette_colors(uint16_t *color_cache, int n_cache, + int n_colors, int stride, + int16_t *centroids, int bit_depth) { if (n_cache <= 0) return; for (int i = 0; i < n_colors * stride; i += stride) { int min_diff = abs((int)centroids[i] - (int)color_cache[0]); @@ -214,7 +213,7 @@ static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache, * Given the base colors as specified in centroids[], calculate the RD cost * of palette mode. 
*/ -static AOM_INLINE void palette_rd_y( +static inline void palette_rd_y( const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *centroids, int n, uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating, @@ -313,7 +312,7 @@ static AOM_INLINE void palette_rd_y( } } -static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) { +static inline int is_iter_over(int curr_idx, int end_idx, int step_size) { assert(step_size != 0); return (step_size > 0) ? curr_idx >= end_idx : curr_idx <= end_idx; } @@ -322,7 +321,7 @@ static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) { // [start_n, end_n) with step size step_size. If step_size < 0, then end_n can // be less than start_n. Saves the last numbers searched in last_n_searched and // returns the best number of colors found. -static AOM_INLINE int perform_top_color_palette_search( +static inline int perform_top_color_palette_search( const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *top_colors, int start_n, int end_n, int step_size, @@ -369,7 +368,7 @@ static AOM_INLINE int perform_top_color_palette_search( // [start_n, end_n) with step size step_size. If step_size < 0, then end_n can // be less than start_n. Saves the last numbers searched in last_n_searched and // returns the best number of colors found. -static AOM_INLINE int perform_k_means_palette_search( +static inline int perform_k_means_palette_search( const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int lower_bound, int upper_bound, int start_n, int end_n, int step_size, @@ -419,8 +418,8 @@ static AOM_INLINE int perform_k_means_palette_search( } // Sets the parameters to search the current number of colors +- 1 -static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size, - int winner, int end_n) { +static inline void set_stage2_params(int *min_n, int *max_n, int *step_size, + int winner, int end_n) { // Set min to winner - 1 unless we are already at the border, then we set it // to winner + 1 *min_n = (winner == PALETTE_MIN_SIZE) ? (PALETTE_MIN_SIZE + 1) @@ -435,12 +434,12 @@ static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size, *step_size = AOMMAX(1, *max_n - *min_n); } -static AOM_INLINE void fill_data_and_get_bounds(const uint8_t *src, - const int src_stride, - const int rows, const int cols, - const int is_high_bitdepth, - int16_t *data, int *lower_bound, - int *upper_bound) { +static inline void fill_data_and_get_bounds(const uint8_t *src, + const int src_stride, + const int rows, const int cols, + const int is_high_bitdepth, + int16_t *data, int *lower_bound, + int *upper_bound) { if (is_high_bitdepth) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); *lower_bound = *upper_bound = src_ptr[0]; diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c index 5033e5da4d..49294cbfdb 100644 --- a/av1/encoder/partition_search.c +++ b/av1/encoder/partition_search.c @@ -735,11 +735,9 @@ void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, * corresponding to the best mode found. 
*/ -static AOM_INLINE void hybrid_intra_mode_search(AV1_COMP *cpi, - MACROBLOCK *const x, - RD_STATS *rd_cost, - BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx) { +static inline void hybrid_intra_mode_search(AV1_COMP *cpi, MACROBLOCK *const x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { int use_rdopt = 0; const int hybrid_intra_pickmode = cpi->sf.rt_sf.hybrid_intra_pickmode; // Use rd pick for intra mode search based on block size and variance. @@ -764,10 +762,12 @@ static AOM_INLINE void hybrid_intra_mode_search(AV1_COMP *cpi, // (ref mv list population) require the encoding of the top-right superblock to // be complete. So, here, we delay the waiting of threads until the need for the // data from the top-right superblock region. -static AOM_INLINE void wait_for_top_right_sb( - AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync, - TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2, - BLOCK_SIZE bsize, int mi_row, int mi_col) { +static inline void wait_for_top_right_sb(AV1EncRowMultiThreadInfo *enc_row_mt, + AV1EncRowMultiThreadSync *row_mt_sync, + TileInfo *tile_info, + BLOCK_SIZE sb_size, + int sb_mi_size_log2, BLOCK_SIZE bsize, + int mi_row, int mi_col) { const int sb_size_in_mi = mi_size_wide[sb_size]; const int bw_in_mi = mi_size_wide[bsize]; const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1); @@ -1694,7 +1694,7 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); } -static AOM_INLINE int is_adjust_var_based_part_enabled( +static inline int is_adjust_var_based_part_enabled( AV1_COMMON *const cm, const PARTITION_SPEED_FEATURES *const part_sf, BLOCK_SIZE bsize) { if (part_sf->partition_search_type != VAR_BASED_PARTITION) return 0; @@ -3421,7 +3421,7 @@ static void set_partition_cost_for_edge_blk( // Reset the partition search state flags when // must_find_valid_partition is equal to 1. -static AOM_INLINE void reset_part_limitations( +static inline void reset_part_limitations( AV1_COMP *const cpi, PartitionSearchState *part_search_state) { PartitionBlkParams blk_params = part_search_state->part_blk_params; const int is_rect_part_allowed = @@ -3483,7 +3483,7 @@ static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data, typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step); // Checks if HORZ / VERT partition search is allowed. -static AOM_INLINE int is_rect_part_allowed( +static inline int is_rect_part_allowed( const AV1_COMP *cpi, const PartitionSearchState *part_search_state, const active_edge_info *active_edge, RECT_PART_TYPE rect_part, const int mi_pos) { @@ -3672,7 +3672,7 @@ static void rd_pick_ab_part( } // Set mode search context. 
-static AOM_INLINE void set_mode_search_ctx( +static inline void set_mode_search_ctx( PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2], PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]) { mode_srch_ctx[HORZ_B][0] = &pc_tree->horizontal[0]; @@ -3688,7 +3688,7 @@ static AOM_INLINE void set_mode_search_ctx( mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none; } -static AOM_INLINE void copy_partition_mode_from_mode_context( +static inline void copy_partition_mode_from_mode_context( const MB_MODE_INFO **dst_mode, const PICK_MODE_CONTEXT *ctx) { if (ctx && ctx->rd_stats.rate < INT_MAX) { *dst_mode = &ctx->mic; @@ -3697,7 +3697,7 @@ static AOM_INLINE void copy_partition_mode_from_mode_context( } } -static AOM_INLINE void copy_partition_mode_from_pc_tree( +static inline void copy_partition_mode_from_pc_tree( const MB_MODE_INFO **dst_mode, const PC_TREE *pc_tree) { if (pc_tree) { copy_partition_mode_from_mode_context(dst_mode, pc_tree->none); @@ -3706,7 +3706,7 @@ static AOM_INLINE void copy_partition_mode_from_pc_tree( } } -static AOM_INLINE void set_mode_cache_for_partition_ab( +static inline void set_mode_cache_for_partition_ab( const MB_MODE_INFO **mode_cache, const PC_TREE *pc_tree, AB_PART_TYPE ab_part_type) { switch (ab_part_type) { @@ -4249,8 +4249,8 @@ static void prune_partitions_after_split( // Returns true if either of the left and top neighbor blocks is larger than // the current block; false otherwise. -static AOM_INLINE bool is_neighbor_blk_larger_than_cur_blk( - const MACROBLOCKD *xd, BLOCK_SIZE bsize) { +static inline bool is_neighbor_blk_larger_than_cur_blk(const MACROBLOCKD *xd, + BLOCK_SIZE bsize) { const int cur_blk_area = (block_size_high[bsize] * block_size_wide[bsize]); if (xd->left_available) { const BLOCK_SIZE left_bsize = xd->left_mbmi->bsize; @@ -4268,7 +4268,7 @@ static AOM_INLINE bool is_neighbor_blk_larger_than_cur_blk( return false; } -static AOM_INLINE void prune_rect_part_using_none_pred_mode( +static inline void prune_rect_part_using_none_pred_mode( const MACROBLOCKD *xd, PartitionSearchState *part_state, PREDICTION_MODE mode, BLOCK_SIZE bsize) { if (mode == DC_PRED || mode == SMOOTH_PRED) { @@ -5373,7 +5373,7 @@ bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, } #endif // CONFIG_PARTITION_SEARCH_ORDER -static AOM_INLINE bool should_do_dry_run_encode_for_current_block( +static inline bool should_do_dry_run_encode_for_current_block( BLOCK_SIZE sb_size, BLOCK_SIZE max_partition_size, int curr_block_index, BLOCK_SIZE bsize) { if (bsize > max_partition_size) return false; @@ -5425,8 +5425,8 @@ static void log_sub_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, *var_max = log1p(max_var_4x4 / 16.0); } -static AOM_INLINE void set_sms_tree_partitioning( - SIMPLE_MOTION_DATA_TREE *sms_tree, PARTITION_TYPE partition) { +static inline void set_sms_tree_partitioning(SIMPLE_MOTION_DATA_TREE *sms_tree, + PARTITION_TYPE partition) { if (sms_tree == NULL) return; sms_tree->partitioning = partition; } diff --git a/av1/encoder/partition_search.h b/av1/encoder/partition_search.h index 5d09fd41c3..02e57fda5a 100644 --- a/av1/encoder/partition_search.h +++ b/av1/encoder/partition_search.h @@ -58,16 +58,16 @@ bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, SB_MULTI_PASS_MODE multi_pass_mode, RD_RECT_PART_WIN_INFO *rect_part_win_info); -static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset, - const uint16_t cb_offset_y, - const uint16_t cb_offset_uv) { +static inline void set_cb_offsets(uint16_t *cb_offset, + const uint16_t 
cb_offset_y, + const uint16_t cb_offset_uv) { cb_offset[PLANE_TYPE_Y] = cb_offset_y; cb_offset[PLANE_TYPE_UV] = cb_offset_uv; } -static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize, - const int subsampling_x, - const int subsampling_y) { +static inline void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize, + const int subsampling_x, + const int subsampling_y) { x->cb_offset[PLANE_TYPE_Y] += block_size_wide[bsize] * block_size_high[bsize]; if (x->e_mbd.is_chroma_ref) { const BLOCK_SIZE plane_bsize = diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c index 1474270b65..d7561135a2 100644 --- a/av1/encoder/partition_strategy.c +++ b/av1/encoder/partition_strategy.c @@ -31,7 +31,7 @@ #include "av1/encoder/rdopt.h" #if !CONFIG_REALTIME_ONLY -static AOM_INLINE void simple_motion_search_prune_part_features( +static inline void simple_motion_search_prune_part_features( AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get); @@ -521,7 +521,7 @@ static int simple_motion_search_get_best_ref( // - whether a left marcoblock exists // - width of left macroblock // - height of left macroblock -static AOM_INLINE void simple_motion_search_prune_part_features( +static inline void simple_motion_search_prune_part_features( AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get) { @@ -964,8 +964,8 @@ BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, // Get the minimum partition block width and height(in log scale) under a // SIMPLE_MOTION_DATA_TREE. -static AOM_INLINE void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree, - int *min_bw, int *min_bh) { +static inline void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree, + int *min_bw, int *min_bh) { if (!sms_tree) return; const BLOCK_SIZE bsize = sms_tree->block_size; @@ -1723,7 +1723,7 @@ void av1_prune_partitions_before_search(AV1_COMP *const cpi, } #ifndef NDEBUG -static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) { +static inline int is_bsize_square(BLOCK_SIZE bsize) { return block_size_wide[bsize] == block_size_high[bsize]; } #endif // NDEBUG diff --git a/av1/encoder/partition_strategy.h b/av1/encoder/partition_strategy.h index 288c1c3735..2597dedfa0 100644 --- a/av1/encoder/partition_strategy.h +++ b/av1/encoder/partition_strategy.h @@ -233,11 +233,11 @@ static BLOCK_SIZE dim_to_size(int dim) { } } -static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc, - AV1_COMP *cpi, MACROBLOCK *x, - const SPEED_FEATURES *sf, - BLOCK_SIZE sb_size, - int mi_row, int mi_col) { +static inline void set_max_min_partition_size(SuperBlockEnc *sb_enc, + AV1_COMP *cpi, MACROBLOCK *x, + const SPEED_FEATURES *sf, + BLOCK_SIZE sb_size, int mi_row, + int mi_col) { const AV1_COMMON *cm = &cpi->common; sb_enc->max_partition_size = diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c index 3b726243df..c8d4e992b2 100644 --- a/av1/encoder/pickcdef.c +++ b/av1/encoder/pickcdef.c @@ -680,11 +680,11 @@ void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) { // pick_method: Search method used to select CDEF parameters // Returns: // Nothing will be returned. Contents of cdef_search_ctx will be modified. 
-static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame, - const YV12_BUFFER_CONFIG *ref, - AV1_COMMON *cm, MACROBLOCKD *xd, - CdefSearchCtx *cdef_search_ctx, - CDEF_PICK_METHOD pick_method) { +static inline void cdef_params_init(const YV12_BUFFER_CONFIG *frame, + const YV12_BUFFER_CONFIG *ref, + AV1_COMMON *cm, MACROBLOCKD *xd, + CdefSearchCtx *cdef_search_ctx, + CDEF_PICK_METHOD pick_method) { const CommonModeInfoParams *const mi_params = &cm->mi_params; const int num_planes = av1_num_planes(cm); cdef_search_ctx->mi_params = &cm->mi_params; diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c index 63756340dc..0c75d96aef 100644 --- a/av1/encoder/pickrst.c +++ b/av1/encoder/pickrst.c @@ -156,7 +156,7 @@ typedef struct { int16_t *src_avg; } RestSearchCtxt; -static AOM_INLINE void rsc_on_tile(void *priv) { +static inline void rsc_on_tile(void *priv) { RestSearchCtxt *rsc = (RestSearchCtxt *)priv; set_default_wiener(&rsc->ref_wiener); set_default_sgrproj(&rsc->ref_sgrproj); @@ -164,16 +164,16 @@ static AOM_INLINE void rsc_on_tile(void *priv) { set_default_sgrproj(&rsc->switchable_ref_sgrproj); } -static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) { +static inline void reset_rsc(RestSearchCtxt *rsc) { memset(rsc->total_sse, 0, sizeof(rsc->total_sse)); memset(rsc->total_bits, 0, sizeof(rsc->total_bits)); } -static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src, - const AV1_COMMON *cm, const MACROBLOCK *x, - const LOOP_FILTER_SPEED_FEATURES *lpf_sf, - int plane, RestUnitSearchInfo *rusi, - YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) { +static inline void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm, + const MACROBLOCK *x, + const LOOP_FILTER_SPEED_FEATURES *lpf_sf, int plane, + RestUnitSearchInfo *rusi, YV12_BUFFER_CONFIG *dst, + RestSearchCtxt *rsc) { rsc->src = src; rsc->dst = dst; rsc->cm = cm; @@ -469,10 +469,12 @@ static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) { return (dividend + divisor / 2) / divisor; } -static AOM_INLINE void calc_proj_params_r0_r1_c( - const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, - int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { +static inline void calc_proj_params_r0_r1_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; @@ -499,7 +501,7 @@ static AOM_INLINE void calc_proj_params_r0_r1_c( } #if CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c( +static inline void calc_proj_params_r0_r1_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { @@ -529,12 +531,11 @@ static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c( } #endif // CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width, - int height, int src_stride, - const uint8_t *dat8, - int dat_stride, int32_t *flt0, - int flt0_stride, int64_t H[2][2], - int64_t C[2]) { +static inline void calc_proj_params_r0_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { 
const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; @@ -553,7 +554,7 @@ static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width, } #if CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void calc_proj_params_r0_high_bd_c( +static inline void calc_proj_params_r0_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { @@ -575,12 +576,11 @@ static AOM_INLINE void calc_proj_params_r0_high_bd_c( } #endif // CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width, - int height, int src_stride, - const uint8_t *dat8, - int dat_stride, int32_t *flt1, - int flt1_stride, int64_t H[2][2], - int64_t C[2]) { +static inline void calc_proj_params_r1_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; @@ -599,7 +599,7 @@ static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width, } #if CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void calc_proj_params_r1_high_bd_c( +static inline void calc_proj_params_r1_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { @@ -667,13 +667,12 @@ void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, } #endif // CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width, - int height, int src_stride, - const uint8_t *dat8, int dat_stride, - int use_highbitdepth, int32_t *flt0, - int flt0_stride, int32_t *flt1, - int flt1_stride, int *xq, - const sgr_params_type *params) { +static inline void get_proj_subspace(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int use_highbitdepth, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int *xq, + const sgr_params_type *params) { int64_t H[2][2] = { { 0, 0 }, { 0, 0 } }; int64_t C[2] = { 0, 0 }; @@ -740,8 +739,7 @@ static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width, } } -static AOM_INLINE void encode_xq(int *xq, int *xqd, - const sgr_params_type *params) { +static inline void encode_xq(int *xq, int *xqd, const sgr_params_type *params) { if (params->r[0] == 0) { xqd[0] = 0; xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1, @@ -758,12 +756,11 @@ static AOM_INLINE void encode_xq(int *xq, int *xqd, } // Apply the self-guided filter across an entire restoration unit. 
-static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8, - int width, int height, int dat_stride, - int use_highbd, int bit_depth, int pu_width, - int pu_height, int32_t *flt0, int32_t *flt1, - int flt_stride, - struct aom_internal_error_info *error_info) { +static inline void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width, + int height, int dat_stride, int use_highbd, + int bit_depth, int pu_width, int pu_height, + int32_t *flt0, int32_t *flt1, int flt_stride, + struct aom_internal_error_info *error_info) { for (int i = 0; i < height; i += pu_height) { const int h = AOMMIN(pu_height, height - i); int32_t *flt0_row = flt0 + i * flt_stride; @@ -784,7 +781,7 @@ static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8, } } -static AOM_INLINE void compute_sgrproj_err( +static inline void compute_sgrproj_err( const uint8_t *dat8, const int width, const int height, const int dat_stride, const uint8_t *src8, const int src_stride, const int use_highbitdepth, const int bit_depth, const int pu_width, @@ -804,9 +801,9 @@ static AOM_INLINE void compute_sgrproj_err( flt_stride, flt1, flt_stride, 2, exqd, params); } -static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err, - const int *exqd, int *bestxqd, - int *bestep, const int ep) { +static inline void get_best_error(int64_t *besterr, const int64_t err, + const int *exqd, int *bestxqd, int *bestep, + const int ep) { if (*besterr == -1 || err < *besterr) { *bestep = ep; *besterr = err; @@ -894,10 +891,10 @@ static int count_sgrproj_bits(SgrprojInfo *sgrproj_info, return bits; } -static AOM_INLINE void search_sgrproj( - const RestorationTileLimits *limits, int rest_unit_idx, void *priv, - int32_t *tmpbuf, RestorationLineBuffers *rlbs, - struct aom_internal_error_info *error_info) { +static inline void search_sgrproj(const RestorationTileLimits *limits, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info) { (void)rlbs; RestSearchCtxt *rsc = (RestSearchCtxt *)priv; RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; @@ -1209,9 +1206,8 @@ static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b, } // Fix vector b, update vector a -static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc, - int64_t **Hc, int32_t *a, - const int32_t *b) { +static inline void update_a_sep_sym(int wiener_win, int64_t **Mc, int64_t **Hc, + int32_t *a, const int32_t *b) { int i, j; int64_t S[WIENER_WIN]; int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; @@ -1280,9 +1276,8 @@ static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc, } // Fix vector a, update vector b -static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc, - int64_t **Hc, const int32_t *a, - int32_t *b) { +static inline void update_b_sep_sym(int wiener_win, int64_t **Mc, int64_t **Hc, + const int32_t *a, int32_t *b) { int i, j; int64_t S[WIENER_WIN]; int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; @@ -1424,8 +1419,8 @@ static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H, return Score - iScore; } -static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f, - InterpKernel fi) { +static inline void finalize_sym_filter(int wiener_win, int32_t *f, + InterpKernel fi) { int i; const int wiener_halfwin = (wiener_win >> 1); @@ -1602,10 +1597,10 @@ static int64_t finer_search_wiener(const RestSearchCtxt *rsc, return err; } -static AOM_INLINE void search_wiener( - const 
RestorationTileLimits *limits, int rest_unit_idx, void *priv, - int32_t *tmpbuf, RestorationLineBuffers *rlbs, - struct aom_internal_error_info *error_info) { +static inline void search_wiener(const RestorationTileLimits *limits, + int rest_unit_idx, void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info) { (void)tmpbuf; (void)rlbs; (void)error_info; @@ -1748,7 +1743,7 @@ static AOM_INLINE void search_wiener( if (cost_wiener < cost_none) rsc->ref_wiener = rusi->wiener; } -static AOM_INLINE void search_norestore( +static inline void search_norestore( const RestorationTileLimits *limits, int rest_unit_idx, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, struct aom_internal_error_info *error_info) { @@ -1766,7 +1761,7 @@ static AOM_INLINE void search_norestore( rsc->total_sse[RESTORE_NONE] += rsc->sse[RESTORE_NONE]; } -static AOM_INLINE void search_switchable( +static inline void search_switchable( const RestorationTileLimits *limits, int rest_unit_idx, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, struct aom_internal_error_info *error_info) { @@ -1846,9 +1841,9 @@ static AOM_INLINE void search_switchable( rsc->switchable_ref_sgrproj = rusi->sgrproj; } -static AOM_INLINE void copy_unit_info(RestorationType frame_rtype, - const RestUnitSearchInfo *rusi, - RestorationUnitInfo *rui) { +static inline void copy_unit_info(RestorationType frame_rtype, + const RestUnitSearchInfo *rusi, + RestorationUnitInfo *rui) { assert(frame_rtype > 0); rui->restoration_type = rusi->best_rtype[frame_rtype - 1]; if (rui->restoration_type == RESTORE_WIENER) diff --git a/av1/encoder/rc_utils.h b/av1/encoder/rc_utils.h index 35c98f4006..8d807af920 100644 --- a/av1/encoder/rc_utils.h +++ b/av1/encoder/rc_utils.h @@ -19,7 +19,7 @@ extern "C" { #endif -static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) { +static inline void check_reset_rc_flag(AV1_COMP *cpi) { RATE_CONTROL *rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; if (cpi->common.current_frame.frame_number > @@ -38,8 +38,8 @@ static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) { } } -static AOM_INLINE void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf, - AV1_PRIMARY *ppi) { +static inline void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf, + AV1_PRIMARY *ppi) { PRIMARY_RATE_CONTROL *p_rc = &ppi->p_rc; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; @@ -61,8 +61,8 @@ static AOM_INLINE void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf, p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size); } -static AOM_INLINE void config_target_level(AV1_COMP *const cpi, - AV1_LEVEL target_level, int tier) { +static inline void config_target_level(AV1_COMP *const cpi, + AV1_LEVEL target_level, int tier) { AV1EncoderConfig *const oxcf = &cpi->oxcf; SequenceHeader *const seq_params = cpi->common.seq_params; TileConfig *const tile_cfg = &oxcf->tile_cfg; @@ -125,9 +125,8 @@ static AOM_INLINE void config_target_level(AV1_COMP *const cpi, * \retval 1 Recode Required * \retval 0 No Recode required */ -static AOM_INLINE int recode_loop_test(AV1_COMP *cpi, int high_limit, - int low_limit, int q, int maxq, - int minq) { +static inline int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, + int q, int maxq, int minq) { const RATE_CONTROL *const rc = &cpi->rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi); @@ -153,9 +152,9 @@ static AOM_INLINE int 
recode_loop_test(AV1_COMP *cpi, int high_limit, return force_recode; } -static AOM_INLINE double av1_get_gfu_boost_projection_factor(double min_factor, - double max_factor, - int frame_count) { +static inline double av1_get_gfu_boost_projection_factor(double min_factor, + double max_factor, + int frame_count) { double factor = sqrt((double)frame_count); factor = AOMMIN(factor, max_factor); factor = AOMMAX(factor, min_factor); @@ -163,16 +162,16 @@ static AOM_INLINE double av1_get_gfu_boost_projection_factor(double min_factor, return factor; } -static AOM_INLINE int get_gfu_boost_from_r0_lap(double min_factor, - double max_factor, double r0, - int frames_to_key) { +static inline int get_gfu_boost_from_r0_lap(double min_factor, + double max_factor, double r0, + int frames_to_key) { double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor, frames_to_key); const int boost = (int)rint(factor / r0); return boost; } -static AOM_INLINE double av1_get_kf_boost_projection_factor(int frame_count) { +static inline double av1_get_kf_boost_projection_factor(int frame_count) { double factor = sqrt((double)frame_count); factor = AOMMIN(factor, 10.0); factor = AOMMAX(factor, 4.0); @@ -180,10 +179,10 @@ static AOM_INLINE double av1_get_kf_boost_projection_factor(int frame_count) { return factor; } -static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi, - int is_encode_stage, int q_low, - int q_high, int top_index, - int bottom_index) { +static inline int get_regulated_q_overshoot(AV1_COMP *const cpi, + int is_encode_stage, int q_low, + int q_high, int top_index, + int bottom_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; @@ -206,10 +205,9 @@ static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi, return q_regulated; } -static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi, - int is_encode_stage, - int q_high, int top_index, - int bottom_index) { +static inline int get_regulated_q_undershoot(AV1_COMP *const cpi, + int is_encode_stage, int q_high, + int top_index, int bottom_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; @@ -251,7 +249,7 @@ static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi, * \param[in] loop_count Loop itterations so far. * */ -static AOM_INLINE void recode_loop_update_q( +static inline void recode_loop_update_q( AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low, int *const q_high, const int top_index, const int bottom_index, int *const undershoot_seen, int *const overshoot_seen, diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c index e209693bb7..403d12ac4c 100644 --- a/av1/encoder/rd.c +++ b/av1/encoder/rd.c @@ -728,7 +728,7 @@ static inline int is_frame_level_cost_upd_freq_set( // Decide whether we want to update the mode entropy cost for the current frame. // The logit is currently inherited from selective_disable_cdf_rtc. 
-static AOM_INLINE int should_force_mode_cost_update(const AV1_COMP *cpi) { +static inline int should_force_mode_cost_update(const AV1_COMP *cpi) { const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; if (!rt_sf->frame_level_mode_cost_update) { return false; diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c index 90a13602b9..e315aeecde 100644 --- a/av1/encoder/rdopt.c +++ b/av1/encoder/rdopt.c @@ -443,9 +443,9 @@ void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { } } -static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data, - BLOCK_SIZE bsize, int64_t sse, - int64_t dist, int residue_cost) { +static inline void inter_mode_data_push(TileDataEnc *tile_data, + BLOCK_SIZE bsize, int64_t sse, + int64_t dist, int residue_cost) { if (residue_cost == 0 || sse == dist) return; const int block_idx = inter_mode_data_block_idx(bsize); if (block_idx == -1) return; @@ -461,12 +461,11 @@ static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data, } } -static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info, - int mode_rate, int64_t sse, - int64_t rd, RD_STATS *rd_cost, - RD_STATS *rd_cost_y, - RD_STATS *rd_cost_uv, - const MB_MODE_INFO *mbmi) { +static inline void inter_modes_info_push(InterModesInfo *inter_modes_info, + int mode_rate, int64_t sse, int64_t rd, + RD_STATS *rd_cost, RD_STATS *rd_cost_y, + RD_STATS *rd_cost_uv, + const MB_MODE_INFO *mbmi) { const int num = inter_modes_info->num; assert(num < MAX_INTER_MODES); inter_modes_info->mbmi_arr[num] = *mbmi; @@ -496,8 +495,8 @@ static int compare_rd_idx_pair(const void *a, const void *b) { } } -static AOM_INLINE void inter_modes_info_sort( - const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) { +static inline void inter_modes_info_sort(const InterModesInfo *inter_modes_info, + RdIdxPair *rd_idx_pair_arr) { if (inter_modes_info->num == 0) { return; } @@ -736,7 +735,7 @@ static inline PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, : compound_ref0_mode(this_mode); } -static AOM_INLINE void estimate_ref_frame_costs( +static inline void estimate_ref_frame_costs( const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs, int segment_id, unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES]) { @@ -898,7 +897,7 @@ static AOM_INLINE void estimate_ref_frame_costs( } } -static AOM_INLINE void store_coding_context( +static inline void store_coding_context( #if CONFIG_INTERNAL_STATS MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, #else @@ -919,7 +918,7 @@ static AOM_INLINE void store_coding_context( av1_ref_frame_type(xd->mi[0]->ref_frame)); } -static AOM_INLINE void setup_buffer_ref_mvs_inter( +static inline void setup_buffer_ref_mvs_inter( const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { const AV1_COMMON *cm = &cpi->common; @@ -2160,8 +2159,8 @@ typedef struct motion_mode_best_st_candidate { // Checks if the current reference frame matches with neighbouring block's // (top/left) reference frames -static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi, - MB_MODE_INFO *nb_mbmi) { +static inline int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi, + MB_MODE_INFO *nb_mbmi) { MV_REFERENCE_FRAME nb_ref_frames[2] = { nb_mbmi->ref_frame[0], nb_mbmi->ref_frame[1] }; MV_REFERENCE_FRAME cur_ref_frames[2] = { cur_mbmi->ref_frame[0], @@ -2177,8 +2176,8 @@ static AOM_INLINE int 
ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi, return match_found; } -static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols, - MACROBLOCKD *xd) { +static inline int find_ref_match_in_above_nbs(const int total_mi_cols, + MACROBLOCKD *xd) { if (!xd->up_available) return 1; const int mi_col = xd->mi_col; MB_MODE_INFO **cur_mbmi = xd->mi; @@ -2199,8 +2198,8 @@ static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols, return 0; } -static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows, - MACROBLOCKD *xd) { +static inline int find_ref_match_in_left_nbs(const int total_mi_rows, + MACROBLOCKD *xd) { if (!xd->left_available) return 1; const int mi_row = xd->mi_row; MB_MODE_INFO **cur_mbmi = xd->mi; @@ -2238,7 +2237,7 @@ typedef struct { #if !CONFIG_REALTIME_ONLY // TODO(Remya): Check if get_tpl_stats_b() can be reused -static AOM_INLINE void get_block_level_tpl_stats( +static inline void get_block_level_tpl_stats( AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs, PruneInfoFromTpl *inter_cost_info_from_tpl) { AV1_COMMON *const cm = &cpi->common; @@ -2293,7 +2292,7 @@ static AOM_INLINE void get_block_level_tpl_stats( } #endif -static AOM_INLINE int prune_modes_based_on_tpl_stats( +static inline int prune_modes_based_on_tpl_stats( PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx, const PREDICTION_MODE this_mode, int prune_mode_level) { const int have_newmv = have_newmv_in_inter_mode(this_mode); @@ -2500,9 +2499,10 @@ static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx, * prune_zero_mv_with_sse value * \return Returns 1 if zero_mv is pruned, 0 otherwise. */ -static AOM_INLINE int prune_zero_mv_with_sse( - const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize, - const HandleInterModeArgs *args, int prune_zero_mv_with_sse) { +static inline int prune_zero_mv_with_sse(const aom_variance_fn_ptr_t *fn_ptr, + const MACROBLOCK *x, BLOCK_SIZE bsize, + const HandleInterModeArgs *args, + int prune_zero_mv_with_sse) { const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; @@ -2576,9 +2576,9 @@ static AOM_INLINE int prune_zero_mv_with_sse( * \param[in] bsize The current block_size. * \return Returns true if a predictor is built in xd->dst, false otherwise. 
*/ -static AOM_INLINE bool fast_interp_search(const AV1_COMP *cpi, MACROBLOCK *x, - int mi_row, int mi_col, - BLOCK_SIZE bsize) { +static inline bool fast_interp_search(const AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { static const InterpFilters filters_ref_set[3] = { { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }, { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH }, @@ -3378,12 +3378,12 @@ void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x, av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); } -static AOM_INLINE void calc_target_weighted_pred( +static inline void calc_target_weighted_pred( const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd, const uint8_t *above, int above_stride, const uint8_t *left, int left_stride); -static AOM_INLINE void rd_pick_skip_mode( +static inline void rd_pick_skip_mode( RD_STATS *rd_cost, InterModeSearchState *search_state, const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { @@ -3513,7 +3513,7 @@ static AOM_INLINE void rd_pick_skip_mode( } // Get winner mode stats of given mode index -static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats( +static inline MB_MODE_INFO *get_winner_mode_stats( MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost, int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index, RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv, @@ -3544,7 +3544,7 @@ static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats( // When this speed feature is on, in rd mode search, only DCT is used. // After the mode is determined, this function is called, to select // transform types and get accurate rdcost. -static AOM_INLINE void refine_winner_mode_tx( +static inline void refine_winner_mode_tx( const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index, MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], @@ -3698,7 +3698,7 @@ typedef struct { /*!\endcond */ // Update 'ref_combo' mask to disable given 'ref' in single and compound modes. -static AOM_INLINE void disable_reference( +static inline void disable_reference( MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { ref_combo[ref][ref2 + 1] = true; @@ -3706,7 +3706,7 @@ static AOM_INLINE void disable_reference( } // Update 'ref_combo' mask to disable all inter references except ALTREF. -static AOM_INLINE void disable_inter_references_except_altref( +static inline void disable_inter_references_except_altref( bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { disable_reference(LAST_FRAME, ref_combo); disable_reference(LAST2_FRAME, ref_combo); @@ -3729,8 +3729,7 @@ static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = { typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET; -static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask, - REF_SET ref_set) { +static inline void default_skip_mask(mode_skip_mask_t *mask, REF_SET ref_set) { if (ref_set == REF_SET_FULL) { // Everything available by default. 
memset(mask, 0, sizeof(*mask)); @@ -3768,9 +3767,9 @@ static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask, } } -static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask, - const AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize) { +static inline void init_mode_skip_mask(mode_skip_mask_t *mask, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; MACROBLOCKD *const xd = &x->e_mbd; @@ -3940,9 +3939,9 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask, } } -static AOM_INLINE void init_neighbor_pred_buf( - const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args, - int is_hbd) { +static inline void init_neighbor_pred_buf(const OBMCBuffer *const obmc_buffer, + HandleInterModeArgs *const args, + int is_hbd) { if (is_hbd) { const int len = sizeof(uint16_t); args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred); @@ -3965,8 +3964,8 @@ static AOM_INLINE void init_neighbor_pred_buf( } } -static AOM_INLINE int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x, - MV_REFERENCE_FRAME ref_frame) { +static inline int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x, + MV_REFERENCE_FRAME ref_frame) { const AV1_COMMON *const cm = &cpi->common; MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_frame); @@ -3981,8 +3980,8 @@ static AOM_INLINE int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x, return 0; } -static AOM_INLINE int is_ref_frame_used_by_compound_ref( - int ref_frame, int skip_ref_frame_mask) { +static inline int is_ref_frame_used_by_compound_ref(int ref_frame, + int skip_ref_frame_mask) { for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { if (!(skip_ref_frame_mask & (1 << r))) { const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; @@ -3994,8 +3993,8 @@ static AOM_INLINE int is_ref_frame_used_by_compound_ref( return 0; } -static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame, - const MB_MODE_INFO *mi_cache) { +static inline int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame, + const MB_MODE_INFO *mi_cache) { if (!mi_cache) { return 0; } @@ -4012,7 +4011,7 @@ static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame, // Please add/modify parameter setting in this function, making it consistent // and easy to read and maintain. 
-static AOM_INLINE void set_params_rd_pick_inter_mode( +static inline void set_params_rd_pick_inter_mode( const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask, unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES], @@ -4146,7 +4145,7 @@ static AOM_INLINE void set_params_rd_pick_inter_mode( } } -static AOM_INLINE void init_single_inter_mode_search_state( +static inline void init_single_inter_mode_search_state( InterModeSearchState *search_state) { for (int dir = 0; dir < 2; ++dir) { for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { @@ -4174,7 +4173,7 @@ static AOM_INLINE void init_single_inter_mode_search_state( av1_zero(search_state->single_state_modelled_cnt); } -static AOM_INLINE void init_inter_mode_search_state( +static inline void init_inter_mode_search_state( InterModeSearchState *search_state, const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) { init_intra_mode_search_state(&search_state->intra_search_state); @@ -4495,9 +4494,9 @@ static inline void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode, set_default_interp_filters(mbmi, cm->features.interp_filter); } -static AOM_INLINE void collect_single_states(MACROBLOCK *x, - InterModeSearchState *search_state, - const MB_MODE_INFO *const mbmi) { +static inline void collect_single_states(MACROBLOCK *x, + InterModeSearchState *search_state, + const MB_MODE_INFO *const mbmi) { int i, j; const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; const PREDICTION_MODE this_mode = mbmi->mode; @@ -4541,8 +4540,8 @@ static AOM_INLINE void collect_single_states(MACROBLOCK *x, search_state->single_state_modelled_cnt[dir][mode_offset]++; } -static AOM_INLINE void analyze_single_states( - const AV1_COMP *cpi, InterModeSearchState *search_state) { +static inline void analyze_single_states(const AV1_COMP *cpi, + InterModeSearchState *search_state) { const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result; assert(prune_level >= 1); int i, j, dir, mode; @@ -4865,7 +4864,7 @@ static inline void update_search_state( // Find the best RD for a reference frame (among single reference modes) // and store +10% of it in the 0-th element in ref_frame_rd. -static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) { +static inline void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) { assert(ref_frame_rd[0] == INT64_MAX); int64_t ref_copy[REF_FRAMES - 1]; memcpy(ref_copy, ref_frame_rd + 1, @@ -4890,7 +4889,7 @@ static inline bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES], ref_frame_rd[frame2] <= ref_frame_rd[0]; } -static AOM_INLINE void evaluate_motion_mode_for_winner_candidates( +static inline void evaluate_motion_mode_for_winner_candidates( const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost, HandleInterModeArgs *const args, TileDataEnc *const tile_data, PICK_MODE_CONTEXT *const ctx, @@ -5413,7 +5412,7 @@ static void handle_winner_cand( * correspondingly. While x is also modified, it is only used as a temporary * buffer, and the final decisions are stored in search_state. 
*/ -static AOM_INLINE void search_intra_modes_in_interframe( +static inline void search_intra_modes_in_interframe( InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, const InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost, @@ -5597,9 +5596,11 @@ static AOM_INLINE void search_intra_modes_in_interframe( #if !CONFIG_REALTIME_ONLY // Prepare inter_cost and intra_cost from TPL stats, which are used as ML // features in intra mode pruning. -static AOM_INLINE void calculate_cost_from_tpl_data( - const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col, int64_t *inter_cost, int64_t *intra_cost) { +static inline void calculate_cost_from_tpl_data(const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, + int64_t *inter_cost, + int64_t *intra_cost) { const AV1_COMMON *const cm = &cpi->common; // Only consider full SB. const BLOCK_SIZE sb_size = cm->seq_params->sb_size; @@ -5634,7 +5635,7 @@ static AOM_INLINE void calculate_cost_from_tpl_data( // When the speed feature skip_intra_in_interframe > 0, enable ML model to prune // intra mode search. -static AOM_INLINE void skip_intra_modes_in_interframe( +static inline void skip_intra_modes_in_interframe( AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize, InterModeSearchState *search_state, const SPEED_FEATURES *const sf, int64_t inter_cost, int64_t intra_cost) { @@ -5701,8 +5702,8 @@ static AOM_INLINE void skip_intra_modes_in_interframe( } } -static AOM_INLINE bool skip_interp_filter_search(const AV1_COMP *cpi, - int is_single_pred) { +static inline bool skip_interp_filter_search(const AV1_COMP *cpi, + int is_single_pred) { const MODE encoding_mode = cpi->oxcf.mode; if (encoding_mode == REALTIME) { return (cpi->common.current_frame.reference_mode == SINGLE_REFERENCE && @@ -5715,9 +5716,8 @@ static AOM_INLINE bool skip_interp_filter_search(const AV1_COMP *cpi, return false; } -static AOM_INLINE int get_block_temp_var(const AV1_COMP *cpi, - const MACROBLOCK *x, - BLOCK_SIZE bsize) { +static inline int get_block_temp_var(const AV1_COMP *cpi, const MACROBLOCK *x, + BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const SPEED_FEATURES *const sf = &cpi->sf; @@ -6526,7 +6526,7 @@ static inline void calc_target_weighted_pred_left( // error(x, y) = // wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2) // -static AOM_INLINE void calc_target_weighted_pred( +static inline void calc_target_weighted_pred( const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd, const uint8_t *above, int above_stride, const uint8_t *left, int left_stride) { diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h index 2376bafffe..5214800c83 100644 --- a/av1/encoder/rdopt_utils.h +++ b/av1/encoder/rdopt_utils.h @@ -239,17 +239,16 @@ static const int winner_mode_count_allowed[MULTI_WINNER_MODE_LEVELS] = { 3 // MULTI_WINNER_MODE_DEFAULT }; -static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst, - const int num_planes) { +static inline void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst, + const int num_planes) { for (int i = 0; i < num_planes; i++) { xd->plane[i].dst.buf = dst.plane[i]; xd->plane[i].dst.stride = dst.stride[i]; } } -static AOM_INLINE void swap_dst_buf(MACROBLOCKD *xd, - const BUFFER_SET *dst_bufs[2], - int num_planes) { +static inline void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2], + int num_planes) { const BUFFER_SET *buf0 = 
dst_bufs[0]; dst_bufs[0] = dst_bufs[1]; dst_bufs[1] = buf0; @@ -258,9 +257,9 @@ static AOM_INLINE void swap_dst_buf(MACROBLOCKD *xd, /* clang-format on */ // Calculate rd threshold based on ref best rd and relevant scaling factors -static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd, - int mul_factor, - int div_factor) { +static inline int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd, + int mul_factor, + int div_factor) { int64_t rd_thresh = ref_best_rd; if (div_factor != 0) { rd_thresh = ref_best_rd < (div_factor * (INT64_MAX / mul_factor)) @@ -270,9 +269,9 @@ static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd, return rd_thresh; } -static AOM_INLINE THR_MODES -get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, - MV_REFERENCE_FRAME second_ref_frame) { +static inline THR_MODES get_prediction_mode_idx( + PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME second_ref_frame) { if (this_mode < INTRA_MODE_END) { assert(ref_frame == INTRA_FRAME); assert(second_ref_frame == NONE_FRAME); @@ -296,7 +295,7 @@ get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, return THR_INVALID; } -static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) { +static inline int inter_mode_data_block_idx(BLOCK_SIZE bsize) { if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || bsize == BLOCK_4X16 || bsize == BLOCK_16X4) { return -1; @@ -305,12 +304,11 @@ static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) { } // Get transform block visible dimensions cropped to the MI units. -static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane, - BLOCK_SIZE plane_bsize, int blk_row, - int blk_col, BLOCK_SIZE tx_bsize, - int *width, int *height, - int *visible_width, - int *visible_height) { +static inline void get_txb_dimensions(const MACROBLOCKD *xd, int plane, + BLOCK_SIZE plane_bsize, int blk_row, + int blk_col, BLOCK_SIZE tx_bsize, + int *width, int *height, + int *visible_width, int *visible_height) { assert(tx_bsize <= plane_bsize); const int txb_height = block_size_high[tx_bsize]; const int txb_width = block_size_wide[tx_bsize]; @@ -341,7 +339,7 @@ static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane, if (width) *width = txb_width; } -static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { +static inline int bsize_to_num_blk(BLOCK_SIZE bsize) { int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2); return num_blk; } @@ -668,7 +666,7 @@ static inline CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, return is_cfl_allowed(xd); } -static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) { +static inline void init_sbuv_mode(MB_MODE_INFO *const mbmi) { mbmi->uv_mode = UV_DC_PRED; mbmi->palette_mode_info.palette_size[1] = 0; } diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c index afb59f59ed..0396603ca1 100644 --- a/av1/encoder/reconinter_enc.c +++ b/av1/encoder/reconinter_enc.c @@ -28,7 +28,7 @@ #include "av1/common/reconintra.h" #include "av1/encoder/reconinter_enc.h" -static AOM_INLINE void enc_calc_subpel_params( +static inline void enc_calc_subpel_params( const MV *const src_mv, InterPredParams *const inter_pred_params, uint8_t **pre, SubpelParams *subpel_params, int *src_stride) { struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf; diff --git a/av1/encoder/sorting_network.h b/av1/encoder/sorting_network.h index 2705aab91e..cffe9a45be 100644 --- 
a/av1/encoder/sorting_network.h +++ b/av1/encoder/sorting_network.h @@ -41,7 +41,7 @@ * \param[in,out] v An length-16 array of int32 serves as the * value. */ -static AOM_INLINE void av1_sort_fi32_16(float k[], int32_t v[]) { +static inline void av1_sort_fi32_16(float k[], int32_t v[]) { SWAP(0, 1); SWAP(2, 3); SWAP(4, 5); @@ -115,7 +115,7 @@ static AOM_INLINE void av1_sort_fi32_16(float k[], int32_t v[]) { * \param[in,out] k An length-8 array of float serves as the keys. * \param[in,out] v An length-8 array of int32 serves as the values. */ -static AOM_INLINE void av1_sort_fi32_8(float k[], int32_t v[]) { +static inline void av1_sort_fi32_8(float k[], int32_t v[]) { SWAP(0, 1); SWAP(2, 3); SWAP(4, 5); diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index 82d8fdc908..2cc06598fa 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c @@ -169,7 +169,7 @@ static int frame_is_boosted(const AV1_COMP *cpi) { } // Set transform rd gate level for all transform search cases. -static AOM_INLINE void set_txfm_rd_gate_level( +static inline void set_txfm_rd_gate_level( int txfm_rd_gate_level[TX_SEARCH_CASES], int level) { assert(level <= MAX_TX_RD_GATE_LEVEL); for (int idx = 0; idx < TX_SEARCH_CASES; idx++) @@ -1959,7 +1959,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, } } -static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { +static inline void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { // best quality defaults hl_sf->frame_parameter_update = 1; hl_sf->recode_loop = ALLOW_RECODE; @@ -1975,14 +1975,14 @@ static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { hl_sf->allow_sub_blk_me_in_tf = 0; } -static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) { +static inline void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) { fp_sf->reduce_mv_step_param = 3; fp_sf->skip_motion_search_threshold = 0; fp_sf->disable_recon = 0; fp_sf->skip_zeromv_motion_search = 0; } -static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { +static inline void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { tpl_sf->gop_length_decision_method = 0; tpl_sf->prune_intra_modes = 0; tpl_sf->prune_starting_mv = 0; @@ -1997,7 +1997,7 @@ static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { tpl_sf->reduce_num_frames = 0; } -static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { +static inline void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { gm_sf->gm_search_type = GM_FULL_SEARCH; gm_sf->prune_ref_frame_for_gm_search = 0; gm_sf->prune_zero_mv_with_sse = 0; @@ -2006,7 +2006,7 @@ static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { gm_sf->num_refinement_steps = GM_MAX_REFINEMENT_STEPS; } -static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { +static inline void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { part_sf->partition_search_type = SEARCH_PARTITION; part_sf->less_rectangular_check_level = 0; part_sf->use_square_partition_only_threshold = BLOCK_128X128; @@ -2053,7 +2053,7 @@ static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { part_sf->disable_8x8_part_based_on_qidx = 0; } -static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { +static inline void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { mv_sf->full_pixel_search_level = 0; mv_sf->auto_mv_step_size = 0; mv_sf->exhaustive_searches_thresh = 0; @@ -2077,7 +2077,7 @@ static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { mv_sf->use_intrabc = 1; } -static 
AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { +static inline void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { inter_sf->adaptive_rd_thresh = 0; inter_sf->model_based_post_interp_filter_breakout = 0; inter_sf->reduce_inter_modes = 0; @@ -2128,7 +2128,7 @@ static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { set_txfm_rd_gate_level(inter_sf->txfm_rd_gate_level, 0); } -static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { +static inline void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { interp_sf->adaptive_interp_filter_search = 0; interp_sf->cb_pred_filter_search = 0; interp_sf->disable_dual_filter = 0; @@ -2138,7 +2138,7 @@ static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { interp_sf->skip_interp_filter_search = 0; } -static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { +static inline void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB; intra_sf->skip_intra_in_interframe = 1; intra_sf->intra_pruning_with_hog = 0; @@ -2162,7 +2162,7 @@ static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { intra_sf->prune_luma_odd_delta_angles_in_intra = 0; } -static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { +static inline void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { tx_sf->inter_tx_size_search_init_depth_sqr = 0; tx_sf->inter_tx_size_search_init_depth_rect = 0; tx_sf->intra_tx_size_search_init_depth_rect = 0; @@ -2187,8 +2187,8 @@ static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { tx_sf->use_rd_based_breakout_for_intra_tx_search = false; } -static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf, - const AV1EncoderConfig *oxcf) { +static inline void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf, + const AV1EncoderConfig *oxcf) { const int disable_trellis_quant = oxcf->algo_cfg.disable_trellis_quant; if (disable_trellis_quant == 3) { rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg) @@ -2216,7 +2216,7 @@ static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf, rd_sf->perform_coeff_opt = 0; } -static AOM_INLINE void init_winner_mode_sf( +static inline void init_winner_mode_sf( WINNER_MODE_SPEED_FEATURES *winner_mode_sf) { winner_mode_sf->motion_mode_for_winner_cand = 0; // Set this at the appropriate speed levels @@ -2230,7 +2230,7 @@ static AOM_INLINE void init_winner_mode_sf( winner_mode_sf->prune_winner_mode_eval_level = 0; } -static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) { +static inline void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) { lpf_sf->disable_loop_restoration_chroma = 0; lpf_sf->disable_loop_restoration_luma = 0; lpf_sf->min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE; @@ -2251,7 +2251,7 @@ static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) { lpf_sf->use_downsampled_wiener_stats = 0; } -static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { +static inline void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { rt_sf->check_intra_pred_nonrd = 0; rt_sf->skip_intra_pred = 0; rt_sf->estimate_motion_for_var_based_partition = 0; diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h index 39b13fa1ee..36ecc2f10a 100644 --- a/av1/encoder/temporal_filter.h +++ b/av1/encoder/temporal_filter.h @@ -361,9 +361,8 @@ int av1_get_q(const struct AV1_COMP *cpi); // is_high_bitdepth: Whether the frame is high-bitdepth or not. 
// Returns: // True if allocation is successful and false otherwise. -static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data, - int num_pels, - int is_high_bitdepth) { +static inline bool tf_alloc_and_reset_data(TemporalFilterData *tf_data, + int num_pels, int is_high_bitdepth) { tf_data->tmp_mbmi = (MB_MODE_INFO *)aom_calloc(1, sizeof(*tf_data->tmp_mbmi)); tf_data->accum = (uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum)); @@ -390,9 +389,9 @@ static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data, // scale: Scaling factor. // Returns: // Nothing will be returned. Contents of mbd will be modified. -static AOM_INLINE void tf_setup_macroblockd(MACROBLOCKD *mbd, - TemporalFilterData *tf_data, - const struct scale_factors *scale) { +static inline void tf_setup_macroblockd(MACROBLOCKD *mbd, + TemporalFilterData *tf_data, + const struct scale_factors *scale) { mbd->block_ref_scale_factors[0] = scale; mbd->block_ref_scale_factors[1] = scale; mbd->mi = &tf_data->tmp_mbmi; @@ -405,8 +404,8 @@ static AOM_INLINE void tf_setup_macroblockd(MACROBLOCKD *mbd, // is_high_bitdepth: Whether the frame is high-bitdepth or not. // Returns: // Nothing will be returned. -static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data, - int is_high_bitdepth) { +static inline void tf_dealloc_data(TemporalFilterData *tf_data, + int is_high_bitdepth) { if (is_high_bitdepth) tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred); aom_free(tf_data->tmp_mbmi); diff --git a/av1/encoder/thirdpass.c b/av1/encoder/thirdpass.c index 3ac063676d..5a1e7b0015 100644 --- a/av1/encoder/thirdpass.c +++ b/av1/encoder/thirdpass.c @@ -329,7 +329,7 @@ static void get_current_gop_end(THIRD_PASS_DEC_CTX *ctx, int max_num, } #endif -static AOM_INLINE void read_gop_frames(THIRD_PASS_DEC_CTX *ctx) { +static inline void read_gop_frames(THIRD_PASS_DEC_CTX *ctx) { int cur_idx = 0; while (cur_idx < ctx->gop_info.num_frames) { assert(cur_idx < MAX_THIRD_PASS_BUF); diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c index 04094789a7..87278583af 100644 --- a/av1/encoder/tokenize.c +++ b/av1/encoder/tokenize.c @@ -27,7 +27,7 @@ #include "av1/encoder/rdopt.h" #include "av1/encoder/tokenize.h" -static AOM_INLINE int av1_fast_palette_color_index_context_on_edge( +static inline int av1_fast_palette_color_index_context_on_edge( const uint8_t *color_map, int stride, int r, int c, int *color_idx) { const bool has_left = (c - 1 >= 0); const bool has_above = (r - 1 >= 0); @@ -75,8 +75,9 @@ static AOM_INLINE int av1_fast_palette_color_index_context_on_edge( // A faster version of av1_get_palette_color_index_context used by the encoder // exploiting the fact that the encoder does not need to maintain a color order. -static AOM_INLINE int av1_fast_palette_color_index_context( - const uint8_t *color_map, int stride, int r, int c, int *color_idx) { +static inline int av1_fast_palette_color_index_context(const uint8_t *color_map, + int stride, int r, int c, + int *color_idx) { assert(r > 0 || c > 0); const bool has_above = (r - 1 >= 0); diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h index 3679293319..66c60f9c40 100644 --- a/av1/encoder/tokenize.h +++ b/av1/encoder/tokenize.h @@ -119,8 +119,8 @@ static inline unsigned int get_token_alloc(int mb_rows, int mb_cols, } // Allocate memory for token related info. 
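
tf_alloc_and_reset_data() above — like the token-info allocator in the next hunk — follows an all-or-nothing allocation shape: grab every buffer, then report failure if any came back NULL so the caller can tear everything down. A minimal sketch under that assumption, with standard allocators standing in for aom_calloc()/aom_memalign():

#include <stdlib.h>
#include <string.h>

typedef struct {
  unsigned *accum;
  unsigned short *count;
  unsigned char *pred;
} FilterData;

/* Allocate every buffer first, then verify; returns 1 on success. */
static int filter_data_alloc(FilterData *fd, int num_pels) {
  fd->accum = (unsigned *)malloc(num_pels * sizeof(*fd->accum));
  fd->count = (unsigned short *)malloc(num_pels * sizeof(*fd->count));
  fd->pred = (unsigned char *)malloc(num_pels * sizeof(*fd->pred));
  if (!fd->accum || !fd->count || !fd->pred) return 0;
  memset(fd->accum, 0, num_pels * sizeof(*fd->accum));
  memset(fd->count, 0, num_pels * sizeof(*fd->count));
  return 1;
}

/* free(NULL) is a no-op, so partially failed allocations clean up safely. */
static void filter_data_free(FilterData *fd) {
  free(fd->accum);
  free(fd->count);
  free(fd->pred);
  memset(fd, 0, sizeof(*fd));
}
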
-static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info, - unsigned int tokens_required) { +static inline void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info, + unsigned int tokens_required) { int sb_rows = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); token_info->tokens_allocated = tokens_required; @@ -136,13 +136,13 @@ static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info, } // Check if memory allocation has been done for token related info. -static AOM_INLINE bool is_token_info_allocated(const TokenInfo *token_info) { +static inline bool is_token_info_allocated(const TokenInfo *token_info) { return ((token_info->tile_tok[0][0] != NULL) && (token_info->tplist[0][0] != NULL)); } // Free memory from token related variables. -static AOM_INLINE void free_token_info(TokenInfo *token_info) { +static inline void free_token_info(TokenInfo *token_info) { aom_free(token_info->tile_tok[0][0]); token_info->tile_tok[0][0] = NULL; diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c index 339e8a5d51..9747aea8b3 100644 --- a/av1/encoder/tpl_model.c +++ b/av1/encoder/tpl_model.c @@ -91,19 +91,18 @@ void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats) { } } -static AOM_INLINE void av1_tpl_store_txfm_stats( - TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats, - const int frame_index) { +static inline void av1_tpl_store_txfm_stats(TplParams *tpl_data, + const TplTxfmStats *tpl_txfm_stats, + const int frame_index) { tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats; } #endif // CONFIG_BITRATE_ACCURACY -static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane, - const tran_low_t *coeff, - tran_low_t *qcoeff, - tran_low_t *dqcoeff, TX_SIZE tx_size, - uint16_t *eob, int64_t *recon_error, - int64_t *sse) { +static inline void get_quantize_error(const MACROBLOCK *x, int plane, + const tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, uint16_t *eob, + int64_t *recon_error, int64_t *sse) { const struct macroblock_plane *const p = &x->plane[plane]; const MACROBLOCKD *xd = &x->e_mbd; const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; @@ -137,8 +136,8 @@ static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane, *sse = AOMMAX(*sse, 1); } -static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2, - uint8_t *tpl_bsize_1d) { +static inline void set_tpl_stats_block_size(uint8_t *block_mis_log2, + uint8_t *tpl_bsize_1d) { // tpl stats bsize: 2 means 16x16 *block_mis_log2 = 2; // Block size used in tpl motion estimation @@ -200,12 +199,11 @@ void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi, } } -static AOM_INLINE int32_t tpl_get_satd_cost(BitDepthInfo bd_info, - int16_t *src_diff, int diff_stride, - const uint8_t *src, int src_stride, - const uint8_t *dst, int dst_stride, - tran_low_t *coeff, int bw, int bh, - TX_SIZE tx_size) { +static inline int32_t tpl_get_satd_cost(BitDepthInfo bd_info, int16_t *src_diff, + int diff_stride, const uint8_t *src, + int src_stride, const uint8_t *dst, + int dst_stride, tran_low_t *coeff, + int bw, int bh, TX_SIZE tx_size) { const int pix_num = bw * bh; av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, @@ -228,7 +226,7 @@ static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { return (rate_cost << AV1_PROB_COST_SHIFT); } -static AOM_INLINE void txfm_quant_rdcost( +static inline void txfm_quant_rdcost( const MACROBLOCK *x, 
int16_t *src_diff, int diff_stride, uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size, @@ -456,13 +454,13 @@ static void get_rate_distortion( } } -static AOM_INLINE int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd, - const uint8_t *src_mb_buffer, - int src_stride, - TplBuffers *tpl_tmp_buffers, - BLOCK_SIZE bsize, TX_SIZE tx_size, - int mi_row, int mi_col, int rf_idx, - MV *rfidx_mv, int use_pred_sad) { +static inline int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd, + const uint8_t *src_mb_buffer, + int src_stride, + TplBuffers *tpl_tmp_buffers, + BLOCK_SIZE bsize, TX_SIZE tx_size, + int mi_row, int mi_col, int rf_idx, + MV *rfidx_mv, int use_pred_sad) { const BitDepthInfo bd_info = get_bit_depth_info(xd); TplParams *tpl_data = &cpi->ppi->tpl_data; const YV12_BUFFER_CONFIG *const ref_frame_ptr = @@ -521,12 +519,10 @@ static AOM_INLINE int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd, return inter_cost; } -static AOM_INLINE void mode_estimation(AV1_COMP *cpi, - TplTxfmStats *tpl_txfm_stats, - TplBuffers *tpl_tmp_buffers, - MACROBLOCK *x, int mi_row, int mi_col, - BLOCK_SIZE bsize, TX_SIZE tx_size, - TplDepStats *tpl_stats) { +static inline void mode_estimation(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats, + TplBuffers *tpl_tmp_buffers, MACROBLOCK *x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + TX_SIZE tx_size, TplDepStats *tpl_stats) { AV1_COMMON *cm = &cpi->common; const GF_GROUP *gf_group = &cpi->ppi->gf_group; TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; @@ -1160,9 +1156,9 @@ int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, return rate_cost; } -static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, - int mi_col, const BLOCK_SIZE bsize, - int frame_idx, int ref) { +static inline void tpl_model_update_b(TplParams *const tpl_data, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + int frame_idx, int ref) { TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx]; TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr; TplDepFrame *tpl_frame = tpl_data->tpl_frame; @@ -1236,8 +1232,8 @@ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, } } -static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row, - int mi_col, int frame_idx) { +static inline void tpl_model_update(TplParams *const tpl_data, int mi_row, + int mi_col, int frame_idx) { const BLOCK_SIZE tpl_stats_block_size = convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2); tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx, @@ -1246,10 +1242,10 @@ static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row, 1); } -static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, - int mi_col, int stride, - const TplDepStats *src_stats, - uint8_t block_mis_log2) { +static inline void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, + int mi_col, int stride, + const TplDepStats *src_stats, + uint8_t block_mis_log2) { int index = av1_tpl_ptr_pos(mi_row, mi_col, stride, block_mis_log2); TplDepStats *tpl_ptr = &tpl_stats_ptr[index]; *tpl_ptr = *src_stats; @@ -1267,21 +1263,21 @@ static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, } // Reset the ref and source frame pointers of tpl_data. 
-static AOM_INLINE void tpl_reset_src_ref_frames(TplParams *tpl_data) { +static inline void tpl_reset_src_ref_frames(TplParams *tpl_data) { for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { tpl_data->ref_frame[i] = NULL; tpl_data->src_ref_frame[i] = NULL; } } -static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) { +static inline int get_gop_length(const GF_GROUP *gf_group) { int gop_length = AOMMIN(gf_group->size, MAX_TPL_FRAME_IDX - 1); return gop_length; } // Initialize the mc_flow parameters used in computing tpl data. -static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, - int pframe_qindex) { +static inline void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, + int pframe_qindex) { TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture; @@ -1469,7 +1465,7 @@ void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats, } } -static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) { +static inline void mc_flow_dispenser(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; ThreadData *td = &cpi->td; @@ -1509,7 +1505,7 @@ static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows, } } -static AOM_INLINE void init_gop_frames_for_tpl( +static inline void init_gop_frames_for_tpl( AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params, GF_GROUP *gf_group, int *tpl_group_frames, int *pframe_qindex) { AV1_COMMON *cm = &cpi->common; @@ -1741,7 +1737,7 @@ int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) { return tpl_data->tpl_frame[gf_frame_index].is_valid; } -static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) { +static inline int eval_gop_length(double *beta, int gop_eval) { switch (gop_eval) { case 1: // Allow larger GOP size if the base layer ARF has higher dependency @@ -1781,10 +1777,9 @@ void av1_tpl_preload_rc_estimate(AV1_COMP *cpi, } } -static AOM_INLINE int skip_tpl_for_frame(const GF_GROUP *gf_group, - int frame_idx, int gop_eval, - int approx_gop_eval, - int reduce_num_frames) { +static inline int skip_tpl_for_frame(const GF_GROUP *gf_group, int frame_idx, + int gop_eval, int approx_gop_eval, + int reduce_num_frames) { // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base // layer, (base+1) layer and (base+2) layer. 
When gop_eval is set to 3, // tpl stats calculation is limited to ARFs from base layer and (base+1) diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h index 6a1299cba9..434bf2d2b8 100644 --- a/av1/encoder/tpl_model.h +++ b/av1/encoder/tpl_model.h @@ -416,7 +416,7 @@ void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi, CommonModeInfoParams *const mi_params, int width, int height, int byte_alignment, int lag_in_frames); -static AOM_INLINE void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) { +static inline void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) { aom_free(tpl_tmp_buffers->predictor8); tpl_tmp_buffers->predictor8 = NULL; aom_free(tpl_tmp_buffers->src_diff); @@ -429,8 +429,8 @@ static AOM_INLINE void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) { tpl_tmp_buffers->dqcoeff = NULL; } -static AOM_INLINE bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers, - uint8_t tpl_bsize_1d) { +static inline bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers, + uint8_t tpl_bsize_1d) { // Number of pixels in a tpl block const int tpl_block_pels = tpl_bsize_1d * tpl_bsize_1d; diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c index 9b03becc11..7e425ed463 100644 --- a/av1/encoder/tune_vmaf.c +++ b/av1/encoder/tune_vmaf.c @@ -189,12 +189,12 @@ static double residual_frame_average_variance(AV1_COMP *cpi, } // TODO(sdeng): Add the SIMD implementation. -static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source, - int source_stride, - const uint16_t *blurred, - int blurred_stride, uint16_t *dst, - int dst_stride, int w, int h, - double amount, int bit_depth) { +static inline void highbd_unsharp_rect(const uint16_t *source, + int source_stride, + const uint16_t *blurred, + int blurred_stride, uint16_t *dst, + int dst_stride, int w, int h, + double amount, int bit_depth) { const int max_value = (1 << bit_depth) - 1; for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { @@ -208,10 +208,10 @@ static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source, } } -static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride, - const uint8_t *blurred, int blurred_stride, - uint8_t *dst, int dst_stride, int w, int h, - double amount) { +static inline void unsharp_rect(const uint8_t *source, int source_stride, + const uint8_t *blurred, int blurred_stride, + uint8_t *dst, int dst_stride, int w, int h, + double amount) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { const double val = @@ -224,10 +224,10 @@ static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride, } } -static AOM_INLINE void unsharp(const AV1_COMP *const cpi, - const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *blurred, - const YV12_BUFFER_CONFIG *dst, double amount) { +static inline void unsharp(const AV1_COMP *const cpi, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *blurred, + const YV12_BUFFER_CONFIG *dst, double amount) { const int bit_depth = cpi->td.mb.e_mbd.bd; if (cpi->common.seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); @@ -251,9 +251,9 @@ static AOM_INLINE void unsharp(const AV1_COMP *const cpi, // _mm_loadu_si128() in prepare_coeffs_6t(). 
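
Before the Gaussian-blur table below, a note on the unsharp helpers in this hunk: unsharp_rect() and its high-bitdepth twin sharpen by pushing each pixel away from its blurred value. A scalar sketch, assuming the conventional unsharp-mask update dst = src + amount * (src - blurred) with the result clamped to the valid pixel range (the 8-bit case is shown):

#include <stdint.h>

static uint8_t clamp_pixel(double v) {
  if (v < 0.0) return 0;
  if (v > 255.0) return 255;
  return (uint8_t)(v + 0.5);
}

/* Per-pixel unsharp masking over a w x h rectangle. */
static void unsharp_sketch(const uint8_t *src, int src_stride,
                           const uint8_t *blurred, int blur_stride,
                           uint8_t *dst, int dst_stride, int w, int h,
                           double amount) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const double val =
          src[i * src_stride + j] +
          amount * (src[i * src_stride + j] - blurred[i * blur_stride + j]);
      dst[i * dst_stride + j] = clamp_pixel(val);
    }
  }
}
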
DECLARE_ALIGNED(16, static const int16_t, gauss_filter[9]) = { 0, 8, 30, 52, 30, 8, 0, 0 }; -static AOM_INLINE void gaussian_blur(const int bit_depth, - const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dst) { +static inline void gaussian_blur(const int bit_depth, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dst) { const int block_size = BLOCK_128X128; const int block_w = mi_size_wide[block_size] * 4; const int block_h = mi_size_high[block_size] * 4; @@ -290,7 +290,7 @@ static AOM_INLINE void gaussian_blur(const int bit_depth, } } -static AOM_INLINE double cal_approx_vmaf( +static inline double cal_approx_vmaf( const AV1_COMP *const cpi, double source_variance, const YV12_BUFFER_CONFIG *const source, const YV12_BUFFER_CONFIG *const sharpened) { @@ -775,9 +775,9 @@ void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, } // TODO(sdeng): replace them with the SIMD versions. -static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h) { +static inline double highbd_image_sad_c(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h) { double accum = 0.0; int i, j; @@ -793,9 +793,9 @@ static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride, return accum / (double)(h * w); } -static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int w, - int h) { +static inline double image_sad_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, + int h) { double accum = 0.0; int i, j; @@ -880,9 +880,9 @@ static double calc_vmaf_motion_score(const AV1_COMP *const cpi, return AOMMIN(motion1, motion2); } -static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi, - const YV12_BUFFER_CONFIG **last, - const YV12_BUFFER_CONFIG **next) { +static inline void get_neighbor_frames(const AV1_COMP *const cpi, + const YV12_BUFFER_CONFIG **last, + const YV12_BUFFER_CONFIG **next) { const AV1_COMMON *const cm = &cpi->common; const GF_GROUP *gf_group = &cpi->ppi->gf_group; const int src_index = @@ -952,7 +952,7 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { return qindex; } -static AOM_INLINE double cal_approx_score( +static inline double cal_approx_score( AV1_COMP *const cpi, double src_variance, double new_variance, double src_score, const YV12_BUFFER_CONFIG *const src, const YV12_BUFFER_CONFIG *const recon_sharpened) { diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c index 8225fe35e7..e3fd12991f 100644 --- a/av1/encoder/tx_search.c +++ b/av1/encoder/tx_search.c @@ -105,10 +105,9 @@ static inline int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, return match_index; } -static AOM_INLINE void fetch_mb_rd_info(int n4, - const MB_RD_INFO *const mb_rd_info, - RD_STATS *const rd_stats, - MACROBLOCK *const x) { +static inline void fetch_mb_rd_info(int n4, const MB_RD_INFO *const mb_rd_info, + RD_STATS *const rd_stats, + MACROBLOCK *const x) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; mbmi->tx_size = mb_rd_info->tx_size; @@ -239,8 +238,8 @@ static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, } // Used to set proper context for early termination with skip = 1. 
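
Returning to the SAD helpers above before the skip-context hunk below: image_sad_c() is a plain mean absolute difference between two planes, accumulated in double precision and normalized per pixel. A self-contained sketch of the same accumulate-then-normalize shape:

#include <stdint.h>
#include <stdlib.h>

static double image_sad_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h) {
  double accum = 0.0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      accum += abs(src[i * src_stride + j] - ref[i * ref_stride + j]);
    }
  }
  return accum / (double)(h * w); /* normalize to a per-pixel score */
}
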
-static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats, - BLOCK_SIZE bsize, int64_t dist) { +static inline void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t dist) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int n4 = bsize_to_num_blk(bsize); @@ -279,10 +278,10 @@ static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats, (block_size_high[bsize] >> tx_size_high_log2[tx_size]); } -static AOM_INLINE void save_mb_rd_info(int n4, uint32_t hash, - const MACROBLOCK *const x, - const RD_STATS *const rd_stats, - MB_RD_RECORD *mb_rd_record) { +static inline void save_mb_rd_info(int n4, uint32_t hash, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + MB_RD_RECORD *mb_rd_record) { int index; if (mb_rd_record->num < RD_RECORD_BUFFER_LEN) { index = @@ -327,7 +326,7 @@ static int get_search_init_depth(int mi_width, int mi_height, int is_inter, } } -static AOM_INLINE void select_tx_block( +static inline void select_tx_block( const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, @@ -340,7 +339,7 @@ static AOM_INLINE void select_tx_block( // 2: Collect RD stats for partition units #if CONFIG_COLLECT_RD_STATS -static AOM_INLINE void get_energy_distribution_fine( +static inline void get_energy_distribution_fine( const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int need_4th, double *hordist, double *verdist) { @@ -470,7 +469,7 @@ static double get_sad_norm(const int16_t *diff, int stride, int w, int h) { return sum / (w * h); } -static AOM_INLINE void get_2x2_normalized_sses_and_sads( +static inline void get_2x2_normalized_sses_and_sads( const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src, int src_stride, const uint8_t *const dst, int dst_stride, const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr, @@ -533,7 +532,7 @@ static double get_mean(const int16_t *diff, int stride, int w, int h) { assert(w > 0 && h > 0); return sum / (w * h); } -static AOM_INLINE void PrintTransformUnitStats( +static inline void PrintTransformUnitStats( const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, TX_TYPE tx_type, int64_t rd) { @@ -714,11 +713,11 @@ static double get_diff_mean(const uint8_t *src, int src_stride, return sum / (w * h); } -static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi, - const TileDataEnc *tile_data, - MACROBLOCK *x, - const RD_STATS *const rd_stats, - BLOCK_SIZE plane_bsize) { +static inline void PrintPredictionUnitStats(const AV1_COMP *const cpi, + const TileDataEnc *tile_data, + MACROBLOCK *x, + const RD_STATS *const rd_stats, + BLOCK_SIZE plane_bsize) { if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && @@ -842,11 +841,10 @@ static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi, #endif // CONFIG_COLLECT_RD_STATS >= 2 #endif // CONFIG_COLLECT_RD_STATS -static AOM_INLINE void inverse_transform_block_facade(MACROBLOCK *const x, - int plane, int block, - int blk_row, int blk_col, - int eob, - int reduced_tx_set) { +static inline void inverse_transform_block_facade(MACROBLOCK *const x, + int plane, int block, + int blk_row, int 
blk_col, + int eob, int reduced_tx_set) { if (!eob) return; struct macroblock_plane *const p = &x->plane[plane]; MACROBLOCKD *const xd = &x->e_mbd; @@ -1401,10 +1399,10 @@ static inline float get_adaptive_thresholds( return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness]; } -static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff, - int stride, int bw, int bh, - float *hordist, - float *verdist) { +static inline void get_energy_distribution_finer(const int16_t *diff, + int stride, int bw, int bh, + float *hordist, + float *verdist) { // First compute downscaled block energy values (esq); downscale factors // are defined by w_shift and h_shift. unsigned int esq[256]; @@ -1465,15 +1463,15 @@ static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff, for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip; } -static AOM_INLINE bool check_bit_mask(uint16_t mask, int val) { +static inline bool check_bit_mask(uint16_t mask, int val) { return mask & (1 << val); } -static AOM_INLINE void set_bit_mask(uint16_t *mask, int val) { +static inline void set_bit_mask(uint16_t *mask, int val) { *mask |= (1 << val); } -static AOM_INLINE void unset_bit_mask(uint16_t *mask, int val) { +static inline void unset_bit_mask(uint16_t *mask, int val) { *mask &= ~(1 << val); } @@ -1645,8 +1643,8 @@ static float get_dev(float mean, double x2_sum, int num) { // Returns the number of elements written to the output array which is at most // 12 currently. Hence 'features' buffer should be able to accommodate at least // 12 elements. -static AOM_INLINE int get_mean_dev_features(const int16_t *data, int stride, - int bw, int bh, float *features) { +static inline int get_mean_dev_features(const int16_t *data, int stride, int bw, + int bh, float *features) { const int16_t *const data_ptr = &data[0]; const int subh = (bh >= bw) ? (bh >> 1) : bh; const int subw = (bw >= bh) ? (bw >> 1) : bw; @@ -2326,12 +2324,11 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, // Pick transform type for a luma transform block of tx_size. Note this function // is used only for inter-predicted blocks. 
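
The transform-type search above tracks its candidate set in a uint16_t manipulated by the check_bit_mask()/set_bit_mask()/unset_bit_mask() trio shown in this hunk. A self-contained usage sketch (the "transform type" indices here are illustrative):

#include <stdint.h>
#include <stdio.h>

static int check_bit_mask(uint16_t mask, int val) { return mask & (1 << val); }
static void set_bit_mask(uint16_t *mask, int val) { *mask |= (1 << val); }
static void unset_bit_mask(uint16_t *mask, int val) { *mask &= ~(1 << val); }

int main(void) {
  uint16_t allowed = 0;
  set_bit_mask(&allowed, 0);   /* e.g. keep DCT_DCT as a candidate */
  set_bit_mask(&allowed, 9);
  unset_bit_mask(&allowed, 9); /* pruned after a model decision */
  printf("type 0: %d, type 9: %d\n", !!check_bit_mask(allowed, 0),
         !!check_bit_mask(allowed, 9)); /* prints 1, 0 */
  return 0;
}
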
-static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, - TX_SIZE tx_size, int blk_row, int blk_col, - int block, int plane_bsize, TXB_CTX *txb_ctx, - RD_STATS *rd_stats, - FAST_TX_SEARCH_MODE ftxs_mode, - int64_t ref_rdcost) { +static inline void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, + TX_SIZE tx_size, int blk_row, int blk_col, + int block, int plane_bsize, TXB_CTX *txb_ctx, + RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode, + int64_t ref_rdcost) { assert(is_inter_block(x->e_mbd.mi[0])); RD_STATS this_rd_stats; const int skip_trellis = 0; @@ -2341,7 +2338,7 @@ static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, av1_merge_rd_stats(rd_stats, &this_rd_stats); } -static AOM_INLINE void try_tx_block_no_split( +static inline void try_tx_block_no_split( const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl, @@ -2392,7 +2389,7 @@ static AOM_INLINE void try_tx_block_no_split( xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; } -static AOM_INLINE void try_tx_block_split( +static inline void try_tx_block_split( const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, @@ -2451,9 +2448,9 @@ static float get_var(float mean, double x2_sum, int num) { return diff; } -static AOM_INLINE void get_blk_var_dev(const int16_t *data, int stride, int bw, - int bh, float *dev_of_mean, - float *var_of_vars) { +static inline void get_blk_var_dev(const int16_t *data, int stride, int bw, + int bh, float *dev_of_mean, + float *var_of_vars) { const int16_t *const data_ptr = &data[0]; const int subh = (bh >= bw) ? (bh >> 1) : bh; const int subw = (bw >= bh) ? (bw >> 1) : bw; @@ -2538,7 +2535,7 @@ static void prune_tx_split_no_split(MACROBLOCK *x, BLOCK_SIZE bsize, // Search for the best transform partition(recursive)/type for a given // inter-predicted luma block. The obtained transform selection will be saved // in xd->mi[0], the corresponding RD stats will be saved in rd_stats. 
-static AOM_INLINE void select_tx_block( +static inline void select_tx_block( const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, @@ -2646,10 +2643,9 @@ static AOM_INLINE void select_tx_block( } } -static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi, - MACROBLOCK *x, RD_STATS *rd_stats, - int64_t ref_best_rd, - BLOCK_SIZE bs) { +static inline void choose_largest_tx_size(const AV1_COMP *const cpi, + MACROBLOCK *x, RD_STATS *rd_stats, + int64_t ref_best_rd, BLOCK_SIZE bs) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const TxfmSearchParams *txfm_params = &x->txfm_search_params; @@ -2743,11 +2739,9 @@ static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi, mbmi->tx_size, FTXS_NONE, skip_trellis); } -static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi, - MACROBLOCK *x, - RD_STATS *rd_stats, - int64_t ref_best_rd, - BLOCK_SIZE bs) { +static inline void choose_smallest_tx_size(const AV1_COMP *const cpi, + MACROBLOCK *x, RD_STATS *rd_stats, + int64_t ref_best_rd, BLOCK_SIZE bs) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -2825,11 +2819,11 @@ static void ml_predict_intra_tx_depth_prune(MACROBLOCK *x, int blk_row, #endif // !CONFIG_REALTIME_ONLY // Search for the best uniform transform size and type for current coding block. -static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, - MACROBLOCK *x, - RD_STATS *rd_stats, - int64_t ref_best_rd, - BLOCK_SIZE bs) { +static inline void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, + MACROBLOCK *x, + RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { av1_invalid_rd_stats(rd_stats); MACROBLOCKD *const xd = &x->e_mbd; @@ -2928,9 +2922,9 @@ static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, // Search for the best transform type for the given transform block in the // given plane/channel, and calculate the corresponding RD cost. -static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row, - int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { +static inline void block_rd_txfm(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { struct rdcost_block_args *args = arg; if (args->exit_early) { args->incomplete_exit = 1; @@ -3197,12 +3191,14 @@ int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, // Search for the best transform type for a luma inter-predicted block, given // the transform block partitions. // This function is used only when some speed features are enabled. 
-static AOM_INLINE void tx_block_yrd( - const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, - TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth, - ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx, - TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd, - RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) { +static inline void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, + int blk_col, int block, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, int depth, + ENTROPY_CONTEXT *above_ctx, + ENTROPY_CONTEXT *left_ctx, + TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + int64_t ref_best_rd, RD_STATS *rd_stats, + FAST_TX_SEARCH_MODE ftxs_mode) { assert(tx_size < TX_SIZES_ALL); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -3464,10 +3460,9 @@ static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x, // Return 1 to terminate transform search early. The decision is made based on // the comparison with the reference RD cost and the model-estimated RD cost. -static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi, - MACROBLOCK *x, - BLOCK_SIZE bsize, - int64_t ref_best_rd) { +static inline int model_based_tx_search_prune(const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + int64_t ref_best_rd) { const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level; assert(level >= 0 && level <= 2); int model_rate; diff --git a/av1/encoder/tx_search.h b/av1/encoder/tx_search.h index 6b826a5d70..78efdb42da 100644 --- a/av1/encoder/tx_search.h +++ b/av1/encoder/tx_search.h @@ -33,8 +33,8 @@ enum { FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 } UENUM1BYTE(FAST_TX_SEARCH_MODE); -static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize, - TX_SIZE tx_size) { +static inline int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize, + TX_SIZE tx_size) { assert(bsize == x->e_mbd.mi[0]->bsize); if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT || !block_signals_txsize(bsize)) diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c index 8f7924a162..666f80b9a9 100644 --- a/av1/encoder/var_based_part.c +++ b/av1/encoder/var_based_part.c @@ -48,8 +48,8 @@ typedef struct { VPartVar *split[4]; } variance_node; -static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize, - variance_node *node) { +static inline void tree_to_node(void *data, BLOCK_SIZE bsize, + variance_node *node) { node->part_variances = NULL; switch (bsize) { case BLOCK_128X128: { @@ -99,14 +99,13 @@ static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize, } // Set variance values given sum square error, sum error, count. 
-static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c, - VPartVar *v) { +static inline void fill_variance(uint32_t s2, int32_t s, int c, VPartVar *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; } -static AOM_INLINE void get_variance(VPartVar *v) { +static inline void get_variance(VPartVar *v) { v->variance = (int)(256 * (v->sum_square_error - (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> @@ -114,14 +113,14 @@ static AOM_INLINE void get_variance(VPartVar *v) { v->log2_count); } -static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b, - VPartVar *r) { +static inline void sum_2_variances(const VPartVar *a, const VPartVar *b, + VPartVar *r) { assert(a->log2_count == b->log2_count); fill_variance(a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->log2_count + 1, r); } -static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) { +static inline void fill_variance_tree(void *data, BLOCK_SIZE bsize) { variance_node node; memset(&node, 0, sizeof(node)); tree_to_node(data, bsize, &node); @@ -133,8 +132,8 @@ static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) { &node.part_variances->none); } -static AOM_INLINE void set_block_size(AV1_COMP *const cpi, int mi_row, - int mi_col, BLOCK_SIZE bsize) { +static inline void set_block_size(AV1_COMP *const cpi, int mi_row, int mi_col, + BLOCK_SIZE bsize) { if (cpi->common.mi_params.mi_cols > mi_col && cpi->common.mi_params.mi_rows > mi_row) { CommonModeInfoParams *mi_params = &cpi->common.mi_params; @@ -252,8 +251,8 @@ static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCKD *const xd, return 0; } -static AOM_INLINE int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide, - int pixels_high) { +static inline int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide, + int pixels_high) { int all_inside = 1; for (int idx = 0; idx < 4; idx++) { all_inside &= ((x16_idx + GET_BLK_IDX_X(idx, 3)) < pixels_wide); @@ -264,7 +263,7 @@ static AOM_INLINE int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide, #if CONFIG_AV1_HIGHBITDEPTH // TODO(yunqingwang): Perform average of four 8x8 blocks similar to lowbd -static AOM_INLINE void fill_variance_8x8avg_highbd( +static inline void fill_variance_8x8avg_highbd( const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide, int pixels_high) { @@ -287,7 +286,7 @@ static AOM_INLINE void fill_variance_8x8avg_highbd( } #endif -static AOM_INLINE void fill_variance_8x8avg_lowbd( +static inline void fill_variance_8x8avg_lowbd( const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide, int pixels_high) { @@ -327,10 +326,11 @@ static AOM_INLINE void fill_variance_8x8avg_lowbd( // at 8x8 sub-block level for a given 16x16 block. // The function can be called only when is_key_frame is false since sum is // computed between source and reference frames. 
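
The variance-node arithmetic above is compact enough to restate standalone. With n = 1 << log2_count, get_variance() computes 256 * (sse - sum^2 / n) / n entirely in integer math, the factor of 256 preserving precision; sum_2_variances() merges two equal-sized nodes by summing fields and bumping log2_count. A sketch mirroring both:

#include <stdint.h>

typedef struct {
  uint32_t sum_square_error;
  int32_t sum_error;
  int log2_count; /* log2 of the number of samples aggregated */
  int variance;
} PartVar;

/* Same arithmetic as get_variance() above. */
static void part_var_compute(PartVar *v) {
  v->variance =
      (int)(256 * (v->sum_square_error -
                   (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
                              v->log2_count)) >>
            v->log2_count);
}

/* Merge step as in sum_2_variances(): assumes a and b cover equal areas,
   so the combined sample count doubles and log2_count grows by one. */
static void part_var_merge(const PartVar *a, const PartVar *b, PartVar *r) {
  r->sum_square_error = a->sum_square_error + b->sum_square_error;
  r->sum_error = a->sum_error + b->sum_error;
  r->log2_count = a->log2_count + 1;
}
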
-static AOM_INLINE void fill_variance_8x8avg( - const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, - int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int highbd_flag, - int pixels_wide, int pixels_high) { +static inline void fill_variance_8x8avg(const uint8_t *src_buf, int src_stride, + const uint8_t *dst_buf, int dst_stride, + int x16_idx, int y16_idx, VP16x16 *vst, + int highbd_flag, int pixels_wide, + int pixels_high) { #if CONFIG_AV1_HIGHBITDEPTH if (highbd_flag) { fill_variance_8x8avg_highbd(src_buf, src_stride, dst_buf, dst_stride, @@ -386,14 +386,13 @@ static int compute_minmax_8x8(const uint8_t *src_buf, int src_stride, // Function to compute average and variance of 4x4 sub-block. // The function can be called only when is_key_frame is true since sum is // computed using source frame only. -static AOM_INLINE void fill_variance_4x4avg(const uint8_t *src_buf, - int src_stride, int x8_idx, - int y8_idx, VP8x8 *vst, +static inline void fill_variance_4x4avg(const uint8_t *src_buf, int src_stride, + int x8_idx, int y8_idx, VP8x8 *vst, #if CONFIG_AV1_HIGHBITDEPTH - int highbd_flag, + int highbd_flag, #endif - int pixels_wide, int pixels_high, - int border_offset_4x4) { + int pixels_wide, int pixels_high, + int border_offset_4x4) { for (int idx = 0; idx < 4; idx++) { const int x4_idx = x8_idx + GET_BLK_IDX_X(idx, 2); const int y4_idx = y8_idx + GET_BLK_IDX_Y(idx, 2); @@ -437,7 +436,7 @@ static int64_t scale_part_thresh_content(int64_t threshold_base, int speed, } // Tune thresholds less or more aggressively to prefer larger partitions -static AOM_INLINE void tune_thresh_based_on_qindex( +static inline void tune_thresh_based_on_qindex( AV1_COMP *cpi, int64_t thresholds[], uint64_t block_sad, int current_qindex, int num_pixels, bool is_segment_id_boosted, int source_sad_nonrd, int lighting_change) { @@ -562,7 +561,7 @@ static void set_vbp_thresholds_key_frame(AV1_COMP *cpi, int64_t thresholds[], thresholds[4] = threshold_base << 2; } -static AOM_INLINE void tune_thresh_based_on_resolution( +static inline void tune_thresh_based_on_resolution( AV1_COMP *cpi, int64_t thresholds[], int64_t threshold_base, int current_qindex, int source_sad_rd, int num_pixels) { if (num_pixels >= RESOLUTION_720P) thresholds[3] = thresholds[3] << 1; @@ -628,11 +627,11 @@ static AOM_INLINE void tune_thresh_based_on_resolution( } // Increase the base partition threshold, based on content and noise level. 
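
The threshold tuning around this hunk derives one base value, scales it up for key frames (the 120x multiplier visible below), fans it out into per-block-size entries, and then nudges individual entries by resolution — e.g. doubling thresholds[3] at 720p and above. A deliberately simplified sketch; the index-to-block-size mapping and the shift amounts here are illustrative assumptions, not libaom's exact constants:

#include <stdint.h>

#define RESOLUTION_720P (1280 * 720)

static void set_thresholds_sketch(int64_t thresholds[5], int64_t qindex_term,
                                  int is_key_frame, int num_pixels) {
  const int multiplier = is_key_frame ? 120 : 1;
  const int64_t base = multiplier * qindex_term;
  thresholds[0] = base;      /* largest blocks */
  thresholds[1] = base;
  thresholds[2] = base >> 1; /* split mid sizes more readily */
  thresholds[3] = base >> 2;
  thresholds[4] = base << 2; /* smallest blocks: strongly prefer merging */
  if (num_pixels >= RESOLUTION_720P) thresholds[3] <<= 1;
}
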
-static AOM_INLINE int64_t tune_base_thresh_content(AV1_COMP *cpi, - int64_t threshold_base, - int content_lowsumdiff, - int source_sad_nonrd, - int num_pixels) { +static inline int64_t tune_base_thresh_content(AV1_COMP *cpi, + int64_t threshold_base, + int content_lowsumdiff, + int source_sad_nonrd, + int num_pixels) { AV1_COMMON *const cm = &cpi->common; int64_t updated_thresh_base = threshold_base; if (cpi->noise_estimate.enabled && content_lowsumdiff && @@ -654,10 +653,12 @@ static AOM_INLINE int64_t tune_base_thresh_content(AV1_COMP *cpi, return updated_thresh_base; } -static AOM_INLINE void set_vbp_thresholds( - AV1_COMP *cpi, int64_t thresholds[], uint64_t blk_sad, int qindex, - int content_lowsumdiff, int source_sad_nonrd, int source_sad_rd, - bool is_segment_id_boosted, int lighting_change) { +static inline void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], + uint64_t blk_sad, int qindex, + int content_lowsumdiff, + int source_sad_nonrd, int source_sad_rd, + bool is_segment_id_boosted, + int lighting_change) { AV1_COMMON *const cm = &cpi->common; const int is_key_frame = frame_is_intra_only(cm); const int threshold_multiplier = is_key_frame ? 120 : 1; @@ -689,10 +690,11 @@ static AOM_INLINE void set_vbp_thresholds( // Set temporal variance low flag for superblock 64x64. // Only first 25 in the array are used in this case. -static AOM_INLINE void set_low_temp_var_flag_64x64( - CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info, - MACROBLOCKD *xd, VP64x64 *vt, const int64_t thresholds[], int mi_col, - int mi_row) { +static inline void set_low_temp_var_flag_64x64(CommonModeInfoParams *mi_params, + PartitionSearchInfo *part_info, + MACROBLOCKD *xd, VP64x64 *vt, + const int64_t thresholds[], + int mi_col, int mi_row) { if (xd->mi[0]->bsize == BLOCK_64X64) { if ((vt->part_variances).none.variance < (thresholds[0] >> 1)) part_info->variance_low[0] = 1; @@ -741,7 +743,7 @@ static AOM_INLINE void set_low_temp_var_flag_64x64( } } -static AOM_INLINE void set_low_temp_var_flag_128x128( +static inline void set_low_temp_var_flag_128x128( CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info, MACROBLOCKD *xd, VP128x128 *vt, const int64_t thresholds[], int mi_col, int mi_row) { @@ -826,7 +828,7 @@ static AOM_INLINE void set_low_temp_var_flag_128x128( } } -static AOM_INLINE void set_low_temp_var_flag( +static inline void set_low_temp_var_flag( AV1_COMP *cpi, PartitionSearchInfo *part_info, MACROBLOCKD *xd, VP128x128 *vt, int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, int mi_col, int mi_row, const bool is_small_sb) { @@ -984,11 +986,10 @@ void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int qindex, } } -static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, unsigned int y_sad, - unsigned int y_sad_g, - unsigned int y_sad_alt, bool is_key_frame, - bool zero_motion, unsigned int *uv_sad) { +static inline void chroma_check(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + unsigned int y_sad, unsigned int y_sad_g, + unsigned int y_sad_alt, bool is_key_frame, + bool zero_motion, unsigned int *uv_sad) { MACROBLOCKD *xd = &x->e_mbd; const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; int shift_upper_limit = 1; @@ -1208,7 +1209,7 @@ static void fill_variance_tree_leaves( } } -static AOM_INLINE void set_ref_frame_for_partition( +static inline void set_ref_frame_for_partition( AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, MV_REFERENCE_FRAME *ref_frame_partition, MB_MODE_INFO *mi, unsigned int *y_sad, 
unsigned int *y_sad_g, unsigned int *y_sad_alt, @@ -1250,10 +1251,9 @@ static AOM_FORCE_INLINE int mv_distance(const FULLPEL_MV *mv0, return abs(mv0->row - mv1->row) + abs(mv0->col - mv1->col); } -static AOM_INLINE void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x, - unsigned int *y_sad, - bool is_small_sb, - int est_motion) { +static inline void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x, + unsigned int *y_sad, bool is_small_sb, + int est_motion) { const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; // TODO(yunqingwang@google.com): test if this condition works with other // speeds. @@ -1490,7 +1490,7 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, // Decides whether to split or merge a 16x16 partition block in variance based // partitioning based on the 8x8 sub-block variances. -static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var( +static inline PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var( VP16x16 *var_16x16_info, int64_t threshold16) { int max_8x8_var = 0, min_8x8_var = INT_MAX; for (int split_idx = 0; split_idx < 4; split_idx++) { @@ -1509,7 +1509,7 @@ static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var( : PART_EVAL_ONLY_NONE; } -static AOM_INLINE bool is_set_force_zeromv_skip_based_on_src_sad( +static inline bool is_set_force_zeromv_skip_based_on_src_sad( int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) { if (set_zeromv_skip_based_on_source_sad == 0) return false; @@ -1523,7 +1523,7 @@ static AOM_INLINE bool is_set_force_zeromv_skip_based_on_src_sad( return false; } -static AOM_INLINE bool set_force_zeromv_skip_for_sb( +static inline bool set_force_zeromv_skip_for_sb( AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP128x128 *vt, unsigned int *uv_sad, int mi_row, int mi_col, unsigned int y_sad, BLOCK_SIZE bsize) { diff --git a/av1/encoder/x86/error_intrin_sse2.c b/av1/encoder/x86/error_intrin_sse2.c index 9aa58e9ab9..6100cf1a32 100644 --- a/av1/encoder/x86/error_intrin_sse2.c +++ b/av1/encoder/x86/error_intrin_sse2.c @@ -15,7 +15,7 @@ #include "aom/aom_integer.h" -static AOM_INLINE __m128i reduce_sum_epi64(__m128i reg) { +static inline __m128i reduce_sum_epi64(__m128i reg) { __m128i reg_hi = _mm_srli_si128(reg, 8); reg = _mm_add_epi64(reg, reg_hi); diff --git a/av1/encoder/x86/ml_sse3.c b/av1/encoder/x86/ml_sse3.c index e5d5dead86..d54dadfb24 100644 --- a/av1/encoder/x86/ml_sse3.c +++ b/av1/encoder/x86/ml_sse3.c @@ -245,7 +245,7 @@ void av1_nn_predict_sse3(const float *input_nodes, // Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential // Function. Neural Computation, 11(4):853–862, 1999. -static AOM_INLINE __m128 approx_exp(__m128 y) { +static inline __m128 approx_exp(__m128 y) { #define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) #define B \ 127 // Offset for the exponent according to IEEE floating point standard. 
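
The approx_exp() above is Schraudolph's trick: writing y / ln(2) into the exponent field of an IEEE-754 float evaluates 2^(y/ln 2) = e^y, with A = (1 << 23) / ln(2) doing the scaling and B placing the exponent bias. A scalar restatement; the correction constant is taken as 0 here, which makes the result exact at y = 0 (the SIMD version subtracts a tuned C to reduce average error instead):

#include <stdint.h>
#include <string.h>

static float approx_exp_scalar(float y) {
  const float A = (1 << 23) / 0.69314718056f; /* 2^23 / ln(2) */
  const int32_t B = 127 << 23;                /* exponent bias in place */
  int32_t i = (int32_t)(A * y) + B;
  float f;
  memcpy(&f, &i, sizeof(f)); /* reinterpret the bits as a float */
  return f;                  /* valid while the exponent stays in range */
}
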
@@ -261,7 +261,7 @@ static AOM_INLINE __m128 approx_exp(__m128 y) { #undef C } -static AOM_INLINE __m128 reduce_max(__m128 reg) { +static inline __m128 reduce_max(__m128 reg) { __m128 tmp_reg; tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10 @@ -273,7 +273,7 @@ static AOM_INLINE __m128 reduce_max(__m128 reg) { return reg; } -static AOM_INLINE __m128 reduce_sum(__m128 reg) { +static inline __m128 reduce_sum(__m128 reg) { __m128 tmp_reg; tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10 diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c index 56cbfe44ec..f5375dd296 100644 --- a/av1/encoder/x86/pickrst_avx2.c +++ b/av1/encoder/x86/pickrst_avx2.c @@ -1679,7 +1679,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2( // When params->r[0] > 0 and params->r[1] > 0. In this case all elements of // C and H need to be computed. -static AOM_INLINE void calc_proj_params_r0_r1_avx2( +static inline void calc_proj_params_r0_r1_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { @@ -1772,12 +1772,11 @@ static AOM_INLINE void calc_proj_params_r0_r1_avx2( // When only params->r[0] > 0. In this case only H[0][0] and C[0] are // non-zero and need to be computed. -static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width, - int height, int src_stride, - const uint8_t *dat8, - int dat_stride, int32_t *flt0, - int flt0_stride, - int64_t H[2][2], int64_t C[2]) { +static inline void calc_proj_params_r0_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; @@ -1831,12 +1830,11 @@ static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width, // When only params->r[1] > 0. In this case only H[1][1] and C[1] are // non-zero and need to be computed. 
-static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width, - int height, int src_stride, - const uint8_t *dat8, - int dat_stride, int32_t *flt1, - int flt1_stride, - int64_t H[2][2], int64_t C[2]) { +static inline void calc_proj_params_r1_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; @@ -1909,7 +1907,7 @@ void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height, } #if CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2( +static inline void calc_proj_params_r0_r1_high_bd_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { @@ -2000,7 +1998,7 @@ static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2( C[1] /= size; } -static AOM_INLINE void calc_proj_params_r0_high_bd_avx2( +static inline void calc_proj_params_r0_high_bd_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { @@ -2055,7 +2053,7 @@ static AOM_INLINE void calc_proj_params_r0_high_bd_avx2( C[0] /= size; } -static AOM_INLINE void calc_proj_params_r1_high_bd_avx2( +static inline void calc_proj_params_r1_high_bd_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c index 0e155afcc7..1bd12e9e02 100644 --- a/av1/encoder/x86/pickrst_sse4.c +++ b/av1/encoder/x86/pickrst_sse4.c @@ -860,7 +860,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1( // When params->r[0] > 0 and params->r[1] > 0. In this case all elements of // C and H need to be computed. -static AOM_INLINE void calc_proj_params_r0_r1_sse4_1( +static inline void calc_proj_params_r0_r1_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { @@ -947,10 +947,12 @@ static AOM_INLINE void calc_proj_params_r0_r1_sse4_1( // When only params->r[0] > 0. In this case only H[0][0] and C[0] are // non-zero and need to be computed. -static AOM_INLINE void calc_proj_params_r0_sse4_1( - const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, - int64_t H[2][2], int64_t C[2]) { +static inline void calc_proj_params_r0_sse4_1(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int64_t H[2][2], + int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; @@ -999,10 +1001,12 @@ static AOM_INLINE void calc_proj_params_r0_sse4_1( // When only params->r[1] > 0. In this case only H[1][1] and C[1] are // non-zero and need to be computed. 
-static AOM_INLINE void calc_proj_params_r1_sse4_1( - const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, - int64_t H[2][2], int64_t C[2]) { +static inline void calc_proj_params_r1_sse4_1(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], + int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; @@ -1071,7 +1075,7 @@ void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height, } #if CONFIG_AV1_HIGHBITDEPTH -static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1( +static inline void calc_proj_params_r0_r1_high_bd_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { @@ -1158,7 +1162,7 @@ static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1( // When only params->r[0] > 0. In this case only H[0][0] and C[0] are // non-zero and need to be computed. -static AOM_INLINE void calc_proj_params_r0_high_bd_sse4_1( +static inline void calc_proj_params_r0_high_bd_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { @@ -1210,7 +1214,7 @@ static AOM_INLINE void calc_proj_params_r0_high_bd_sse4_1( // When only params->r[1] > 0. In this case only H[1][1] and C[1] are // non-zero and need to be computed. -static AOM_INLINE void calc_proj_params_r1_high_bd_sse4_1( +static inline void calc_proj_params_r1_high_bd_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c index d8868f5208..f3d85e123b 100644 --- a/av1/encoder/x86/temporal_filter_avx2.c +++ b/av1/encoder/x86/temporal_filter_avx2.c @@ -327,7 +327,7 @@ static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) { } // AVX2 implementation of approx_exp() -static AOM_INLINE __m256 approx_exp_avx2(__m256 y) { +static inline __m256 approx_exp_avx2(__m256 y) { #define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) #define B \ 127 // Offset for the exponent according to IEEE floating point standard. diff --git a/tools/auto_refactor/av1_preprocess.py b/tools/auto_refactor/av1_preprocess.py index 0bcf7f6106..a2e25859a6 100644 --- a/tools/auto_refactor/av1_preprocess.py +++ b/tools/auto_refactor/av1_preprocess.py @@ -66,8 +66,6 @@ def get_av1_pp_command(fake_header_dir, code_file_list): " " "-D'AV1_K_MEANS_DIM=2'" " " - "-D'AOM_INLINE='" - " " "-D'AOM_FORCE_INLINE='" " " "-D'inline='" -- GitLab From fb1f90e0a4abd920fb373207ad5141a4cef37444 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 12 Aug 2024 16:15:46 -0700 Subject: [PATCH 342/391] Remove the bit_depth field of struct av1_extracfg It is not used. 
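For illustration only: the operating bit depth enters through the public
aom_codec_enc_cfg_t instead, so the extracfg copy had no reader. A minimal
sketch of that public path (standard libaom API; error handling trimmed):

    #include "aom/aom_encoder.h"
    #include "aom/aomcx.h"

    /* Sketch: bit depth is configured via the public config struct. */
    static aom_codec_err_t configure_10bit(aom_codec_enc_cfg_t *cfg) {
      aom_codec_err_t res = aom_codec_enc_config_default(
          aom_codec_av1_cx(), cfg, AOM_USAGE_GOOD_QUALITY);
      if (res != AOM_CODEC_OK) return res;
      cfg->g_bit_depth = AOM_BITS_10;  /* operating bit depth */
      cfg->g_input_bit_depth = 10;     /* bit depth of the raw source */
      return AOM_CODEC_OK;
    }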
Change-Id: Ic384e62b91c39c6f9ab5f7980951b839ec6c5b2b --- av1/av1_cx_iface.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 4471bfee47..9b3af757e5 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -101,7 +101,6 @@ struct av1_extracfg { int deltaq_strength; int deltalf_mode; unsigned int frame_periodic_boost; - aom_bit_depth_t bit_depth; aom_tune_content content; aom_color_primaries_t color_primaries; aom_transfer_characteristics_t transfer_characteristics; @@ -281,7 +280,6 @@ static const struct av1_extracfg default_extra_cfg = { 100, // deltaq_strength 0, // delta lf mode 0, // frame_periodic_boost - AOM_BITS_8, // Bit depth AOM_CONTENT_DEFAULT, // content AOM_CICP_CP_UNSPECIFIED, // CICP color primaries AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics @@ -433,7 +431,6 @@ static const struct av1_extracfg default_extra_cfg = { 100, // deltaq_strength 0, // delta lf mode 0, // frame_periodic_boost - AOM_BITS_8, // Bit depth AOM_CONTENT_DEFAULT, // content AOM_CICP_CP_UNSPECIFIED, // CICP color primaries AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics -- GitLab From 6b3edf76bac3ba8ec1ca355de337fdfec8ea8496 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 12 Aug 2024 16:47:49 -0700 Subject: [PATCH 343/391] Compare g_usage with AOM_USAGE_ALL_INTRA Compare priv->cfg.g_usage with AOM_USAGE_ALL_INTRA, not ALLINTRA. AOM_USAGE_ALL_INTRA is a public macro and should be used with the public struct member g_usage. ALLINTRA is an internal enum constant. Change-Id: I76424b2d4e5b593b08f573acd53cd2e0b1fa3459 --- av1/av1_cx_iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 9b3af757e5..a13ba518af 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c @@ -2844,7 +2844,7 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) { // Here we set its default value to 0 when --allintra is turned on. // However, if users set --enable-cdef = 1 from command line, // The encoder still respects it. - if (priv->cfg.g_usage == ALLINTRA) { + if (priv->cfg.g_usage == AOM_USAGE_ALL_INTRA) { priv->extra_cfg.enable_cdef = 0; } av1_initialize_enc(priv->cfg.g_usage, priv->cfg.rc_end_usage); -- GitLab From a5bf431ca94603145061043a0f781ce7fa074683 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 12 Aug 2024 16:26:01 -0700 Subject: [PATCH 344/391] Move AOM_FORCE_INLINE to aom_dsp/aom_dsp_common.h The AOM_FORCE_INLINE macro is for libaom internal use. Move its definition from the public header aom/aom_integer.h to the internal header aom_dsp/aom_dsp_common.h. Bug: aomedia:358402891 Change-Id: I4b15670060f816f00cdb9c9fa1a0f905c8cc9c9e --- aom/aom_integer.h | 6 ------ aom_dsp/aom_dsp_common.h | 6 ++++++ aom_dsp/aom_simd_inline.h | 2 +- aom_dsp/arm/reinterpret_neon.h | 2 +- aom_dsp/arm/transpose_neon.h | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/aom/aom_integer.h b/aom/aom_integer.h index 9660301e51..8c17bd2972 100644 --- a/aom/aom_integer.h +++ b/aom/aom_integer.h @@ -14,12 +14,6 @@ /* get ptrdiff_t, size_t, wchar_t, NULL */ #include <stddef.h> // IWYU pragma: export -#if defined(_MSC_VER) -#define AOM_FORCE_INLINE __forceinline -#else -#define AOM_FORCE_INLINE __inline__ __attribute__((always_inline)) -#endif - /* Assume platforms have the C99 standard integer types. 
*/ #if defined(__cplusplus) diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h index c279ad1fc0..acd0498854 100644 --- a/aom_dsp/aom_dsp_common.h +++ b/aom_dsp/aom_dsp_common.h @@ -23,6 +23,12 @@ extern "C" { #endif +#if defined(_MSC_VER) +#define AOM_FORCE_INLINE __forceinline +#else +#define AOM_FORCE_INLINE __inline__ __attribute__((always_inline)) +#endif + #define PI 3.141592653589793238462643383279502884 #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) diff --git a/aom_dsp/aom_simd_inline.h b/aom_dsp/aom_simd_inline.h index 41c29f6b00..85f9ec3226 100644 --- a/aom_dsp/aom_simd_inline.h +++ b/aom_dsp/aom_simd_inline.h @@ -12,7 +12,7 @@ #ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_ #define AOM_AOM_DSP_AOM_SIMD_INLINE_H_ -#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" #ifndef SIMD_INLINE #define SIMD_INLINE static AOM_FORCE_INLINE diff --git a/aom_dsp/arm/reinterpret_neon.h b/aom_dsp/arm/reinterpret_neon.h index c3951183f4..7ea5ab35e2 100644 --- a/aom_dsp/arm/reinterpret_neon.h +++ b/aom_dsp/arm/reinterpret_neon.h @@ -14,7 +14,7 @@ #include <arm_neon.h> -#include "aom/aom_integer.h" // For AOM_FORCE_INLINE. +#include "aom_dsp/aom_dsp_common.h" // For AOM_FORCE_INLINE. #include "config/aom_config.h" #define REINTERPRET_NEON(u, to_sz, to_count, from_sz, from_count, n, q) \ diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h index 5d0804ec19..91cda0f46c 100644 --- a/aom_dsp/arm/transpose_neon.h +++ b/aom_dsp/arm/transpose_neon.h @@ -14,7 +14,7 @@ #include <arm_neon.h> -#include "aom/aom_integer.h" // For AOM_FORCE_INLINE. +#include "aom_dsp/aom_dsp_common.h" // For AOM_FORCE_INLINE. #include "config/aom_config.h" static inline void transpose_elems_u8_8x8( -- GitLab From 98339a35355d6870a87b3be3cb542cd630935f53 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Mon, 12 Aug 2024 13:22:29 -0700 Subject: [PATCH 345/391] cmake: check for -Wmissing-declarations support Bug: aomedia:3416 Change-Id: Ic4eee8b758f91f8689f4a7ff832bca7803f383a9 --- CMakeLists.txt | 6 ++++++ build/cmake/aom_configure.cmake | 1 + 2 files changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 05dcd57d64..9f9a1b3958 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -472,6 +472,12 @@ if(CONFIG_LIBYUV OR CONFIG_TUNE_BUTTERAUGLI) add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES}) if(NOT MSVC) target_compile_options(yuv PRIVATE -Wno-shadow) + # Many functions in libyuv trigger this warning when enabled with gcc and + # clang. + is_flag_present(AOM_CXX_FLAGS "-Wmissing-declarations" flag_present) + if(flag_present) + target_compile_options(yuv PRIVATE -Wno-missing-declarations) + endif() # Many functions in libyuv trigger this warning when enabled with clang. 
    is_flag_present(AOM_CXX_FLAGS "-Wmissing-prototypes" flag_present)
    if(flag_present)
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index 44fe8b7579..418ecfac19 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -343,6 +343,7 @@ else()
     add_compiler_flag_if_supported("-Wformat=2")
     add_c_flag_if_supported("-Wimplicit-function-declaration")
     add_compiler_flag_if_supported("-Wlogical-op")
+    add_compiler_flag_if_supported("-Wmissing-declarations")
     if(CMAKE_C_COMPILER_ID MATCHES "Clang")
       add_compiler_flag_if_supported("-Wmissing-prototypes")
     else()
--
GitLab


From 6fcc2fe4b8e33ff5f3ce671579728975685fa7df Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Tue, 13 Aug 2024 11:54:55 -0700
Subject: [PATCH 346/391] Document removal of AOM_INLINE from aom_integer.h

Document the removal of AOM_INLINE and AOM_FORCE_INLINE from
aom/aom_integer.h in the changelog for the libaom v3.10.0 release.

Bug: aomedia:358402891
Change-Id: I646f6ae6cda7041dbe0e160b9e5c20aab8380eb4
---
 CHANGELOG | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index f364f0db41..eb4d0b1052 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+yyyy-mm-dd v3.10.0
+  The definitions of the internal macros AOM_INLINE and AOM_FORCE_INLINE
+  have been removed from the public header aom/aom_integer.h.
+
 2024-06-07 v3.8.3
   This release includes several bug fixes. This release is ABI
   compatible with the last release. See
--
GitLab


From 4d0dea0826e035abd64437ae6e110907eaa33416 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Tue, 13 Aug 2024 11:07:52 -0700
Subject: [PATCH 347/391] cmake: exclude ml*.c w/CONFIG_EXCLUDE_SIMD_MISMATCH=1

This and the removal of cnn_avx2.c fix -Wmissing-prototypes warnings.

Bug: aomedia:3416
Change-Id: Icab81ce147bf89a2e655f6edeb0547e010827d58
---
 av1/av1.cmake | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/av1/av1.cmake b/av1/av1.cmake
index 836281d494..e67ac8dff4 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -322,8 +322,13 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")

-list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c"
-            "${AOM_ROOT}/av1/encoder/x86/ml_sse3.h")
+# The functions defined in these files are removed from rtcd when
+# CONFIG_EXCLUDE_SIMD_MISMATCH=1.
+if(NOT CONFIG_EXCLUDE_SIMD_MISMATCH) + list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/ml_avx2.c") +endif() list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/av1_error_neon.c" @@ -362,7 +373,6 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/encodetxb_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_fwd_txfm_neon.c" "${AOM_ROOT}/av1/encoder/arm/hybrid_fwd_txfm_neon.c" - "${AOM_ROOT}/av1/encoder/arm/ml_neon.c" "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.c" "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.h" "${AOM_ROOT}/av1/encoder/arm/quantize_neon.c" @@ -371,6 +381,13 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c" "${AOM_ROOT}/av1/encoder/arm/wedge_utils_neon.c") +# The functions defined in this file are removed from rtcd when +# CONFIG_EXCLUDE_SIMD_MISMATCH=1. +if(NOT CONFIG_EXCLUDE_SIMD_MISMATCH) + list(APPEND AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/ml_neon.c") +endif() + list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon_dotprod.c") -- GitLab From 38935641ee25b28496edeab3f4fa417ded00c4a2 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 13 Aug 2024 17:04:18 -0700 Subject: [PATCH 348/391] Rename subpel_{x,y}_q4 back to subpel_{x,y}_qn In commit 5971960 the subpel_x_qn and subpel_y_qn parameters were renamed subpel_x_q4 and subpel_y_q4, but the parameter names in the function declarations in av1_rtcd.h are subpel_x_qn and subpel_y_qn. Change the parameter names back to match the names in the function declarations. Change-Id: Ia66e515a0c6aa9bb7cf3ff76fae0fab06826ef63 --- av1/common/x86/convolve_2d_avx2.c | 14 +++++++------- av1/common/x86/convolve_avx2.c | 16 ++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c index d23645ce94..e3ba0f19ef 100644 --- a/av1/common/x86/convolve_2d_avx2.c +++ b/av1/common/x86/convolve_2d_avx2.c @@ -141,19 +141,19 @@ static void convolve_2d_sr_general_avx2( void av1_convolve_2d_sr_avx2( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4, - const int32_t subpel_y_q4, ConvolveParams *conv_params) { - const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4); - const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4); + const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn, + const int32_t subpel_y_qn, ConvolveParams *conv_params) { + const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn); + const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn); const bool use_general = (tap_x == 12 || tap_y == 12); if (use_general) { convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, subpel_x_q4, - subpel_y_q4, conv_params); + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); } else { av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, - subpel_x_q4, subpel_y_q4, conv_params); + subpel_x_qn, subpel_y_qn, conv_params); } } diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c index 98db5ae4da..dddaa4bca2 100644 --- a/av1/common/x86/convolve_avx2.c +++ b/av1/common/x86/convolve_avx2.c 
@@ -512,15 +512,15 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h, const InterpFilterParams *filter_params_y, - const int32_t subpel_y_q4) { - const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4); + const int32_t subpel_y_qn) { + const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); if (vert_tap == 12) { av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, - filter_params_y, subpel_y_q4); + filter_params_y, subpel_y_qn); } else { av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, - filter_params_y, subpel_y_q4); + filter_params_y, subpel_y_qn); } } @@ -901,16 +901,16 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h, const InterpFilterParams *filter_params_x, - const int32_t subpel_x_q4, + const int32_t subpel_x_qn, ConvolveParams *conv_params) { - const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4); + const int horz_tap = get_filter_tap(filter_params_x, subpel_x_qn); if (horz_tap == 12) { av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, - filter_params_x, subpel_x_q4, conv_params); + filter_params_x, subpel_x_qn, conv_params); } else { av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, - filter_params_x, subpel_x_q4, + filter_params_x, subpel_x_qn, conv_params); } } -- GitLab From dc8a68933b13dd1beceded8c072afddd9ad7bd9b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 13 Aug 2024 15:48:28 -0700 Subject: [PATCH 349/391] Add the CONFIG_SVT_AV1 cmake option If CONFIG_SVT_AV1 (default: 1) is set to 0, third_party/SVT-AV1 won't be used. Bug: 359670383 Change-Id: Ibe9a86abb0b900ac389f522bafde587577edc340 --- aom_dsp/aom_dsp.cmake | 16 ++++++++++------ av1/common/filter.h | 4 ++++ av1/common/x86/convolve_2d_avx2.c | 8 ++++++++ av1/common/x86/convolve_avx2.c | 12 ++++++++++++ build/cmake/aom_config_defaults.cmake | 3 +++ 5 files changed, 37 insertions(+), 6 deletions(-) diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index b021b0824f..46e6da3129 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -93,12 +93,16 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c" "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h" - "${AOM_ROOT}/third_party/SVT-AV1/convolve_2d_avx2.h" - "${AOM_ROOT}/third_party/SVT-AV1/convolve_avx2.h" - "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_AVX2.h" - "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_SSE4_1.h" - "${AOM_ROOT}/third_party/SVT-AV1/synonyms.h") + "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h") + +if(CONFIG_SVT_AV1) + list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/third_party/SVT-AV1/convolve_2d_avx2.h" + "${AOM_ROOT}/third_party/SVT-AV1/convolve_avx2.h" + "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_AVX2.h" + "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_SSE4_1.h" + "${AOM_ROOT}/third_party/SVT-AV1/synonyms.h") +endif() list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c" diff --git a/av1/common/filter.h b/av1/common/filter.h index 7073da2ad4..d1b76bd5a7 100644 --- a/av1/common/filter.h +++ b/av1/common/filter.h @@ -307,10 +307,14 @@ static inline int get_filter_tap(const InterpFilterParams *const filter_params, if (filter[1] | filter[6]) { return 6; } +#if CONFIG_SVT_AV1 if 
(filter[2] | filter[5]) { return 4; } return 2; +#else + return 4; +#endif } #ifdef __cplusplus diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c index e3ba0f19ef..521326cab0 100644 --- a/av1/common/x86/convolve_2d_avx2.c +++ b/av1/common/x86/convolve_2d_avx2.c @@ -13,7 +13,9 @@ #include "config/av1_rtcd.h" +#if CONFIG_SVT_AV1 #include "third_party/SVT-AV1/convolve_2d_avx2.h" +#endif #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/aom_filter.h" @@ -143,6 +145,7 @@ void av1_convolve_2d_sr_avx2( int32_t w, int32_t h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn, const int32_t subpel_y_qn, ConvolveParams *conv_params) { +#if CONFIG_SVT_AV1 const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn); const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn); @@ -156,4 +159,9 @@ void av1_convolve_2d_sr_avx2( filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } +#else + convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); +#endif } diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c index dddaa4bca2..d250d88427 100644 --- a/av1/common/x86/convolve_avx2.c +++ b/av1/common/x86/convolve_avx2.c @@ -13,7 +13,9 @@ #include "config/av1_rtcd.h" +#if CONFIG_SVT_AV1 #include "third_party/SVT-AV1/convolve_avx2.h" +#endif #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/convolve_avx2.h" @@ -513,6 +515,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, int32_t h, const InterpFilterParams *filter_params_y, const int32_t subpel_y_qn) { +#if CONFIG_SVT_AV1 const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); if (vert_tap == 12) { @@ -522,6 +525,10 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); } +#else + av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); +#endif } static inline void av1_convolve_x_sr_general_avx2( @@ -903,6 +910,7 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, const InterpFilterParams *filter_params_x, const int32_t subpel_x_qn, ConvolveParams *conv_params) { +#if CONFIG_SVT_AV1 const int horz_tap = get_filter_tap(filter_params_x, subpel_x_qn); if (horz_tap == 12) { @@ -913,4 +921,8 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, filter_params_x, subpel_x_qn, conv_params); } +#else + av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); +#endif } diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index 2f11828ff1..ae0842c6e9 100644 --- a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake @@ -75,6 +75,9 @@ set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).") set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.") set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.") set_aom_config_var(CONFIG_LIBYUV 1 "Enables libyuv scaling/conversion support.") +# Set CONFIG_SVT_AV1 to 0 to avoid the BSD 3-Clause Clear License used by the +# code in third_party/SVT-AV1/. 
+set_aom_config_var(CONFIG_SVT_AV1 1 "Enables SVT-AV1 AVX2 convolution support.") set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1 "Build with high bitdepth support.") -- GitLab From fc170aa1292db916bb427606b3bea3d2ff8a134c Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:32:38 -0700 Subject: [PATCH 350/391] tpl_model.c: remove av1_estimate_txfm_block_entropy() This function was added in: d6f767b482 Estimate coeff cost based on transform stats but never used. Bug: aomedia:3416,aomedia:3018 Change-Id: I867712553fcddd27965d6a8e1cb04878992ed6bd --- av1/encoder/tpl_model.c | 18 ------------------ av1/encoder/tpl_model.h | 16 ---------------- 2 files changed, 34 deletions(-) diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c index 9747aea8b3..e9319b182b 100644 --- a/av1/encoder/tpl_model.c +++ b/av1/encoder/tpl_model.c @@ -2136,24 +2136,6 @@ double av1_estimate_coeff_entropy(double q_step, double b, } } -double av1_estimate_txfm_block_entropy(int q_index, - const double *abs_coeff_mean, - int *qcoeff_arr, int coeff_num) { - double zero_bin_ratio = 2; - double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; - double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; - double est_rate = 0; - // dc coeff - est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0], - zero_bin_ratio, qcoeff_arr[0]); - // ac coeff - for (int i = 1; i < coeff_num; ++i) { - est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[i], - zero_bin_ratio, qcoeff_arr[i]); - } - return est_rate; -} - #if CONFIG_RD_COMMAND void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command) { FILE *fptr = fopen(filepath, "r"); diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h index 434bf2d2b8..eaeeaf2794 100644 --- a/av1/encoder/tpl_model.h +++ b/av1/encoder/tpl_model.h @@ -600,22 +600,6 @@ void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats); double av1_estimate_coeff_entropy(double q_step, double b, double zero_bin_ratio, int qcoeff); -/*!\brief Estimate entropy of a transform block using Laplace dsitribution - * - *\ingroup tpl_modelling - * - * \param[in] q_index quantizer index - * \param[in] abs_coeff_mean array of mean absolute deviations - * \param[in] qcoeff_arr array of quantized coefficients - * \param[in] coeff_num number of coefficients per transform block - * - * \return estimated transform block entropy - * - */ -double av1_estimate_txfm_block_entropy(int q_index, - const double *abs_coeff_mean, - int *qcoeff_arr, int coeff_num); - // TODO(angiebird): Add doxygen description here. int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, int64_t srcrf_dist, int pix_num); -- GitLab From 7bff66059f2816a6aa872945225ce63cc4e65577 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:34:59 -0700 Subject: [PATCH 351/391] firstpass.c: remove av1_firstpass_info_past_count() This function was added in: d8cfe16d90 Let firstpass_info support past stats but never used. 
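The kept counterpart, av1_firstpass_info_future_count(), covers the
lookahead direction. A hypothetical caller (the surrounding context here
is invented; only the function and its signature come from this header):

    /* Assumes an initialized FIRSTPASS_INFO named firstpass_info. */
    const int remaining =
        av1_firstpass_info_future_count(&firstpass_info, /*offset_from_cur=*/0);
    if (remaining == 0) {
      /* No first-pass stats left to consume. */
    }

No equivalent caller for the past direction ever materialized, hence the
removal.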
Bug: aomedia:3416,aomedia:3069,aomedia:3070 Change-Id: Ie319ee89aebb256191e2cd03d0c38972edec4b03 --- av1/encoder/firstpass.c | 8 -------- av1/encoder/firstpass.h | 15 --------------- 2 files changed, 23 deletions(-) diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c index 416ee42526..8bbf79d3f9 100644 --- a/av1/encoder/firstpass.c +++ b/av1/encoder/firstpass.c @@ -1590,11 +1590,3 @@ int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info, } return 0; } - -int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info, - int offset_from_cur) { - if (offset_from_cur >= -firstpass_info->past_stats_count) { - return offset_from_cur + firstpass_info->past_stats_count; - } - return 0; -} diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h index 0832494ee1..bec9ece677 100644 --- a/av1/encoder/firstpass.h +++ b/av1/encoder/firstpass.h @@ -323,21 +323,6 @@ const FIRSTPASS_STATS *av1_firstpass_info_peek( int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info, int offset_from_cur); -/*!\brief Count the past stats before the target in firstpass_info - * Note that the target stats will NOT be counted. - * The target index is as follows. - * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size - * - * \ingroup rate_control - * \param[in] firstpass_info struct of firstpass_info. - * \param[in] offset_from_cur target stats's index offset - * from cur_index. - * \return Number of stats in the past before the target stats - * excluding itself. - */ -int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info, - int offset_from_cur); - /*!\cond */ #define FC_ANIMATION_THRESH 0.15 enum { -- GitLab From 129a3eea1fc45a0aa9b7e77844404eae9792c6ff Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Wed, 14 Aug 2024 15:57:06 -0700 Subject: [PATCH 352/391] Use Doxygen's \deprecated command Change-Id: I961ae05e80b17306e69374d109168cb8af677bad --- aom/aomcx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aom/aomcx.h b/aom/aomcx.h index 3cf3991213..2a9a090b4d 100644 --- a/aom/aomcx.h +++ b/aom/aomcx.h @@ -1530,7 +1530,7 @@ enum aome_enc_control_id { /*!\brief Codec control to set the maximum number of consecutive frame drops, * in units of frames, allowed for the frame dropper in 1 pass * CBR mode, int parameter. Value of zero has no effect. - * Deprecated: use the new control AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR. + * \deprecated Use the new control AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR. */ AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164, -- GitLab From f6b976de024e998f19d3f8c3ed045e3273fcc288 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Wed, 14 Aug 2024 13:34:18 -0700 Subject: [PATCH 353/391] cpu_used_firstpass_test: move early exit check up Nothing but the reference encode is performed when the first pass speed is equivalent to the second pass speed. There are enough encode tests with additional expectations that we don't need to run these. They are slow under the address and undefined behavior sanitizers. 
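The control flow of the change, distilled into a plain C sketch (all
helper names invented; the real test uses GTEST_SKIP(), which also
reports the case as skipped rather than letting it pass trivially):

    #include <stdio.h>

    static void run_reference_encode(int speed) { printf("ref, speed %d\n", speed); }
    static void run_test_encode(int speed) { printf("test, speed %d\n", speed); }

    static void do_test(int first_pass_speed, int second_pass_speed) {
      /* The equality check now runs before the expensive reference
         encode, so matching parameters cost nothing. */
      if (first_pass_speed == second_pass_speed) return;
      run_reference_encode(second_pass_speed);
      run_test_encode(first_pass_speed);
    }

    int main(void) {
      do_test(5, 5);  /* skipped immediately */
      do_test(4, 5);  /* runs both encodes */
      return 0;
    }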
Change-Id: Ifb27ee7611e76bfb378c657ba873a4f6876af5ab
---
 test/cpu_used_firstpass_test.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/cpu_used_firstpass_test.cc b/test/cpu_used_firstpass_test.cc
index 46d97f444c..e1d389f238 100644
--- a/test/cpu_used_firstpass_test.cc
+++ b/test/cpu_used_firstpass_test.cc
@@ -73,6 +73,10 @@ class CpuUsedFirstpassTest
   double GetPsnrDiffThreshold() { return kPsnrDiffThreshold; }

   void DoTest() {
+    if (GET_PARAM(1) == second_pass_cpu_used_) {
+      GTEST_SKIP() << "Reference cpu used values match test cpu used values.";
+    }
+
     libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480,
                                        cfg_.g_timebase.den,
                                        cfg_.g_timebase.num, 0, 30);
@@ -84,7 +88,6 @@ class CpuUsedFirstpassTest
     ref_psnr = GetAveragePsnr();

     first_pass_cpu_used_ = GET_PARAM(1);
-    if (first_pass_cpu_used_ == second_pass_cpu_used_) return;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     psnr_diff = std::abs(ref_psnr - GetAveragePsnr());
     EXPECT_LT(psnr_diff, GetPsnrDiffThreshold())
--
GitLab


From b9c9b61662698d48beba3af813512cb4aa208a11 Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <wtc@google.com>
Date: Tue, 30 Jul 2024 12:10:27 -0700
Subject: [PATCH 354/391] Round framerate later in kf_boost adjustment

Match the VP8 and VP9 code. The current code comes from the following
change in commit 9e96d1b:
  3) use int framerate (rounded) in setting target size in
     calc_iframe_target_size_one_pass_cbr
It would be good if VP8, VP9, and AV1 calculated kf_boost in the same
way.

test_aom_rc still passes with this change. However, if I undo the
change to av1_calc_iframe_target_size_one_pass_cbr() in commit 9e96d1b,
test_aom_rc fails because rc_api_->GetQP() is less than qp by 1.

Change-Id: I06670ed28fcd225e413bc54e10f61cb7f39e3a00
---
 av1/encoder/ratectrl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index 8135da8cde..b8c25387d2 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -2821,9 +2821,9 @@ int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
     }
   } else {
     int kf_boost = 32;
-    int framerate = (int)round(cpi->framerate);
+    double framerate = cpi->framerate;

-    kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+    kf_boost = AOMMAX(kf_boost, (int)round(2 * framerate - 16));
     if (rc->frames_since_key < framerate / 2) {
       kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
     }
--
GitLab


From 131c3d5fe27a5b34afcc6c24585006bb3ce96157 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Wed, 14 Aug 2024 13:04:10 -0700
Subject: [PATCH 355/391] allintra_end_to_end_test: mark some tests as 'Large'

cpu-used=5, and cpu-used=[6, 8] & aq-mode=[1, 2]. This reduces the
number of combinations for the presubmit where the address and
undefined behavior sanitizers are slow.
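Worked out for the single-threaded instantiation below: the old suite ran
cpu-used {5..8} x aq-mode {0..3} = 16 combinations per test vector. The
default suite now keeps cpu-used {6..8} x aq-mode {0,3} = 6 of them,
while aq-mode {1,2} (another 6) and all of cpu-used 5 (4 more) move to
the Large suites; the total is still 16, but the sanitizer presubmit
only pays for 6.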
Change-Id: I4c0c2f4be7a25236321f67ec151861c57d8125d9 --- test/allintra_end_to_end_test.cc | 39 ++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/test/allintra_end_to_end_test.cc b/test/allintra_end_to_end_test.cc index e588f610a1..ba0f0a9d60 100644 --- a/test/allintra_end_to_end_test.cc +++ b/test/allintra_end_to_end_test.cc @@ -125,11 +125,28 @@ class AllIntraEndToEndTest int enable_tx_size_search_; }; +using AllIntraEndToEndTestLarge = AllIntraEndToEndTest; +using AllIntraEndToEndTestLarge2 = AllIntraEndToEndTestLarge; + TEST_P(AllIntraEndToEndTest, EndToEndNoFailure) { DoTest(); } +TEST_P(AllIntraEndToEndTestLarge, EndToEndNoFailure) { DoTest(); } +TEST_P(AllIntraEndToEndTestLarge2, EndToEndNoFailure) { DoTest(); } AV1_INSTANTIATE_TEST_SUITE(AllIntraEndToEndTest, ::testing::ValuesIn(kTestVectors), - ::testing::Range(5, 9), ::testing::Range(0, 4), + ::testing::Range(6, 9), ::testing::Values(0, 3), + ::testing::Values(1), ::testing::Values(1), + ::testing::Values(0, 1)); + +AV1_INSTANTIATE_TEST_SUITE(AllIntraEndToEndTestLarge, + ::testing::ValuesIn(kTestVectors), + ::testing::Range(6, 9), ::testing::Values(1, 2), + ::testing::Values(1), ::testing::Values(1), + ::testing::Values(0, 1)); + +AV1_INSTANTIATE_TEST_SUITE(AllIntraEndToEndTestLarge2, + ::testing::ValuesIn(kTestVectors), + ::testing::Values(5), ::testing::Range(0, 4), ::testing::Values(1), ::testing::Values(1), ::testing::Values(0, 1)); @@ -138,7 +155,25 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( ::testing::Values( static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)), - ::testing::ValuesIn(kTestVectors), ::testing::Range(5, 9), + ::testing::ValuesIn(kTestVectors), ::testing::Range(6, 9), + ::testing::Values(0, 3), ::testing::Values(6), ::testing::Values(1), + ::testing::Values(0, 1))); + +INSTANTIATE_TEST_SUITE_P( + AV1MultiThreaded, AllIntraEndToEndTestLarge, + ::testing::Combine( + ::testing::Values( + static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)), + ::testing::ValuesIn(kTestVectors), ::testing::Range(6, 9), + ::testing::Values(1, 2), ::testing::Values(6), ::testing::Values(1), + ::testing::Values(0, 1))); + +INSTANTIATE_TEST_SUITE_P( + AV1MultiThreaded, AllIntraEndToEndTestLarge2, + ::testing::Combine( + ::testing::Values( + static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)), + ::testing::ValuesIn(kTestVectors), ::testing::Values(5), ::testing::Range(0, 4), ::testing::Values(6), ::testing::Values(1), ::testing::Values(0, 1))); -- GitLab From 4a0eca4907ae85e3f1230ae10394770062b516c0 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 16:55:49 -0700 Subject: [PATCH 356/391] make aom_sse_to_psnr() static w/CONFIG_INTERNAL_STATS=0 Bug: aomedia:3416 Change-Id: I3533d50bb6a3d2096e5ab96d147ec5a57391022b --- aom_dsp/psnr.c | 11 ++++++++++- aom_dsp/psnr.h | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/aom_dsp/psnr.c b/aom_dsp/psnr.c index b174c1e92f..af166f0013 100644 --- a/aom_dsp/psnr.c +++ b/aom_dsp/psnr.c @@ -12,12 +12,19 @@ #include <assert.h> #include <math.h> +#include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/psnr.h" #include "aom_scale/yv12config.h" -double aom_sse_to_psnr(double samples, double peak, double sse) { +#if CONFIG_INTERNAL_STATS +#define STATIC +#else +#define STATIC static +#endif // CONFIG_INTERNAL_STATS + +STATIC double aom_sse_to_psnr(double samples, double peak, double sse) { if (sse > 0.0) { const double psnr = 10.0 * 
log10(samples * peak * peak / sse); return psnr > MAX_PSNR ? MAX_PSNR : psnr; @@ -26,6 +33,8 @@ double aom_sse_to_psnr(double samples, double peak, double sse) { } } +#undef STATIC + static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h) { int i, j; diff --git a/aom_dsp/psnr.h b/aom_dsp/psnr.h index f4fd1d6418..87718f7afb 100644 --- a/aom_dsp/psnr.h +++ b/aom_dsp/psnr.h @@ -13,6 +13,7 @@ #define AOM_AOM_DSP_PSNR_H_ #include "aom_scale/yv12config.h" +#include "config/aom_config.h" #define MAX_PSNR 100.0 @@ -29,6 +30,7 @@ typedef struct { uint32_t samples_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth } PSNR_STATS; +#if CONFIG_INTERNAL_STATS /*!\brief Converts SSE to PSNR * * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR). @@ -38,6 +40,7 @@ typedef struct { * \param[in] sse Sum of squared errors */ double aom_sse_to_psnr(double samples, double peak, double sse); +#endif // CONFIG_INTERNAL_STATS uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height); uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, -- GitLab From acae43ef62d6f7543d7b8023ec8f7747d955c7fa Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:01:18 -0700 Subject: [PATCH 357/391] av1_loopfilter.c: make av1_get_filter_level() static This function is unused outside of this file. Bug: aomedia:3416 Change-Id: Ifc5118d0ebac310350124f581dfd6066942d1307 --- av1/common/av1_loopfilter.c | 30 +++++++++++++++--------------- av1/common/av1_loopfilter.h | 4 ---- av1/encoder/picklpf.c | 3 ++- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c index 83549597e1..043c51bb90 100644 --- a/av1/common/av1_loopfilter.c +++ b/av1/common/av1_loopfilter.c @@ -65,9 +65,10 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { } } -uint8_t av1_get_filter_level(const AV1_COMMON *cm, - const loop_filter_info_n *lfi_n, const int dir_idx, - int plane, const MB_MODE_INFO *mbmi) { +static uint8_t get_filter_level(const AV1_COMMON *cm, + const loop_filter_info_n *lfi_n, + const int dir_idx, int plane, + const MB_MODE_INFO *mbmi) { const int segment_id = mbmi->segment_id; if (cm->delta_q_info.delta_lf_present_flag) { int8_t delta_lf; @@ -266,7 +267,7 @@ static TX_SIZE set_lpf_parameters( // prepare outer edge parameters. 
deblock the edge if it's an edge of a TU { const uint32_t curr_level = - av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); uint32_t level = curr_level; if (coord) { @@ -281,7 +282,7 @@ static TX_SIZE set_lpf_parameters( xd, mi_prev, pv_row, pv_col, plane, scale_horz, scale_vert); const uint32_t pv_lvl = - av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); const int pv_skip_txfm = mi_prev->skip_txfm && is_inter_block(mi_prev); @@ -689,10 +690,10 @@ static AOM_FORCE_INLINE void set_one_param_for_line_luma( } assert(mi_prev); uint8_t level = - av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, mbmi); + get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, mbmi); if (!level) { - level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, - mi_prev); + level = + get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, mi_prev); } const int32_t pu_edge = mi_prev != mbmi; @@ -723,7 +724,7 @@ static AOM_FORCE_INLINE void set_one_param_for_line_luma( } // Similar to set_lpf_parameters, but does so one row/col at a time to reduce -// calls to \ref get_transform_size and \ref av1_get_filter_level +// calls to \ref get_transform_size and \ref get_filter_level static AOM_FORCE_INLINE void set_lpf_parameters_for_line_luma( AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf, const AV1_COMMON *const cm, const MACROBLOCKD *const xd, @@ -822,18 +823,17 @@ static AOM_FORCE_INLINE void set_one_param_for_line_chroma( *min_dim = is_vert ? tx_size_high[pv_ts] : tx_size_wide[pv_ts]; } - uint8_t level = - av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + uint8_t level = get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); if (!level) { - level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + level = get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); } #ifndef NDEBUG if (joint_filter_chroma) { uint8_t v_level = - av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, mbmi); + get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, mbmi); if (!v_level) { - v_level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, - mi_prev); + v_level = + get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, mi_prev); } assert(level == v_level); } diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h index 2ecc47c72b..6c02f9fa23 100644 --- a/av1/common/av1_loopfilter.h +++ b/av1/common/av1_loopfilter.h @@ -139,10 +139,6 @@ void av1_filter_block_plane_horz_opt_chroma( TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, int num_mis_in_lpf_unit_height_log2); -uint8_t av1_get_filter_level(const struct AV1Common *cm, - const loop_filter_info_n *lfi_n, const int dir_idx, - int plane, const MB_MODE_INFO *mbmi); - #ifdef __cplusplus } // extern "C" #endif diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c index abcbb7d481..29a8abdfde 100644 --- a/av1/encoder/picklpf.c +++ b/av1/encoder/picklpf.c @@ -72,7 +72,8 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1]; if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0]; - // set base filters for use of av1_get_filter_level when in DELTA_LF mode + // set base filters for use of get_filter_level (av1_loopfilter.c) when in + // DELTA_LF mode 
switch (plane) { case 0: cm->lf.filter_level[0] = filter_level[0]; -- GitLab From f46501b1e410d45ac6b894a1641a6331bcfbaacc Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:07:56 -0700 Subject: [PATCH 358/391] restoration.c: make av1_foreach_rest_unit_in_plane() static This function is unused outside of this file. Bug: aomedia:3416 Change-Id: I3fa550d15049ceb4c53a80c05446977f1d988155 --- av1/common/restoration.c | 83 ++++++++++++++++++++-------------------- av1/common/restoration.h | 6 --- 2 files changed, 42 insertions(+), 47 deletions(-) diff --git a/av1/common/restoration.c b/av1/common/restoration.c index 4e0dee54c2..8aaddef4d5 100644 --- a/av1/common/restoration.c +++ b/av1/common/restoration.c @@ -1140,6 +1140,46 @@ void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, } } +// Call on_rest_unit for each loop restoration unit in the plane. +static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, + rest_unit_visitor_t on_rest_unit, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + const int hnum_rest_units = rsi->horz_units; + const int vnum_rest_units = rsi->vert_units; + const int unit_size = rsi->restoration_unit_size; + + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int ext_size = unit_size * 3 / 2; + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + int y0 = 0, i = 0; + while (y0 < plane_h) { + int remaining_h = plane_h - y0; + int h = (remaining_h < ext_size) ? remaining_h : unit_size; + + RestorationTileLimits limits; + limits.v_start = y0; + limits.v_end = y0 + h; + assert(limits.v_end <= plane_h); + // Offset upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(0, limits.v_start - voffset); + if (limits.v_end < plane_h) limits.v_end -= voffset; + + av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size, + hnum_rest_units, vnum_rest_units, plane, priv, + tmpbuf, rlbs, av1_lr_sync_read_dummy, + av1_lr_sync_write_dummy, NULL, cm->error); + + y0 += h; + ++i; + } +} + static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm, int num_planes) { FilterFrameCtxt *ctxt = lr_ctxt->ctxt; @@ -1149,8 +1189,8 @@ static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm, continue; } - av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, - &ctxt[plane], cm->rst_tmpbuf, cm->rlbs); + foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, &ctxt[plane], + cm->rst_tmpbuf, cm->rlbs); } } @@ -1234,45 +1274,6 @@ void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, (void)plane; } -void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, - rest_unit_visitor_t on_rest_unit, - void *priv, int32_t *tmpbuf, - RestorationLineBuffers *rlbs) { - const RestorationInfo *rsi = &cm->rst_info[plane]; - const int hnum_rest_units = rsi->horz_units; - const int vnum_rest_units = rsi->vert_units; - const int unit_size = rsi->restoration_unit_size; - - const int is_uv = plane > 0; - const int ss_y = is_uv && cm->seq_params->subsampling_y; - const int ext_size = unit_size * 3 / 2; - int plane_w, plane_h; - av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); - - int y0 = 0, i = 0; - while (y0 < plane_h) { - int remaining_h = plane_h - y0; - int h = (remaining_h < ext_size) ? 
remaining_h : unit_size; - - RestorationTileLimits limits; - limits.v_start = y0; - limits.v_end = y0 + h; - assert(limits.v_end <= plane_h); - // Offset upwards to align with the restoration processing stripe - const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; - limits.v_start = AOMMAX(0, limits.v_start - voffset); - if (limits.v_end < plane_h) limits.v_end -= voffset; - - av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size, - hnum_rest_units, vnum_rest_units, plane, priv, - tmpbuf, rlbs, av1_lr_sync_read_dummy, - av1_lr_sync_write_dummy, NULL, cm->error); - - y0 += h; - ++i; - } -} - int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rcol0, int *rcol1, int *rrow0, diff --git a/av1/common/restoration.h b/av1/common/restoration.h index a73190e1dc..0b5d62bf6b 100644 --- a/av1/common/restoration.h +++ b/av1/common/restoration.h @@ -419,12 +419,6 @@ typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane); typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c, const int sb_cols, int plane); -// Call on_rest_unit for each loop restoration unit in the plane. -void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, - rest_unit_visitor_t on_rest_unit, - void *priv, int32_t *tmpbuf, - RestorationLineBuffers *rlbs); - // Return 1 iff the block at mi_row, mi_col with size bsize is a // top-level superblock containing the top-left corner of at least one // loop restoration unit. -- GitLab From 648ff2d87703389885ff1067624da9490e3537af Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:09:05 -0700 Subject: [PATCH 359/391] aq_cyclicrefresh.c: make av1_cyclic_refresh_reset_resize() static This function is unused outside of this file. Bug: aomedia:3416 Change-Id: I06a865117dd472659652cd6316ef2c8fb6a9ebf4 --- av1/encoder/aq_cyclicrefresh.c | 28 ++++++++++++++-------------- av1/encoder/aq_cyclicrefresh.h | 2 -- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c index e4d942a0fd..5c087267c7 100644 --- a/av1/encoder/aq_cyclicrefresh.c +++ b/av1/encoder/aq_cyclicrefresh.c @@ -540,6 +540,19 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { } } +static void cyclic_refresh_reset_resize(AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + cr->sb_index = 0; + cr->last_sb_index = 0; + cpi->refresh_frame.golden_frame = true; + cr->apply_cyclic_refresh = 0; + cr->counter_encode_maxq_scene_change = 0; + cr->percent_refresh_adjustment = 5; + cr->rate_ratio_qdelta_adjustment = 0.25; +} + // Setup cyclic background refresh: set delta q and segmentation map. 
void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; @@ -560,7 +573,7 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { cm->height != cm->prev_frame->height) && cpi->svc.prev_number_spatial_layers == cpi->svc.number_spatial_layers; - if (resolution_change) av1_cyclic_refresh_reset_resize(cpi); + if (resolution_change) cyclic_refresh_reset_resize(cpi); if (!cr->apply_cyclic_refresh) { // Don't disable and set seg_map to 0 if active_maps is enabled, unless // whole frame is set as inactive (since we only apply cyclic_refresh to @@ -653,19 +666,6 @@ int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) { return cr->rdmult; } -void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) { - const AV1_COMMON *const cm = &cpi->common; - CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); - cr->sb_index = 0; - cr->last_sb_index = 0; - cpi->refresh_frame.golden_frame = true; - cr->apply_cyclic_refresh = 0; - cr->counter_encode_maxq_scene_change = 0; - cr->percent_refresh_adjustment = 5; - cr->rate_ratio_qdelta_adjustment = 0.25; -} - int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int qindex = cpi->common.quant_params.base_qindex; diff --git a/av1/encoder/aq_cyclicrefresh.h b/av1/encoder/aq_cyclicrefresh.h index 245af6ccc6..0613c4386d 100644 --- a/av1/encoder/aq_cyclicrefresh.h +++ b/av1/encoder/aq_cyclicrefresh.h @@ -307,8 +307,6 @@ void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi); int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr); -void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi); - int av1_cyclic_refresh_disable_lf_cdef(struct AV1_COMP *const cpi); static inline int cyclic_refresh_segment_id_boosted(int segment_id) { -- GitLab From 0cbf911d705f5e1c98ab9838531d2052e0d7ceae Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:13:22 -0700 Subject: [PATCH 360/391] encodemb.c: make av1_encode_block_intra() static This function is unused outside of this file. 
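Its one remaining consumer is the visitor wrapper defined next to it,
which is what gets handed to the per-plane iteration
(av1_foreach_transformed_block_in_plane() in this file). A
self-contained sketch of the pattern, with all names invented:

    #include <stdio.h>

    typedef void (*visitor_t)(int block, void *arg);

    /* Stays file-local, like encode_block_intra. */
    static void worker(int block, void *arg) {
      (void)arg;
      printf("block %d\n", block);
    }

    /* Escapes only as a callback, like encode_block_intra_and_set_context. */
    static void worker_and_bookkeeping(int block, void *arg) {
      worker(block, arg);
      /* ...entropy-context updates would go here... */
    }

    static void foreach_block(int n, visitor_t visit, void *arg) {
      for (int i = 0; i < n; ++i) visit(i, arg);
    }

    int main(void) {
      foreach_block(4, worker_and_bookkeeping, NULL);
      return 0;
    }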
Bug: aomedia:3416 Change-Id: Iad38a45bda661b4955ef54092765e9a8f4d7708b --- av1/encoder/encodemb.c | 33 ++++++++++++++++----------------- av1/encoder/encodemb.h | 3 --- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c index c53e8b94d7..a91506b043 100644 --- a/av1/encoder/encodemb.c +++ b/av1/encoder/encodemb.c @@ -717,23 +717,9 @@ void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } } -static void encode_block_intra_and_set_context(int plane, int block, - int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, - arg); - - struct encode_b_args *const args = arg; - MACROBLOCK *x = args->x; - ENTROPY_CONTEXT *a = &args->ta[blk_col]; - ENTROPY_CONTEXT *l = &args->tl[blk_row]; - av1_set_txb_context(x, plane, block, tx_size, a, l); -} - -void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - void *arg) { +static void encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { struct encode_b_args *const args = arg; const AV1_COMP *const cpi = args->cpi; const AV1_COMMON *const cm = &cpi->common; @@ -842,6 +828,19 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, } } +static void encode_block_intra_and_set_context(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); + + struct encode_b_args *const args = arg; + MACROBLOCK *x = args->x; + ENTROPY_CONTEXT *a = &args->ta[blk_col]; + ENTROPY_CONTEXT *l = &args->tl[blk_row]; + av1_set_txb_context(x, plane, block, tx_size, a, l); +} + void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, TRELLIS_OPT_TYPE enable_optimize_b) { diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h index 5f539c48b0..b35265cc2e 100644 --- a/av1/encoder/encodemb.h +++ b/av1/encoder/encodemb.h @@ -146,9 +146,6 @@ static inline void av1_set_txb_context(MACROBLOCK *x, int plane, int block, memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l)); } -void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); - void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, TRELLIS_OPT_TYPE enable_optimize_b); -- GitLab From 9b8fa9166b107d157effa65f814c61e7e97966fe Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:15:15 -0700 Subject: [PATCH 361/391] ethread.c: make av1_compute_num_enc_workers() static This function is unused outside of this file. 
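Like the other static conversions in this series, this is warning
hygiene: a function with external linkage but no prototype in any header
trips -Wmissing-prototypes/-Wmissing-declarations, while internal
linkage avoids the warning and keeps the symbol from being exported. In
miniature (compiles cleanly with cc -c -Wall -Wmissing-prototypes):

    int exported(void);                /* prototype would normally live in a header */
    int exported(void) { return 1; }   /* no warning: a prototype precedes it */

    static int file_local(void) { return 2; }  /* no prototype needed */

    int main(void) { return exported() + file_local(); }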
Bug: aomedia:3416 Change-Id: I93d5d458198631b87d4571be596a6c7b148713f4 --- av1/encoder/ethread.c | 17 ++++++++--------- av1/encoder/ethread.h | 2 -- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c index 6d3171be3d..82b1753f00 100644 --- a/av1/encoder/ethread.c +++ b/av1/encoder/ethread.c @@ -1724,7 +1724,7 @@ int av1_get_max_num_workers(const AV1_COMP *cpi) { } // Computes the number of workers for encoding stage (row/tile multi-threading) -int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) { +static int compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) { if (max_workers <= 1) return 1; if (cpi->oxcf.row_mt) return compute_num_enc_row_mt_workers(&cpi->common, max_workers); @@ -3376,7 +3376,7 @@ static inline int compute_num_tf_workers(const AV1_COMP *cpi) { // found to improve speed. Hence the thread assignment for single-pass encode // is kept based on compute_num_enc_workers(). if (cpi->oxcf.pass < AOM_RC_SECOND_PASS) - return (av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads)); + return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); if (cpi->oxcf.max_threads <= 1) return 1; @@ -3389,22 +3389,22 @@ static inline int compute_num_tf_workers(const AV1_COMP *cpi) { // Computes num_workers for tpl multi-threading. static inline int compute_num_tpl_workers(AV1_COMP *cpi) { - return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for loop filter multi-threading. static inline int compute_num_lf_workers(AV1_COMP *cpi) { - return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for cdef multi-threading. static inline int compute_num_cdef_workers(AV1_COMP *cpi) { - return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for loop-restoration multi-threading. static inline int compute_num_lr_workers(AV1_COMP *cpi) { - return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for pack bitstream multi-threading. 
@@ -3434,14 +3434,13 @@ static int compute_num_mod_workers(AV1_COMP *cpi, if (cpi->oxcf.pass >= AOM_RC_SECOND_PASS) num_mod_workers = 0; else - num_mod_workers = - av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads); break; case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break; case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break; case MOD_GME: num_mod_workers = 1; break; case MOD_ENC: - num_mod_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads); break; case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break; case MOD_CDEF_SEARCH: diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h index 8b89ebc11a..99d8c70374 100644 --- a/av1/encoder/ethread.h +++ b/av1/encoder/ethread.h @@ -118,8 +118,6 @@ void av1_write_tile_obu_mt( unsigned int *max_tile_size, uint32_t *const obu_header_size, uint8_t **tile_data_start, const int num_workers); -int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers); - int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf); int av1_check_fpmt_config(AV1_PRIMARY *const ppi, -- GitLab From 658900ead4508d9760974abc5d272d409ece1940 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:19:50 -0700 Subject: [PATCH 362/391] external_partition.c: make av1_ext_part_init() static This function is unused outside of this file. + make av1_ext_part_send_partition_stats() and av1_get_ext_part_decision_mode() conditional on CONFIG_PARTITION_SEARCH_ORDER. Bug: aomedia:3416 Change-Id: I6702860cb9b46c7c964c93b2ffee610852cfbc40 --- av1/encoder/external_partition.c | 29 ++++++++++++++++------------- av1/encoder/external_partition.h | 11 ++++++----- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/av1/encoder/external_partition.c b/av1/encoder/external_partition.c index d72eab0883..45bd433eb1 100644 --- a/av1/encoder/external_partition.c +++ b/av1/encoder/external_partition.c @@ -11,6 +11,7 @@ #include "av1/common/common.h" #include "av1/encoder/external_partition.h" +#include "config/aom_config.h" aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, aom_ext_part_config_t config, @@ -35,7 +36,7 @@ aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, return AOM_CODEC_OK; } -aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) { +static aom_codec_err_t ext_part_init(ExtPartController *ext_part_controller) { if (ext_part_controller == NULL) { return AOM_CODEC_INVALID_PARAM; } @@ -54,7 +55,7 @@ aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) { return AOM_CODEC_ERROR; } } - return av1_ext_part_init(ext_part_controller); + return ext_part_init(ext_part_controller); } bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, @@ -69,25 +70,26 @@ bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, return true; } -bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, - const aom_partition_stats_t *stats) { +bool av1_ext_part_send_features(ExtPartController *ext_part_controller, + const aom_partition_features_t *features) { assert(ext_part_controller != NULL); assert(ext_part_controller->ready); - assert(stats != NULL); - const aom_ext_part_status_t status = - ext_part_controller->funcs.send_partition_stats( - ext_part_controller->model, stats); + assert(features != 
NULL); + const aom_ext_part_status_t status = ext_part_controller->funcs.send_features( + ext_part_controller->model, features); if (status != AOM_EXT_PART_OK) return false; return true; } -bool av1_ext_part_send_features(ExtPartController *ext_part_controller, - const aom_partition_features_t *features) { +#if CONFIG_PARTITION_SEARCH_ORDER +bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, + const aom_partition_stats_t *stats) { assert(ext_part_controller != NULL); assert(ext_part_controller->ready); - assert(features != NULL); - const aom_ext_part_status_t status = ext_part_controller->funcs.send_features( - ext_part_controller->model, features); + assert(stats != NULL); + const aom_ext_part_status_t status = + ext_part_controller->funcs.send_partition_stats( + ext_part_controller->model, stats); if (status != AOM_EXT_PART_OK) return false; return true; } @@ -96,3 +98,4 @@ aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode( const ExtPartController *ext_part_controller) { return ext_part_controller->funcs.decision_mode; } +#endif // CONFIG_PARTITION_SEARCH_ORDER diff --git a/av1/encoder/external_partition.h b/av1/encoder/external_partition.h index bd299fc5f3..e1ed2dd139 100644 --- a/av1/encoder/external_partition.h +++ b/av1/encoder/external_partition.h @@ -16,6 +16,7 @@ #include "aom/aom_codec.h" #include "aom/aom_external_partition.h" +#include "config/aom_config.h" #ifdef __cplusplus extern "C" { @@ -34,21 +35,21 @@ aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, aom_ext_part_config_t config, ExtPartController *ext_part_controller); -aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller); - aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller); bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, aom_partition_decision_t *decision); -bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, - const aom_partition_stats_t *stats); - bool av1_ext_part_send_features(ExtPartController *ext_part_controller, const aom_partition_features_t *features); +#if CONFIG_PARTITION_SEARCH_ORDER +bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, + const aom_partition_stats_t *stats); + aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode( const ExtPartController *ext_part_controller); +#endif // CONFIG_PARTITION_SEARCH_ORDER /*!\endcond */ #ifdef __cplusplus -- GitLab From fff567c78af52f34f712646526452b34911e2a8a Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:22:02 -0700 Subject: [PATCH 363/391] interp_search.c: make av1_find_interp_filter_match() static This function is unused outside of this file. 
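For context, the helper follows a common cache-probe shape; here is a
minimal hedged sketch of that pattern (hypothetical types and names,
much simplified from the real INTERPOLATION_FILTER_STATS search):

  #include <stdint.h>

  typedef struct { uint32_t filters; } FilterStats; /* stand-in record */

  /* Scan a small per-block stats cache; return the index of the entry
   * matching `key`, or -1 when nothing matches. */
  static int find_match(const FilterStats *stats, int count, uint32_t key) {
    for (int i = 0; i < count; ++i) {
      if (stats[i].filters == key) return i;
    }
    return -1; /* no match result found */
  }

A caller can then treat a non-negative index as a cache hit and skip the
full filter search.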
Bug: aomedia:3416 Change-Id: I8d72c13c8cfd7959fb1479261b4249667f5fa199 --- av1/encoder/interp_search.c | 4 ++-- av1/encoder/interp_search.h | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/av1/encoder/interp_search.c b/av1/encoder/interp_search.c index 8011f3c0d1..f639e75dbc 100644 --- a/av1/encoder/interp_search.c +++ b/av1/encoder/interp_search.c @@ -90,7 +90,7 @@ static inline int find_interp_filter_in_stats( return -1; // no match result found } -int av1_find_interp_filter_match( +static int find_interp_filter_match( MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, const InterpFilter assign_filter, const int need_search, INTERPOLATION_FILTER_STATS *interp_filter_stats, @@ -673,7 +673,7 @@ int64_t av1_interpolation_filter_search( int match_found_idx = -1; const InterpFilter assign_filter = cm->features.interp_filter; - match_found_idx = av1_find_interp_filter_match( + match_found_idx = find_interp_filter_match( mbmi, cpi, assign_filter, need_search, args->interp_filter_stats, args->interp_filter_stats_idx); diff --git a/av1/encoder/interp_search.h b/av1/encoder/interp_search.h index d7df92f0d3..28b036de76 100644 --- a/av1/encoder/interp_search.h +++ b/av1/encoder/interp_search.h @@ -184,12 +184,6 @@ static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = { { 0x00000002 }, { 0x00010002 }, { 0x00020002 }, // y = 2 }; -int av1_find_interp_filter_match( - MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, - const InterpFilter assign_filter, const int need_search, - INTERPOLATION_FILTER_STATS *interp_filter_stats, - int interp_filter_stats_idx); - int64_t av1_interpolation_filter_search( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, -- GitLab From 6e1ce4bb96550c6a05fd50440d28edf75a434ac6 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:23:05 -0700 Subject: [PATCH 364/391] level.c: make some functions static av1_decoder_model_init() and av1_decoder_model_process_frame() are unused outside of this file. av1_decoder_model_print_status() is debug only, so wrap it in `#if 0` to allow it to be used as needed. Bug: aomedia:3416 Change-Id: I6a911f0fe762e108690e864c37801e5a1c576089 --- av1/encoder/level.c | 18 +++++++++++------- av1/encoder/level.h | 10 ---------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/av1/encoder/level.c b/av1/encoder/level.c index 7d35fcc3bd..9a804669b5 100644 --- a/av1/encoder/level.c +++ b/av1/encoder/level.c @@ -560,6 +560,8 @@ static double get_removal_time(int mode, int num_decoded_frame, } } +#if 0 +// Print the status of the decoder model (for debugging). void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) { printf( "\n status %d, num_frame %3d, num_decoded_frame %3d, " @@ -578,10 +580,12 @@ void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) { this_buffer->presentation_time); } } +#endif // op_index is the operating point index. 
-void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level, - int op_index, DECODER_MODEL *const decoder_model) { +static void decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level, + int op_index, + DECODER_MODEL *const decoder_model) { decoder_model->status = DECODER_MODEL_OK; decoder_model->level = level; @@ -719,9 +723,9 @@ DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf( } } -void av1_decoder_model_process_frame(const AV1_COMP *const cpi, - size_t coded_bits, - DECODER_MODEL *const decoder_model) { +static void decoder_model_process_frame(const AV1_COMP *const cpi, + size_t coded_bits, + DECODER_MODEL *const decoder_model) { if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return; const AV1_COMMON *const cm = &cpi->common; @@ -917,7 +921,7 @@ void av1_init_level_info(AV1_COMP *cpi) { // exceeds level constraints. this_model->status = DECODER_MODEL_DISABLED; } else { - av1_decoder_model_init(cpi, level, op_index, this_model); + decoder_model_init(cpi, level, op_index, this_model); } } } @@ -1337,7 +1341,7 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, DECODER_MODEL *const decoder_models = level_info->decoder_models; for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) { - av1_decoder_model_process_frame(cpi, size << 3, &decoder_models[level]); + decoder_model_process_frame(cpi, size << 3, &decoder_models[level]); } // Check whether target level is met. diff --git a/av1/encoder/level.h b/av1/encoder/level.h index d9d642fb04..077825e7b9 100644 --- a/av1/encoder/level.h +++ b/av1/encoder/level.h @@ -189,16 +189,6 @@ aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params, const AV1LevelParams *level_params, int *target_seq_level_idx); -// Print the status of the decoder model(for debugging). -void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model); - -void av1_decoder_model_init(const struct AV1_COMP *const cpi, AV1_LEVEL level, - int op_index, DECODER_MODEL *const decoder_model); - -void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi, - size_t coded_bits, - DECODER_MODEL *const decoder_model); - // This function uses the decoder model to check whether there could be // SMOOTHING_BUFFER_UNDERFLOW or SMOOTHING_BUFFER_OVERFLOW. It does not // update the content of decoder_model, and can be used to target certain -- GitLab From 41f75d142d6420ed5ae81187964af6f3b50354dc Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:26:38 -0700 Subject: [PATCH 365/391] motion_search_facade.c: make a function static av1_compound_single_motion_search_interinter() is unused outside of this file. Bug: aomedia:3416 Change-Id: I899553a428f423b30059d1ac0279632ef57ba188 --- av1/encoder/motion_search_facade.c | 6 +++--- av1/encoder/motion_search_facade.h | 4 ---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c index c45600291a..5169989297 100644 --- a/av1/encoder/motion_search_facade.c +++ b/av1/encoder/motion_search_facade.c @@ -881,7 +881,7 @@ static inline void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, // Wrapper for av1_compound_single_motion_search, for the common case // where the second prediction is also an inter mode. 
-int av1_compound_single_motion_search_interinter( +static int compound_single_motion_search_interinter( const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) { MACROBLOCKD *xd = &x->e_mbd; @@ -919,8 +919,8 @@ static inline void do_masked_motion_search_indexed( tmp_mv[0].as_int = cur_mv[0].as_int; tmp_mv[1].as_int = cur_mv[1].as_int; if (which == 0 || which == 1) { - av1_compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask, - mask_stride, rate_mv, which); + compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask, + mask_stride, rate_mv, which); } else if (which == 2) { const int joint_me_num_refine_iter = cpi->sf.inter_sf.enable_fast_compound_mode_search == 2 diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h index 8cf853c29f..a1cbb77a79 100644 --- a/av1/encoder/motion_search_facade.h +++ b/av1/encoder/motion_search_facade.h @@ -48,10 +48,6 @@ int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode); -int av1_compound_single_motion_search_interinter( - const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, - const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx); - int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *this_mv, const uint8_t *second_pred, -- GitLab From 826bc39afd8e709710964e4c85ac398b4004d8e9 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:27:37 -0700 Subject: [PATCH 366/391] picklpf.c: make av1_get_max_filter_level() static This function is unused outside of this file. Bug: aomedia:3416 Change-Id: I40b4e232c2ec2cfbaa91e15356ea05568202679d --- av1/encoder/picklpf.c | 6 +++--- av1/encoder/picklpf.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c index 29a8abdfde..0851d97b5f 100644 --- a/av1/encoder/picklpf.c +++ b/av1/encoder/picklpf.c @@ -50,7 +50,7 @@ static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, } } -int av1_get_max_filter_level(const AV1_COMP *cpi) { +static int get_max_filter_level(const AV1_COMP *cpi) { if (is_stat_consumption_stage_twopass(cpi)) { return cpi->ppi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER; @@ -105,7 +105,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, int dir) { const AV1_COMMON *const cm = &cpi->common; const int min_filter_level = 0; - const int max_filter_level = av1_get_max_filter_level(cpi); + const int max_filter_level = get_max_filter_level(cpi); int filt_direction = 0; int64_t best_err; int filt_best; @@ -236,7 +236,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, lf->filter_level[1] = 0; } else if (method >= LPF_PICK_FROM_Q) { const int min_filter_level = 0; - const int max_filter_level = av1_get_max_filter_level(cpi); + const int max_filter_level = get_max_filter_level(cpi); const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, seq_params->bit_depth); // based on tests result for rtc test set diff --git a/av1/encoder/picklpf.h b/av1/encoder/picklpf.h index 36f35bdc3b..023b2ab46d 100644 --- a/av1/encoder/picklpf.h +++ b/av1/encoder/picklpf.h @@ -20,7 +20,6 @@ extern "C" { struct yv12_buffer_config; struct AV1_COMP; -int av1_get_max_filter_level(const AV1_COMP *cpi); /*!\brief Algorithm for AV1 loop filter level selection. 
* -- GitLab From 911a06dd1234a0f87507357bf378ca9d1cdb67d1 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:28:49 -0700 Subject: [PATCH 367/391] ratectrl.c: make some functions static av1_get_bpmb_enumerator() and av1_frame_type_qdelta() are unused outside of this file. + remove av1_get_arf_q_index(); the last reference to this function was removed in: f38a7d5533 Add bitrate accuracy estimation. + remove reference to av1_get_arf_q_index_q_mode(); this function was removed in: 45c902bb8f Integrate BITRATE_ACCURACY with new ARF q feature Bug: aomedia:3416,aomedia:3045 Change-Id: I8c693d63152ec6c38381c35377da2a0daa5ffe7c --- av1/encoder/ratectrl.c | 23 +++++++---------------- av1/encoder/ratectrl.h | 30 ------------------------------ 2 files changed, 7 insertions(+), 46 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index b8c25387d2..f1891bfef7 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -164,8 +164,9 @@ double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) { } } -int av1_get_bpmb_enumerator(FRAME_TYPE frame_type, - const int is_screen_content_type) { +// Gets the appropriate bpmb enumerator based on the frame and content type +static int get_bpmb_enumerator(FRAME_TYPE frame_type, + const int is_screen_content_type) { int enumerator; if (is_screen_content_type) { @@ -218,7 +219,7 @@ int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, const int is_screen_content_type = cpi->is_screen_content_type; const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; const double q = av1_convert_qindex_to_q(qindex, bit_depth); - int enumerator = av1_get_bpmb_enumerator(frame_type, is_screen_content_type); + int enumerator = get_bpmb_enumerator(frame_type, is_screen_content_type); assert(correction_factor <= MAX_BPB_FACTOR && correction_factor >= MIN_BPB_FACTOR); @@ -1627,7 +1628,7 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75, 1.50, 1.25, 1.15, 1.0 }; -int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) { +static int frame_type_qdelta(const AV1_COMP *cpi, int q) { const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(gf_group, cpi->gf_frame_index); @@ -1812,7 +1813,7 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi, // Static forced key frames Q restrictions dealt with elsewhere. if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced || (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { - const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality); + const int qdelta = frame_type_qdelta(cpi, active_worst_quality); active_worst_quality = AOMMAX(active_worst_quality + qdelta, active_best_quality); } @@ -1992,16 +1993,6 @@ int av1_q_mode_get_q_index(int base_q_index, int gf_update_type, return active_best_quality; } -// Returns the q_index for the ARF in the GOP. 
-int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth, - double arf_boost_factor) { - int active_best_quality = - get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth); - const int min_boost = get_gf_high_motion_quality(base_q_index, bit_depth); - const int boost = min_boost - active_best_quality; - return min_boost - (int)(boost * arf_boost_factor); -} - static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width, int height, int gf_index, int *bottom_index, int *top_index) { @@ -3866,7 +3857,7 @@ int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { // and qp (==max_QP). This comes from the inverse computation of // av1_rc_bits_per_mb(). q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth); - enumerator = av1_get_bpmb_enumerator(INTER_NORMAL, is_screen_content); + enumerator = get_bpmb_enumerator(INTER_NORMAL, is_screen_content); new_correction_factor = (double)target_bits_per_mb * q2 / enumerator; if (new_correction_factor > rate_correction_factor) { rate_correction_factor = diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h index 69aad47201..21c7568d9e 100644 --- a/av1/encoder/ratectrl.h +++ b/av1/encoder/ratectrl.h @@ -686,10 +686,6 @@ int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame, int width, int height); /*!\cond */ -// Gets the appropriate bpmb ennumerator based on the frame and content type -int av1_get_bpmb_enumerator(FRAME_TYPE frame_type, - const int is_screen_content_type); - // Estimates bits per mb for a given qindex and correction factor. int av1_rc_bits_per_mb(const struct AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, double correction_factor, @@ -719,8 +715,6 @@ int av1_compute_qdelta_by_rate(const struct AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, double rate_target_ratio); -int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q); - void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height); void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi, @@ -857,30 +851,6 @@ int av1_postencode_drop_cbr(struct AV1_COMP *cpi, size_t *size); int av1_q_mode_get_q_index(int base_q_index, int gf_update_type, int gf_pyramid_level, int arf_q); -/*!\brief Compute the q_indices for the ARF of a GOP. - * - * \param[in] base_q_index Base q index - * \param[in] gfu_boost GFU boost - * \param[in] bit_depth Bit depth - * \param[in] arf_boost_factor ARF boost factor - * - * \return Returns the q_index for the ARF frame. - */ -int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth, - double arf_boost_factor); - -#if !CONFIG_REALTIME_ONLY -struct TplDepFrame; -/*!\brief Compute the q_indices for the ARF of a GOP in Q mode. - * - * \param[in] cpi Top level encoder structure - * \param[in] tpl_frame Tpl Frame stats - * - * \return Returns the q_index for the ARF frame. - */ -int av1_get_arf_q_index_q_mode(struct AV1_COMP *cpi, - struct TplDepFrame *tpl_frame); -#endif #ifdef __cplusplus } // extern "C" #endif -- GitLab From 4fc16d157f4f2f7e2abbf0311a23c9e39deebc49 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:31:18 -0700 Subject: [PATCH 368/391] temporal_filter.c: make av1_get_q() static This function is unused outside of this file. 
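A brief aside on the mechanics, as a toy sketch (hypothetical code, not
this file's): moving a static definition above its first caller avoids
the need for a separate forward declaration.

  /* toy.c -- hypothetical ordering example */
  static int get_value(void) { return 42; } /* defined before first use */

  int use_value(void) {
    return get_value(); /* definition above is already in scope */
  }

If the definition stayed below its caller, a forward declaration such as
"static int get_value(void);" would be needed near the top of the file,
which is presumably why the helper is relocated rather than merely
renamed.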
Bug: aomedia:3416 Change-Id: Idd5383abed35064ea24bdd6722743124acc3b3da --- av1/encoder/temporal_filter.c | 25 +++++++++++++------------ av1/encoder/temporal_filter.h | 3 --- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c index 0fdbe647a1..dd42f72833 100644 --- a/av1/encoder/temporal_filter.c +++ b/av1/encoder/temporal_filter.c @@ -84,6 +84,16 @@ static inline void get_log_var_4x4sub_blk( *blk_4x4_var_max = log1p(var_max / 16.0); } +// Helper function to get `q` used for encoding. +static int get_q(const AV1_COMP *cpi) { + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; + const int q = + (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type], + cpi->common.seq_params->bit_depth); + return q; +} + /*!\endcond */ /*!\brief Does motion search for blocks in temporal filtering. This is * the first step for temporal filtering. More specifically, given a frame to @@ -188,7 +198,7 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb, FULLPEL_MV_STATS best_mv_stats; int block_mse = INT_MAX; MV block_mv = kZeroMv; - const int q = av1_get_q(cpi); + const int q = get_q(cpi); av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size, &baseline_mv, start_mv, search_site_cfg, @@ -848,15 +858,6 @@ static void tf_normalize_filtered_frame( } } -int av1_get_q(const AV1_COMP *cpi) { - const GF_GROUP *gf_group = &cpi->ppi->gf_group; - const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; - const int q = - (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type], - cpi->common.seq_params->bit_depth); - return q; -} - void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) { TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; YV12_BUFFER_CONFIG **frames = tf_ctx->frames; @@ -1090,7 +1091,7 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, num_planes - 1, cpi->common.seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD); // Get quantization factor. - const int q = av1_get_q(cpi); + const int q = get_q(cpi); // Get correlation estimates from first-pass; const FIRSTPASS_STATS *stats = cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0); @@ -1377,7 +1378,7 @@ static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx, tf_ctx->mb_rows = mb_rows; tf_ctx->mb_cols = mb_cols; tf_ctx->is_highbitdepth = is_highbitdepth; - tf_ctx->q_factor = av1_get_q(cpi); + tf_ctx->q_factor = get_q(cpi); } int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame, diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h index 36ecc2f10a..9585942c32 100644 --- a/av1/encoder/temporal_filter.h +++ b/av1/encoder/temporal_filter.h @@ -351,9 +351,6 @@ int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame, aom_bit_depth_t bit_depth); /*!\cond */ -// Helper function to get `q` used for encoding. -int av1_get_q(const struct AV1_COMP *cpi); - // Allocates memory for members of TemporalFilterData. // Inputs: // tf_data: Pointer to the structure containing temporal filter related data. 
-- GitLab From c98f2f04081914f818f230a2ab6b7364f5922756 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:11:49 -0700 Subject: [PATCH 369/391] remove av1_get_hier_tpl_rdmult() The last use of this function was removed in: b7e684ea5a Update coding block level rd multiplier Bug: aomedia:3416 Change-Id: Ia43c9e2af06a013f354de32c7a91c5c7c9225df8 --- av1/encoder/encodeframe_utils.c | 80 +-------------------------------- av1/encoder/encodeframe_utils.h | 4 -- 2 files changed, 1 insertion(+), 83 deletions(-) diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c index f66cdcc135..864caa6b20 100644 --- a/av1/encoder/encodeframe_utils.c +++ b/av1/encoder/encodeframe_utils.c @@ -83,28 +83,8 @@ void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi, } #endif -// TODO(angiebird): Move these function to tpl_model.c +// TODO(angiebird): Move this function to tpl_model.c #if !CONFIG_REALTIME_ONLY -// Return the end column for the current superblock, in unit of TPL blocks. -static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col, - int num_mi_w) { - // Find the start column of this superblock. - const int sb_mi_col_start = (mi_col >> cm->seq_params->mib_size_log2) - << cm->seq_params->mib_size_log2; - // Same but in superres upscaled dimension. - const int sb_mi_col_start_sr = - coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator); - // Width of this superblock in mi units. - const int sb_mi_width = mi_size_wide[cm->seq_params->sb_size]; - // Same but in superres upscaled dimension. - const int sb_mi_width_sr = - coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator); - // Superblock end in mi units. - const int sb_mi_end = sb_mi_col_start_sr + sb_mi_width_sr; - // Superblock end in TPL units. 
- return (sb_mi_end + num_mi_w - 1) / num_mi_w; -} - int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const int mi_row, const int mi_col) { @@ -157,64 +137,6 @@ int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, return AOMMAX(deltaq_rdmult, 1); } - -int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, - const BLOCK_SIZE bsize, const int mi_row, - const int mi_col, int orig_rdmult) { - const AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->ppi->gf_group; - assert(IMPLIES(cpi->ppi->gf_group.size > 0, - cpi->gf_frame_index < cpi->ppi->gf_group.size)); - const int tpl_idx = cpi->gf_frame_index; - const int deltaq_rdmult = set_rdmult(cpi, x, -1); - if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult; - if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) - return deltaq_rdmult; - if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult; - - const int mi_col_sr = - coded_to_superres_mi(mi_col, cm->superres_scale_denominator); - const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); - const int block_mi_width_sr = - coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator); - - const BLOCK_SIZE bsize_base = BLOCK_16X16; - const int num_mi_w = mi_size_wide[bsize_base]; - const int num_mi_h = mi_size_high[bsize_base]; - const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; - const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; - const int num_bcols = (block_mi_width_sr + num_mi_w - 1) / num_mi_w; - const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; - // This is required because the end col of superblock may be off by 1 in case - // of superres. 
- const int sb_bcol_end = get_superblock_tpl_column_end(cm, mi_col, num_mi_w); - int row, col; - double base_block_count = 0.0; - double geom_mean_of_scale = 0.0; - for (row = mi_row / num_mi_w; - row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { - for (col = mi_col_sr / num_mi_h; - col < num_cols && col < mi_col_sr / num_mi_h + num_bcols && - col < sb_bcol_end; - ++col) { - const int index = row * num_cols + col; - geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]); - base_block_count += 1.0; - } - } - geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count); - int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5); - rdmult = AOMMAX(rdmult, 0); - av1_set_error_per_bit(&x->errorperbit, rdmult); -#if !CONFIG_RD_COMMAND - if (bsize == cm->seq_params->sb_size) { - const int rdmult_sb = set_rdmult(cpi, x, -1); - assert(rdmult_sb == rdmult); - (void)rdmult_sb; - } -#endif // !CONFIG_RD_COMMAND - return rdmult; -} #endif // !CONFIG_REALTIME_ONLY static inline void update_filter_type_count(FRAME_COUNTS *counts, diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h index 6b6efac297..05afd61ad1 100644 --- a/av1/encoder/encodeframe_utils.h +++ b/av1/encoder/encodeframe_utils.h @@ -356,10 +356,6 @@ int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x, int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const int mi_row, const int mi_col); - -int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, - const BLOCK_SIZE bsize, const int mi_row, - const int mi_col, int orig_rdmult); #endif // !CONFIG_REALTIME_ONLY void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, -- GitLab From 83f00dc6252b6ac08ea5e73b608087a1857e4214 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 17:25:22 -0700 Subject: [PATCH 370/391] mcomp.c: remove av1_get_mvpred_compound_var() The last reference to this function was removed in: aa434ccf42 Remove redundant av1_get_mvpred_compound_var calls Bug: aomedia:3416 Change-Id: Ia320366867c13b0fa1a302ab1f4598043595bba8 --- av1/encoder/mcomp.c | 44 -------------------------------------------- av1/encoder/mcomp.h | 7 ------- 2 files changed, 51 deletions(-) diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c index a66a1e4cf8..1a5e5c5317 100644 --- a/av1/encoder/mcomp.c +++ b/av1/encoder/mcomp.c @@ -3951,47 +3951,3 @@ int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, return sse + mv_err_cost_(&mv, mv_cost_params); } - -static inline int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params, - const FULLPEL_MV best_mv, - const uint8_t *second_pred, - const aom_variance_fn_ptr_t *vfp, - const struct buf_2d *src, - const struct buf_2d *pre) { - const MV mv = get_mv_from_fullmv(&best_mv); - unsigned int unused; - - return vfp->svaf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0, - src->buf, src->stride, &unused, second_pred) + - mv_err_cost_(&mv, mv_cost_params); -} - -static inline int get_mvpred_mask_var( - const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, - const uint8_t *second_pred, const uint8_t *mask, int mask_stride, - int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src, - const struct buf_2d *pre) { - const MV mv = get_mv_from_fullmv(&best_mv); - unsigned int unused; - - return vfp->msvf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0, - src->buf, src->stride, second_pred, mask, mask_stride, - invert_mask, &unused) + - 
mv_err_cost_(&mv, mv_cost_params); -} - -int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params, - const FULLPEL_MV best_mv, - const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, - const aom_variance_fn_ptr_t *vfp, - const struct buf_2d *src, - const struct buf_2d *pre) { - if (mask) { - return get_mvpred_mask_var(mv_cost_params, best_mv, second_pred, mask, - mask_stride, invert_mask, vfp, src, pre); - } else { - return get_mvpred_av_var(mv_cost_params, best_mv, second_pred, vfp, src, - pre); - } -} diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h index 6c3e9fbee5..f55d8690f2 100644 --- a/av1/encoder/mcomp.h +++ b/av1/encoder/mcomp.h @@ -56,13 +56,6 @@ int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src, const struct buf_2d *pre); -int av1_get_mvpred_compound_var(const MV_COST_PARAMS *ms_params, - const FULLPEL_MV best_mv, - const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, - const aom_variance_fn_ptr_t *vfp, - const struct buf_2d *src, - const struct buf_2d *pre); // ============================================================================= // Motion Search -- GitLab From b0f8fecae01ad7890d8a95c4e6c166572d6511ef Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 18:51:13 -0700 Subject: [PATCH 371/391] remove aom_scale.[ch] & gen_scalers.c The last reference to aom_scale_frame() was removed in: 3897e2df73 Simple upscaler w/ frame superres + loop-rest This also removes `CONFIG_SPATIAL_RESAMPLING` which is a carryover from VP8 and not used in the code. Bug: aomedia:3416 Change-Id: I3c5e6b4eeed0bb2d4dbe2c235798e3d8d0d1f922 --- aom_scale/aom_scale.cmake | 5 +- aom_scale/aom_scale.h | 23 -- aom_scale/aom_scale_rtcd.pl | 11 - aom_scale/generic/aom_scale.c | 506 -------------------------- aom_scale/generic/gen_scalers.c | 201 ---------- av1/common/resize.c | 1 - av1/decoder/decodeframe.c | 1 - av1/decoder/decoder.c | 1 - av1/encoder/encoder.c | 1 - av1/encoder/firstpass.c | 1 - av1/encoder/temporal_filter.c | 1 - build/cmake/aom_config_defaults.cmake | 1 - 12 files changed, 1 insertion(+), 752 deletions(-) delete mode 100644 aom_scale/aom_scale.h delete mode 100644 aom_scale/generic/aom_scale.c delete mode 100644 aom_scale/generic/gen_scalers.c diff --git a/aom_scale/aom_scale.cmake b/aom_scale/aom_scale.cmake index 3fe7fb752a..7ad4214bc1 100644 --- a/aom_scale/aom_scale.cmake +++ b/aom_scale/aom_scale.cmake @@ -13,10 +13,7 @@ if(AOM_AOM_SCALE_AOM_SCALE_CMAKE_) endif() # AOM_AOM_SCALE_AOM_SCALE_CMAKE_ set(AOM_AOM_SCALE_AOM_SCALE_CMAKE_ 1) -list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/aom_scale.h" - "${AOM_ROOT}/aom_scale/generic/aom_scale.c" - "${AOM_ROOT}/aom_scale/generic/gen_scalers.c" - "${AOM_ROOT}/aom_scale/generic/yv12config.c" +list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/generic/yv12config.c" "${AOM_ROOT}/aom_scale/generic/yv12extend.c" "${AOM_ROOT}/aom_scale/yv12config.h") diff --git a/aom_scale/aom_scale.h b/aom_scale/aom_scale.h deleted file mode 100644 index 4411397bb2..0000000000 --- a/aom_scale/aom_scale.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved. - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_SCALE_AOM_SCALE_H_ -#define AOM_AOM_SCALE_AOM_SCALE_H_ - -#include "aom_scale/yv12config.h" - -extern void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, - unsigned char *temp_area, unsigned char temp_height, - unsigned int hscale, unsigned int hratio, - unsigned int vscale, unsigned int vratio, - unsigned int interlaced, const int num_planes); - -#endif // AOM_AOM_SCALE_AOM_SCALE_H_ diff --git a/aom_scale/aom_scale_rtcd.pl b/aom_scale/aom_scale_rtcd.pl index 2988383c8d..273e3f9cc1 100644 --- a/aom_scale/aom_scale_rtcd.pl +++ b/aom_scale/aom_scale_rtcd.pl @@ -17,17 +17,6 @@ EOF } forward_decls qw/aom_scale_forward_decls/; -# Scaler functions -if (aom_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") { - add_proto qw/void aom_horizontal_line_5_4_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"; - add_proto qw/void aom_vertical_band_5_4_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width"; - add_proto qw/void aom_horizontal_line_5_3_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"; - add_proto qw/void aom_vertical_band_5_3_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width"; - add_proto qw/void aom_horizontal_line_2_1_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"; - add_proto qw/void aom_vertical_band_2_1_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width"; - add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width"; -} - add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes"; add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes"; diff --git a/aom_scale/generic/aom_scale.c b/aom_scale/generic/aom_scale.c deleted file mode 100644 index 85ae1e2c43..0000000000 --- a/aom_scale/generic/aom_scale.c +++ /dev/null @@ -1,506 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved. - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/**************************************************************************** - * - * Module Title : scale.c - * - * Description : Image scaling functions. 
- * - ***************************************************************************/ - -/**************************************************************************** - * Header Files - ****************************************************************************/ -#include "config/aom_scale_rtcd.h" - -#include "aom_mem/aom_mem.h" -#include "aom_scale/aom_scale.h" -#include "aom_scale/yv12config.h" - -typedef struct { - int expanded_frame_width; - int expanded_frame_height; - - int HScale; - int HRatio; - int VScale; - int VRatio; - - YV12_BUFFER_CONFIG *src_yuv_config; - YV12_BUFFER_CONFIG *dst_yuv_config; - -} SCALE_VARS; - -/**************************************************************************** - * - * ROUTINE : scale1d_2t1_i - * - * INPUTS : const unsigned char *source : Pointer to data to be scaled. - * int source_step : Number of pixels to step on - * in source. - * unsigned int source_scale : Scale for source (UNUSED). - * unsigned int source_length : Length of source (UNUSED). - * unsigned char *dest : Pointer to output data array. - * int dest_step : Number of pixels to step on - * in destination. - * unsigned int dest_scale : Scale for destination - * (UNUSED). - * unsigned int dest_length : Length of destination. - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Performs 2-to-1 interpolated scaling. - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ -static void scale1d_2t1_i(const unsigned char *source, int source_step, - unsigned int source_scale, unsigned int source_length, - unsigned char *dest, int dest_step, - unsigned int dest_scale, unsigned int dest_length) { - const unsigned char *const dest_end = dest + dest_length * dest_step; - (void)source_length; - (void)source_scale; - (void)dest_scale; - - source_step *= 2; // Every other row. - - dest[0] = source[0]; // Special case: 1st pixel. - source += source_step; - dest += dest_step; - - while (dest < dest_end) { - const unsigned int a = 3 * source[-source_step]; - const unsigned int b = 10 * source[0]; - const unsigned int c = 3 * source[source_step]; - *dest = (unsigned char)((8 + a + b + c) >> 4); - source += source_step; - dest += dest_step; - } -} - -/**************************************************************************** - * - * ROUTINE : scale1d_2t1_ps - * - * INPUTS : const unsigned char *source : Pointer to data to be scaled. - * int source_step : Number of pixels to step on - * in source. - * unsigned int source_scale : Scale for source (UNUSED). - * unsigned int source_length : Length of source (UNUSED). - * unsigned char *dest : Pointer to output data array. - * int dest_step : Number of pixels to step on - * in destination. - * unsigned int dest_scale : Scale for destination - * (UNUSED). - * unsigned int dest_length : Length of destination. - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Performs 2-to-1 point subsampled scaling. - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ -static void scale1d_2t1_ps(const unsigned char *source, int source_step, - unsigned int source_scale, - unsigned int source_length, unsigned char *dest, - int dest_step, unsigned int dest_scale, - unsigned int dest_length) { - const unsigned char *const dest_end = dest + dest_length * dest_step; - (void)source_length; - (void)source_scale; - (void)dest_scale; - - source_step *= 2; // Every other row. 
- - while (dest < dest_end) { - *dest = *source; - source += source_step; - dest += dest_step; - } -} -/**************************************************************************** - * - * ROUTINE : scale1d_c - * - * INPUTS : const unsigned char *source : Pointer to data to be scaled. - * int source_step : Number of pixels to step on - * in source. - * unsigned int source_scale : Scale for source. - * unsigned int source_length : Length of source (UNUSED). - * unsigned char *dest : Pointer to output data array. - * int dest_step : Number of pixels to step on - * in destination. - * unsigned int dest_scale : Scale for destination. - * unsigned int dest_length : Length of destination. - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Performs linear interpolation in one dimension. - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ -static void scale1d_c(const unsigned char *source, int source_step, - unsigned int source_scale, unsigned int source_length, - unsigned char *dest, int dest_step, - unsigned int dest_scale, unsigned int dest_length) { - const unsigned char *const dest_end = dest + dest_length * dest_step; - const unsigned int round_value = dest_scale / 2; - unsigned int left_modifier = dest_scale; - unsigned int right_modifier = 0; - unsigned char left_pixel = source[0]; - unsigned char right_pixel = source[source_step]; - - (void)source_length; - - /* These asserts are needed if there are boundary issues... */ - /* assert ( dest_scale > source_scale );*/ - /* assert ( (source_length - 1) * dest_scale >= (dest_length - 1) * - * source_scale);*/ - - while (dest < dest_end) { - *dest = (unsigned char)((left_modifier * left_pixel + - right_modifier * right_pixel + round_value) / - dest_scale); - - right_modifier += source_scale; - - while (right_modifier > dest_scale) { - right_modifier -= dest_scale; - source += source_step; - left_pixel = source[0]; - right_pixel = source[source_step]; - } - - left_modifier = dest_scale - right_modifier; - } -} - -/**************************************************************************** - * - * ROUTINE : Scale2D - * - * INPUTS : const unsigned char *source : Pointer to data to be - * scaled. - * int source_pitch : Stride of source image. - * unsigned int source_width : Width of input image. - * unsigned int source_height : Height of input image. - * unsigned char *dest : Pointer to output data - * array. - * int dest_pitch : Stride of destination - * image. - * unsigned int dest_width : Width of destination image. - * unsigned int dest_height : Height of destination - * image. - * unsigned char *temp_area : Pointer to temp work area. - * unsigned char temp_area_height : Height of temp work area. - * unsigned int hscale : Horizontal scale factor - * numerator. - * unsigned int hratio : Horizontal scale factor - * denominator. - * unsigned int vscale : Vertical scale factor - * numerator. - * unsigned int vratio : Vertical scale factor - * denominator. - * unsigned int interlaced : Interlace flag. - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Performs 2-tap linear interpolation in two dimensions. - * - * SPECIAL NOTES : Expansion is performed one band at a time to help with - * caching. 
- * - ****************************************************************************/ -static void Scale2D( - /*const*/ - unsigned char *source, int source_pitch, unsigned int source_width, - unsigned int source_height, unsigned char *dest, int dest_pitch, - unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area, - unsigned char temp_area_height, unsigned int hscale, unsigned int hratio, - unsigned int vscale, unsigned int vratio, unsigned int interlaced) { - unsigned int i, j, k; - unsigned int bands; - unsigned int dest_band_height; - unsigned int source_band_height; - - typedef void (*Scale1D)(const unsigned char *source, int source_step, - unsigned int source_scale, unsigned int source_length, - unsigned char *dest, int dest_step, - unsigned int dest_scale, unsigned int dest_length); - - Scale1D Scale1Dv = scale1d_c; - Scale1D Scale1Dh = scale1d_c; - - void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, - unsigned int) = NULL; - void (*vert_band_scale)(unsigned char *, int, unsigned char *, int, - unsigned int) = NULL; - - int ratio_scalable = 1; - int interpolation = 0; - - unsigned char *source_base; - unsigned char *line_src; - - source_base = (unsigned char *)source; - - if (source_pitch < 0) { - int offset; - - offset = (source_height - 1); - offset *= source_pitch; - - source_base += offset; - } - - /* find out the ratio for each direction */ - switch (hratio * 10 / hscale) { - case 8: - /* 4-5 Scale in Width direction */ - horiz_line_scale = aom_horizontal_line_5_4_scale; - break; - case 6: - /* 3-5 Scale in Width direction */ - horiz_line_scale = aom_horizontal_line_5_3_scale; - break; - case 5: - /* 1-2 Scale in Width direction */ - horiz_line_scale = aom_horizontal_line_2_1_scale; - break; - default: - /* The ratio is not acceptable now */ - /* throw("The ratio is not acceptable for now!"); */ - ratio_scalable = 0; - break; - } - - switch (vratio * 10 / vscale) { - case 8: - /* 4-5 Scale in vertical direction */ - vert_band_scale = aom_vertical_band_5_4_scale; - source_band_height = 5; - dest_band_height = 4; - break; - case 6: - /* 3-5 Scale in vertical direction */ - vert_band_scale = aom_vertical_band_5_3_scale; - source_band_height = 5; - dest_band_height = 3; - break; - case 5: - /* 1-2 Scale in vertical direction */ - - if (interlaced) { - /* if the content is interlaced, point sampling is used */ - vert_band_scale = aom_vertical_band_2_1_scale; - } else { - interpolation = 1; - /* if the content is progressive, interplo */ - vert_band_scale = aom_vertical_band_2_1_scale_i; - } - - source_band_height = 2; - dest_band_height = 1; - break; - default: - /* The ratio is not acceptable now */ - /* throw("The ratio is not acceptable for now!"); */ - ratio_scalable = 0; - break; - } - - if (ratio_scalable) { - if (source_height == dest_height) { - /* for each band of the image */ - for (k = 0; k < dest_height; ++k) { - horiz_line_scale(source, source_width, dest, dest_width); - source += source_pitch; - dest += dest_pitch; - } - - return; - } - - if (interpolation) { - if (source < source_base) source = source_base; - - horiz_line_scale(source, source_width, temp_area, dest_width); - } - - for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height; - ++k) { - /* scale one band horizontally */ - for (i = 0; i < source_band_height; ++i) { - /* Trap case where we could read off the base of the source buffer */ - - line_src = source + i * source_pitch; - - if (line_src < source_base) line_src = source_base; - - 
horiz_line_scale(line_src, source_width, - temp_area + (i + 1) * dest_pitch, dest_width); - } - - /* Vertical scaling is in place */ - vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, - dest_width); - - if (interpolation) - memcpy(temp_area, temp_area + source_band_height * dest_pitch, - dest_width); - - /* Next band... */ - source += (unsigned long)source_band_height * source_pitch; - dest += (unsigned long)dest_band_height * dest_pitch; - } - - return; - } - - if (hscale == 2 && hratio == 1) Scale1Dh = scale1d_2t1_ps; - - if (vscale == 2 && vratio == 1) { - if (interlaced) - Scale1Dv = scale1d_2t1_ps; - else - Scale1Dv = scale1d_2t1_i; - } - - if (source_height == dest_height) { - /* for each band of the image */ - for (k = 0; k < dest_height; ++k) { - Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, - dest_width); - source += source_pitch; - dest += dest_pitch; - } - - return; - } - - if (dest_height > source_height) { - dest_band_height = temp_area_height - 1; - source_band_height = dest_band_height * source_height / dest_height; - } else { - source_band_height = temp_area_height - 1; - dest_band_height = source_band_height * vratio / vscale; - } - - /* first row needs to be done so that we can stay one row ahead for vertical - * zoom */ - Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, - dest_width); - - /* for each band of the image */ - bands = (dest_height + dest_band_height - 1) / dest_band_height; - - for (k = 0; k < bands; ++k) { - /* scale one band horizontally */ - for (i = 1; i < source_band_height + 1; ++i) { - if (k * source_band_height + i < source_height) { - Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1, - temp_area + i * dest_pitch, 1, hratio, dest_width); - } else { /* Duplicate the last row */ - /* copy temp_area row 0 over from last row in the past */ - memcpy(temp_area + i * dest_pitch, temp_area + (i - 1) * dest_pitch, - dest_pitch); - } - } - - /* scale one band vertically */ - for (j = 0; j < dest_width; ++j) { - Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1, - &dest[j], dest_pitch, vratio, dest_band_height); - } - - /* copy temp_area row 0 over from last row in the past */ - memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch); - - /* move to the next band */ - source += source_band_height * source_pitch; - dest += dest_band_height * dest_pitch; - } -} - -/**************************************************************************** - * - * ROUTINE : aom_scale_frame - * - * INPUTS : YV12_BUFFER_CONFIG *src : Pointer to frame to be - * scaled. - * YV12_BUFFER_CONFIG *dst : Pointer to buffer to hold - * scaled frame. - * unsigned char *temp_area : Pointer to temp work area. - * unsigned char temp_area_height : Height of temp work area. - * unsigned int hscale : Horizontal scale factor - * numerator. - * unsigned int hratio : Horizontal scale factor - * denominator. - * unsigned int vscale : Vertical scale factor - * numerator. - * unsigned int vratio : Vertical scale factor - * denominator. - * unsigned int interlaced : Interlace flag. - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Performs 2-tap linear interpolation in two dimensions. - * - * SPECIAL NOTES : Expansion is performed one band at a time to help with - * caching. 
- * - ****************************************************************************/ -void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, - unsigned char *temp_area, unsigned char temp_height, - unsigned int hscale, unsigned int hratio, - unsigned int vscale, unsigned int vratio, - unsigned int interlaced, const int num_planes) { - const int dw = (hscale - 1 + src->y_width * hratio) / hscale; - const int dh = (vscale - 1 + src->y_height * vratio) / vscale; - - for (int plane = 0; plane < num_planes; ++plane) { - const int is_uv = plane > 0; - const int plane_dw = dw >> is_uv; - const int plane_dh = dh >> is_uv; - - Scale2D((unsigned char *)src->buffers[plane], src->strides[is_uv], - src->widths[is_uv], src->heights[is_uv], - (unsigned char *)dst->buffers[plane], dst->strides[is_uv], plane_dw, - plane_dh, temp_area, temp_height, hscale, hratio, vscale, vratio, - interlaced); - - if (plane_dw < dst->widths[is_uv]) - for (int i = 0; i < plane_dh; ++i) - memset(dst->buffers[plane] + i * dst->strides[is_uv] + plane_dw - 1, - dst->buffers[plane][i * dst->strides[is_uv] + plane_dw - 2], - dst->widths[is_uv] - plane_dw + 1); - - if (plane_dh < dst->heights[is_uv]) - for (int i = plane_dh - 1; i < dst->heights[is_uv]; ++i) - memcpy(dst->buffers[plane] + i * dst->strides[is_uv], - dst->buffers[plane] + (plane_dh - 2) * dst->strides[is_uv], - dst->widths[is_uv] + 1); - } -} diff --git a/aom_scale/generic/gen_scalers.c b/aom_scale/generic/gen_scalers.c deleted file mode 100644 index 6c8df70d96..0000000000 --- a/aom_scale/generic/gen_scalers.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved. - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "config/aom_scale_rtcd.h" - -#include "aom_scale/aom_scale.h" -#include "aom_mem/aom_mem.h" -/**************************************************************************** - * Imports - ****************************************************************************/ - -/**************************************************************************** - * - * - * INPUTS : const unsigned char *source : Pointer to source data. - * unsigned int source_width : Stride of source. - * unsigned char *dest : Pointer to destination data. - * unsigned int dest_width : Stride of destination - * (NOT USED). - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Copies horizontal line of pixels from source to - * destination scaling up by 4 to 5. - * - * SPECIAL NOTES : None. 
- * - ****************************************************************************/ -void aom_horizontal_line_5_4_scale_c(const unsigned char *source, - unsigned int source_width, - unsigned char *dest, - unsigned int dest_width) { - const unsigned char *const source_end = source + source_width; - (void)dest_width; - - while (source < source_end) { - const unsigned int a = source[0]; - const unsigned int b = source[1]; - const unsigned int c = source[2]; - const unsigned int d = source[3]; - const unsigned int e = source[4]; - - dest[0] = (unsigned char)a; - dest[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8); - dest[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8); - dest[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8); - - source += 5; - dest += 4; - } -} - -void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, - unsigned char *dest, int dest_pitch, - unsigned int dest_width) { - const unsigned char *const dest_end = dest + dest_width; - while (dest < dest_end) { - const unsigned int a = source[0 * src_pitch]; - const unsigned int b = source[1 * src_pitch]; - const unsigned int c = source[2 * src_pitch]; - const unsigned int d = source[3 * src_pitch]; - const unsigned int e = source[4 * src_pitch]; - - dest[0 * dest_pitch] = (unsigned char)a; - dest[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8); - dest[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8); - dest[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8); - - ++source; - ++dest; - } -} - -/*7*************************************************************************** - * - * ROUTINE : aom_horizontal_line_3_5_scale_c - * - * INPUTS : const unsigned char *source : Pointer to source data. - * unsigned int source_width : Stride of source. - * unsigned char *dest : Pointer to destination data. - * unsigned int dest_width : Stride of destination - * (NOT USED). - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Copies horizontal line of pixels from source to - * destination scaling up by 3 to 5. - * - * SPECIAL NOTES : None. 
- * - * - ****************************************************************************/ -void aom_horizontal_line_5_3_scale_c(const unsigned char *source, - unsigned int source_width, - unsigned char *dest, - unsigned int dest_width) { - const unsigned char *const source_end = source + source_width; - (void)dest_width; - while (source < source_end) { - const unsigned int a = source[0]; - const unsigned int b = source[1]; - const unsigned int c = source[2]; - const unsigned int d = source[3]; - const unsigned int e = source[4]; - - dest[0] = (unsigned char)a; - dest[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8); - dest[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8); - - source += 5; - dest += 3; - } -} - -void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, - unsigned char *dest, int dest_pitch, - unsigned int dest_width) { - const unsigned char *const dest_end = dest + dest_width; - while (dest < dest_end) { - const unsigned int a = source[0 * src_pitch]; - const unsigned int b = source[1 * src_pitch]; - const unsigned int c = source[2 * src_pitch]; - const unsigned int d = source[3 * src_pitch]; - const unsigned int e = source[4 * src_pitch]; - - dest[0 * dest_pitch] = (unsigned char)a; - dest[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8); - dest[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8); - - ++source; - ++dest; - } -} - -/**************************************************************************** - * - * ROUTINE : aom_horizontal_line_1_2_scale_c - * - * INPUTS : const unsigned char *source : Pointer to source data. - * unsigned int source_width : Stride of source. - * unsigned char *dest : Pointer to destination data. - * unsigned int dest_width : Stride of destination - * (NOT USED). - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Copies horizontal line of pixels from source to - * destination scaling up by 1 to 2. - * - * SPECIAL NOTES : None. 
- * - ****************************************************************************/ -void aom_horizontal_line_2_1_scale_c(const unsigned char *source, - unsigned int source_width, - unsigned char *dest, - unsigned int dest_width) { - const unsigned char *const source_end = source + source_width; - (void)dest_width; - while (source < source_end) { - dest[0] = source[0]; - source += 2; - ++dest; - } -} - -void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, - unsigned char *dest, int dest_pitch, - unsigned int dest_width) { - (void)dest_pitch; - (void)src_pitch; - memcpy(dest, source, dest_width); -} - -void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, - unsigned char *dest, int dest_pitch, - unsigned int dest_width) { - const unsigned char *const dest_end = dest + dest_width; - (void)dest_pitch; - while (dest < dest_end) { - const unsigned int a = source[-src_pitch] * 3; - const unsigned int b = source[0] * 10; - const unsigned int c = source[src_pitch] * 3; - dest[0] = (unsigned char)((8 + a + b + c) >> 4); - ++source; - ++dest; - } -} diff --git a/av1/common/resize.c b/av1/common/resize.c index d3e3850a19..839a7e3ea6 100644 --- a/av1/common/resize.c +++ b/av1/common/resize.c @@ -23,7 +23,6 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_ports/mem.h" -#include "aom_scale/aom_scale.h" #include "av1/common/common.h" #include "av1/common/resize.h" diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index ccfaad2ac7..064a2aea42 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -28,7 +28,6 @@ #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/mem_ops.h" -#include "aom_scale/aom_scale.h" #include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c index 9edd34abd6..0ae0b86dbf 100644 --- a/av1/decoder/decoder.c +++ b/av1/decoder/decoder.c @@ -20,7 +20,6 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" -#include "aom_scale/aom_scale.h" #include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 71c6e27bfa..60be4c8a77 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -36,7 +36,6 @@ #endif #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" -#include "aom_scale/aom_scale.h" #include "aom_util/aom_pthread.h" #if CONFIG_BITSTREAM_DEBUG #include "aom_util/debug_util.h" diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c index 8bbf79d3f9..a81cc29ba3 100644 --- a/av1/encoder/firstpass.c +++ b/av1/encoder/firstpass.c @@ -20,7 +20,6 @@ #include "aom_dsp/variance.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" -#include "aom_scale/aom_scale.h" #include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c index dd42f72833..dde33de8f6 100644 --- a/av1/encoder/temporal_filter.c +++ b/av1/encoder/temporal_filter.c @@ -22,7 +22,6 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" -#include "aom_scale/aom_scale.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" #include "av1/common/quant_common.h" diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index ae0842c6e9..cacd7124c0 100644 --- 
a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake @@ -119,7 +119,6 @@ set_aom_config_var( CONFIG_NORMAL_TILE_MODE 0 "Only enables general decoding (disables large scale tile decoding).") set_aom_config_var(CONFIG_SIZE_LIMIT 0 "Limit max decode width/height.") -set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 "Spatial resampling.") set_aom_config_var(CONFIG_TUNE_BUTTERAUGLI 0 "Enable encoding tuning for Butteraugli.") set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.") -- GitLab From 01844f7e99297622ced4f54f7197e69892032962 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Wed, 14 Aug 2024 15:06:42 +0100 Subject: [PATCH 372/391] Add Arm Neon USMMLA impl. for 6-tap non-avg dist_wtd_convolve_x Use a USMMLA implementation for the 6-tap and 4-tap non-averaging case in av1_dist_wtd_convolve_x_neon_i8mm. The rationale is similar to previous patches adding USMMLA code paths: By permuting the input samples and the 6-tap filter we can use the Armv8.6 I8MM USMMLA matrix multiply instructions to accelerate horizontal 6-tap convolutions. The 2x8 by 8x2 matrix multiply instruction does twice the work of a USDOT dot product instruction. We use this new USMMLA 6-tap path for 4-tap filters as well since it uses exactly the same number of instructions as the previous USDOT implementation. Change-Id: I513444a4ef26f1c98a05a39d555ba486dce5796a --- av1/common/arm/compound_convolve_neon_i8mm.c | 195 ++++++++++++++----- 1 file changed, 142 insertions(+), 53 deletions(-) diff --git a/av1/common/arm/compound_convolve_neon_i8mm.c b/av1/common/arm/compound_convolve_neon_i8mm.c index 0ed5c911a8..5b6d0c628e 100644 --- a/av1/common/arm/compound_convolve_neon_i8mm.c +++ b/av1/common/arm/compound_convolve_neon_i8mm.c @@ -17,7 +17,7 @@ #include "config/aom_config.h" #include "config/av1_rtcd.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 @@ -204,7 +204,7 @@ static inline void dist_wtd_convolve_2d_horiz_8tap_neon_i8mm( const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); @@ -405,7 +405,7 @@ static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( int height = h; if (w == 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = @@ -442,7 +442,7 @@ static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( height -= 4; } while (height != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. 
const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); @@ -518,7 +518,7 @@ static inline void dist_wtd_convolve_x_avg_neon_i8mm( int height = h; if (w == 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = @@ -555,7 +555,7 @@ static inline void dist_wtd_convolve_x_avg_neon_i8mm( height -= 4; } while (height != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); @@ -600,10 +600,47 @@ static inline void dist_wtd_convolve_x_avg_neon_i8mm( } } -static inline void dist_wtd_convolve_x_neon_i8mm( - const uint8_t *src, int src_stride, int w, int h, - const InterpFilterParams *filter_params_x, const int subpel_x_qn, - ConvolveParams *conv_params) { +static inline uint16x4_t convolve6_4_x(uint8x16_t samples, + const int8x16_t x_filter, + const uint8x16_t permute_tbl, + const int32x4_t round_offset) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum = vusmmlaq_s32(round_offset, permuted_samples, x_filter); + + // We halved the convolution filter values so -1 from the right shift. + return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); +} + +static inline uint16x8_t convolve6_8_x(uint8x16_t samples, + const int8x16_t x_filter, + const uint8x16x2_t permute_tbl, + const int32x4_t round_offset) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(round_offset, permuted_samples[0], x_filter); + int32x4_t sum4567 = vusmmlaq_s32(round_offset, permuted_samples[1], x_filter); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + int16x8_t res = vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); + return vreinterpretq_u16_s16(res); +} + +static inline void dist_wtd_convolve_x_6tap_neon_i8mm( + const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr) { assert(w % 4 == 0); assert(h % 4 == 0); @@ -617,52 +654,39 @@ static inline void dist_wtd_convolve_x_neon_i8mm( const int32x4_t round_offset_shim = vdupq_n_s32( (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); - // Horizontal filter. 
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_qn & SUBPEL_MASK); - - const int horiz_offset = filter_params_x->taps / 2 - 1; - const uint8_t *src_ptr = src - horiz_offset; - CONV_BUF_TYPE *dst_ptr = conv_params->dst; - int dst_stride = conv_params->dst_stride; - int height = h; + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + // Stagger the filter for use with the matrix multiply instructions. + // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t x_filter = + vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); if (w == 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); - - src_ptr += 2; - + const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = - convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x4_t d1 = - convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x4_t d2 = - convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x4_t d3 = - convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s3, x_filter, permute_tbl, round_offset_shim); - store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - height -= 4; - } while (height != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { - const uint8_t *s = src_ptr; - CONV_BUF_TYPE *d = dst_ptr; + const uint8_t *s = src; + uint16_t *d = dst; int width = w; do { @@ -670,13 +694,13 @@ static inline void dist_wtd_convolve_x_neon_i8mm( load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = - convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x8_t d1 = - convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x8_t d2 = - convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x8_t d3 = - convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s3, x_filter, permute_tbl, round_offset_shim); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -684,17 +708,72 @@ static inline void dist_wtd_convolve_x_neon_i8mm( d += 8; width -= 8; } while (width != 0); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - height -= 4; - } while (height != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } } +static inline void dist_wtd_convolve_x_8tap_neon_i8mm( + const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t round_offset_shim = vdupq_n_s32( + (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); + + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src; + uint16_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); +} + void av1_dist_wtd_convolve_x_neon_i8mm( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int filter_taps = + get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); + if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( @@ -706,7 +785,17 @@ void av1_dist_wtd_convolve_x_neon_i8mm( conv_params); } } else { - dist_wtd_convolve_x_neon_i8mm(src, src_stride, w, h, filter_params_x, - subpel_x_qn, conv_params); + src -= (SUBPEL_TAPS / 2 - 1); + + if (filter_taps < 8) { + dist_wtd_convolve_x_6tap_neon_i8mm(src + 1, src_stride, conv_params->dst, + conv_params->dst_stride, w, h, + x_filter_ptr); + return; + } + + dist_wtd_convolve_x_8tap_neon_i8mm(src, src_stride, conv_params->dst, + conv_params->dst_stride, w, h, + x_filter_ptr); } } -- GitLab From 369781baa423e5d15ab4bd91fa3a0b247c158f12 Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Wed, 14 Aug 2024 15:48:01 +0100 Subject: [PATCH 373/391] Add Arm Neon USMMLA impl. for 6-tap dist_wtd_convolve_x_avg Use a USMMLA implementation for the 6-tap and 4-tap basic averaging case in av1_dist_wtd_convolve_x_neon_i8mm. The rationale is similar to previous patches adding USMMLA code paths: By permuting the input samples and the 6-tap filter we can use the Armv8.6 I8MM USMMLA matrix multiply instructions to accelerate horizontal 6-tap convolutions. The 2x8 by 8x2 matrix multiply instruction does twice the work of a USDOT dot product instruction. We use this new USMMLA 6-tap path for 4-tap filters as well since it uses exactly the same number of instructions as the previous USDOT implementation. 
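For reference, the USMMLA step can be modeled in scalar C as a 2x8 by 8x2
matrix multiply-accumulate. The sketch below is illustrative only (the
function name is made up and it is not part of this change); it shows why a
single instruction yields four consecutive 6-tap outputs:

    #include <stdint.h>

    // Scalar model of one vusmmlaq_s32 invocation as used by the new
    // convolve6 helpers. samples[0..7] and samples[8..15] are the two
    // permuted source rows, { s0..s7 } and { s2..s9 }; filter[0..7] and
    // filter[8..15] are the two staggered filter columns,
    // { f0..f5, 0, 0 } and { 0, f0..f5, 0 }. acc[] is the 2x2 int32
    // accumulator, laid out row-major.
    static void usmmla_2x8_by_8x2_model(int32_t acc[4],
                                        const uint8_t samples[16],
                                        const int8_t filter[16]) {
      for (int row = 0; row < 2; ++row) {
        for (int col = 0; col < 2; ++col) {
          for (int k = 0; k < 8; ++k) {
            acc[2 * row + col] += samples[8 * row + k] * filter[8 * col + k];
          }
        }
      }
    }

With those inputs the four accumulator lanes come out as the 6-tap results
for output pixels 0 through 3, which is why one USMMLA replaces a pair of
USDOT dot products.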
Change-Id: Ia3563b57ad320d29df62040472e3eebabf210199 --- av1/common/arm/compound_convolve_neon_i8mm.c | 225 ++++++++++++------- 1 file changed, 140 insertions(+), 85 deletions(-) diff --git a/av1/common/arm/compound_convolve_neon_i8mm.c b/av1/common/arm/compound_convolve_neon_i8mm.c index 5b6d0c628e..685f13fccb 100644 --- a/av1/common/arm/compound_convolve_neon_i8mm.c +++ b/av1/common/arm/compound_convolve_neon_i8mm.c @@ -488,10 +488,47 @@ static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( } } -static inline void dist_wtd_convolve_x_avg_neon_i8mm( - const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, - int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, - ConvolveParams *conv_params) { +static inline uint16x4_t convolve6_4_x(uint8x16_t samples, + const int8x16_t x_filter, + const uint8x16_t permute_tbl, + const int32x4_t round_offset) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum = vusmmlaq_s32(round_offset, permuted_samples, x_filter); + + // We halved the convolution filter values so -1 from the right shift. + return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); +} + +static inline uint16x8_t convolve6_8_x(uint8x16_t samples, + const int8x16_t x_filter, + const uint8x16x2_t permute_tbl, + const int32x4_t round_offset) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(round_offset, permuted_samples[0], x_filter); + int32x4_t sum4567 = vusmmlaq_s32(round_offset, permuted_samples[1], x_filter); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + int16x8_t res = vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); + return vreinterpretq_u16_s16(res); +} + +static inline void dist_wtd_convolve_x_avg_6tap_neon_i8mm( + const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, + uint8_t *dst8, int dst8_stride, int w, int h, const int16_t *x_filter_ptr) { assert(w % 4 == 0); assert(h % 4 == 0); @@ -506,63 +543,49 @@ static inline void dist_wtd_convolve_x_avg_neon_i8mm( const int32x4_t round_offset_shim = vdupq_n_s32( (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); - // Horizontal filter. - const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_qn & SUBPEL_MASK); - - const int horiz_offset = filter_params_x->taps / 2 - 1; - const uint8_t *src_ptr = src - horiz_offset; - CONV_BUF_TYPE *dst_ptr = conv_params->dst; - uint8_t *dst8_ptr = dst8; - int dst_stride = conv_params->dst_stride; - int height = h; + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + // Stagger the filter for use with the matrix multiply instructions. 
+ // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t x_filter = + vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); if (w == 4) { - const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); - - src_ptr += 2; - + const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = - convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x4_t d1 = - convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x4_t d2 = - convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x4_t d3 = - convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x4_t dd0, dd1, dd2, dd3; - load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); - store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); - store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23_u8); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - dst8_ptr += 4 * dst8_stride; - height -= 4; - } while (height != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + h -= 4; + } while (h != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { - const uint8_t *s = src_ptr; - CONV_BUF_TYPE *d = dst_ptr; - uint8_t *d_u8 = dst8_ptr; + const uint8_t *s = src; + uint16_t *d = dst; + uint8_t *d_u8 = dst8; int width = w; do { @@ -570,13 +593,13 @@ static inline void dist_wtd_convolve_x_avg_neon_i8mm( load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = - convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x8_t d1 = - convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x8_t d2 = - convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x8_t d3 = - convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -592,50 +615,73 @@ static inline void dist_wtd_convolve_x_avg_neon_i8mm( d_u8 += 8; width -= 8; } while (width != 0); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - dst8_ptr += 4 * dst8_stride; - height -= 4; - } while (height != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + h -= 4; + } while (h != 0); } } -static inline uint16x4_t convolve6_4_x(uint8x16_t samples, - const int8x16_t x_filter, - const uint8x16_t permute_tbl, - const int32x4_t round_offset) { - // Permute samples ready for matrix multiply. - // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); +static inline void dist_wtd_convolve_x_avg_8tap_neon_i8mm( + const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, + uint8_t *dst8, int dst8_stride, int w, int h, const int16_t *x_filter_ptr) { + assert(w % 4 == 0); + assert(h % 4 == 0); - // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix - // (filter), destructively accumulating into the destination register. - int32x4_t sum = vusmmlaq_s32(round_offset, permuted_samples, x_filter); + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t round_offset_shim = vdupq_n_s32( + (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); - // We halved the convolution filter values so -1 from the right shift. - return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); -} + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); -static inline uint16x8_t convolve6_8_x(uint8x16_t samples, - const int8x16_t x_filter, - const uint8x16x2_t permute_tbl, - const int32x4_t round_offset) { - // Permute samples ready for matrix multiply. 
- // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } - // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } - uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), - vqtbl1q_u8(samples, permute_tbl.val[1]) }; + do { + const uint8_t *s = src; + uint16_t *d = dst; + uint8_t *d_u8 = dst8; + int width = w; - // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix - // (filter), destructively accumulating into the destination register. - int32x4_t sum0123 = vusmmlaq_s32(round_offset, permuted_samples[0], x_filter); - int32x4_t sum4567 = vusmmlaq_s32(round_offset, permuted_samples[1], x_filter); + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - // Narrow and re-pack. - // We halved the convolution filter values so -1 from the right shift. - int16x8_t res = vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), - vshrn_n_s32(sum4567, ROUND0_BITS - 1)); - return vreinterpretq_u16_s16(res); + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + h -= 4; + } while (h != 0); } static inline void dist_wtd_convolve_x_6tap_neon_i8mm( @@ -780,9 +826,18 @@ void av1_dist_wtd_convolve_x_neon_i8mm( src, src_stride, dst8, dst8_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } else { - dist_wtd_convolve_x_avg_neon_i8mm(src, src_stride, dst8, dst8_stride, w, - h, filter_params_x, subpel_x_qn, - conv_params); + src -= (SUBPEL_TAPS / 2 - 1); + + if (filter_taps < 8) { + dist_wtd_convolve_x_avg_6tap_neon_i8mm( + src + 1, src_stride, conv_params->dst, conv_params->dst_stride, + dst8, dst8_stride, w, h, x_filter_ptr); + return; + } + + dist_wtd_convolve_x_avg_8tap_neon_i8mm(src, src_stride, conv_params->dst, + conv_params->dst_stride, dst8, + dst8_stride, w, h, x_filter_ptr); } } else { src -= (SUBPEL_TAPS / 2 - 1); -- GitLab From f570a1c98b51baeaf1813644a636e5c413eb86dd Mon Sep 17 00:00:00 2001 From: Jonathan Wright <jonathan.wright@arm.com> Date: Wed, 14 Aug 2024 17:43:12 +0100 Subject: [PATCH 374/391] Add Neon USMMLA impl. for 6-tap dist_wtd_convolve_x_dist_wtd_avg Use a USMMLA implementation for the 6-tap and 4-tap dist_wtd averaging case in av1_dist_wtd_convolve_x_neon_i8mm. The rationale is similar to previous patches adding USMMLA code paths: By permuting the input samples and the 6-tap filter we can use the Armv8.6 I8MM USMMLA matrix multiply instructions to accelerate horizontal 6-tap convolutions. The 2x8 by 8x2 matrix multiply instruction does twice the work of a USDOT dot product instruction. We use this new USMMLA 6-tap path for 4-tap filters as well since it uses exactly the same number of instructions as the previous USDOT implementation. 
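The distance-weighted averaging stage that follows the convolution is
unchanged by this patch. As a scalar sketch of what the compute_dist_wtd_avg
helpers implement per pixel (modeled on the C reference behavior in
av1_dist_wtd_convolve_x_c; the names below are illustrative and the constant
and helper mirror libaom's DIST_PRECISION_BITS, clip_pixel() and
ROUND_POWER_OF_TWO()):

    #include <stdint.h>

    #define DIST_PRECISION_BITS 4  // as in libaom

    // Mirrors libaom's clip_pixel() for 8-bit output.
    static uint8_t clip_pixel_model(int32_t v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // `ref` is the value already in the compound buffer, `res` the newly
    // convolved value; fwd_offset + bck_offset == 1 << DIST_PRECISION_BITS,
    // and round_offset/round_bits match the values derived in this file.
    static uint8_t dist_wtd_avg_model(uint16_t ref, uint16_t res,
                                      uint16_t fwd_offset,
                                      uint16_t bck_offset,
                                      int32_t round_offset, int round_bits) {
      int32_t tmp = ref * fwd_offset + res * bck_offset;
      tmp >>= DIST_PRECISION_BITS;  // drop the distance-weight precision
      tmp -= round_offset;          // remove the compound rounding offset
      // Rounding right shift, i.e. libaom's ROUND_POWER_OF_TWO().
      return clip_pixel_model((tmp + (1 << (round_bits - 1))) >> round_bits);
    }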
Change-Id: I5b294ea5e0e6712896b90cb9417d0b049d2c2c60 --- av1/common/arm/compound_convolve_neon_i8mm.c | 234 +++++++++++-------- 1 file changed, 136 insertions(+), 98 deletions(-) diff --git a/av1/common/arm/compound_convolve_neon_i8mm.c b/av1/common/arm/compound_convolve_neon_i8mm.c index 685f13fccb..0589dfb153 100644 --- a/av1/common/arm/compound_convolve_neon_i8mm.c +++ b/av1/common/arm/compound_convolve_neon_i8mm.c @@ -328,21 +328,44 @@ void av1_dist_wtd_convolve_2d_neon_i8mm( } } -static inline uint16x4_t convolve4_4_x(uint8x16_t samples, - const int8x8_t x_filter, +static inline uint16x4_t convolve6_4_x(uint8x16_t samples, + const int8x16_t x_filter, const uint8x16_t permute_tbl, const int32x4_t round_offset) { - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); - // First 4 output values. - int32x4_t sum = vusdotq_lane_s32(round_offset, permuted_samples, x_filter, 0); + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum = vusmmlaq_s32(round_offset, permuted_samples, x_filter); // We halved the convolution filter values so -1 from the right shift. return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); } +static inline uint16x8_t convolve6_8_x(uint8x16_t samples, + const int8x16_t x_filter, + const uint8x16x2_t permute_tbl, + const int32x4_t round_offset) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(round_offset, permuted_samples[0], x_filter); + int32x4_t sum4567 = vusmmlaq_s32(round_offset, permuted_samples[1], x_filter); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. 
+ int16x8_t res = vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); + return vreinterpretq_u16_s16(res); +} + static inline uint16x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t x_filter, const uint8x16x3_t permute_tbl, @@ -372,10 +395,10 @@ static inline uint16x8_t convolve8_8_x(uint8x16_t samples, return vreinterpretq_u16_s16(res); } -static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( - const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, - int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, - ConvolveParams *conv_params) { +static inline void dist_wtd_convolve_x_dist_wtd_avg_6tap_neon_i8mm( + const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, + uint8_t *dst8, int dst8_stride, int w, int h, const int16_t *x_filter_ptr, + const uint16_t fwd_offset, const uint16_t bck_offset) { assert(w % 4 == 0); assert(h % 4 == 0); @@ -390,66 +413,49 @@ static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( const int32x4_t round_offset_shim = vdupq_n_s32( (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); - const uint16_t fwd_offset = conv_params->fwd_offset; - const uint16_t bck_offset = conv_params->bck_offset; - - // Horizontal filter. - const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_qn & SUBPEL_MASK); - - const int horiz_offset = filter_params_x->taps / 2 - 1; - const uint8_t *src_ptr = src - horiz_offset; - CONV_BUF_TYPE *dst_ptr = conv_params->dst; - uint8_t *dst8_ptr = dst8; - int dst_stride = conv_params->dst_stride; - int height = h; + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + // Stagger the filter for use with the matrix multiply instructions. + // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t x_filter = + vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); if (w == 4) { - const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); - - src_ptr += 2; - + const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = - convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x4_t d1 = - convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x4_t d2 = - convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x4_t d3 = - convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim); + convolve6_4_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x4_t dd0, dd1, dd2, dd3; - load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); - store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); - store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23_u8); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - dst8_ptr += 4 * dst8_stride; - height -= 4; - } while (height != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + h -= 4; + } while (h != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { - const uint8_t *s = src_ptr; - CONV_BUF_TYPE *d = dst_ptr; - uint8_t *d_u8 = dst8_ptr; + const uint8_t *s = src; + uint16_t *d = dst; + uint8_t *d_u8 = dst8; int width = w; do { @@ -457,13 +463,13 @@ static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = - convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x8_t d1 = - convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x8_t d2 = - convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x8_t d3 = - convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + convolve6_8_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -480,50 +486,75 @@ static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( d_u8 += 8; width -= 8; } while (width != 0); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - dst8_ptr += 4 * dst8_stride; - height -= 4; - } while (height != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + h -= 4; + } while (h != 0); } } -static inline uint16x4_t convolve6_4_x(uint8x16_t samples, - const int8x16_t x_filter, - const uint8x16_t permute_tbl, - const int32x4_t round_offset) { - // Permute samples ready for matrix multiply. 
- // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); +static inline void dist_wtd_convolve_x_dist_wtd_avg_8tap_neon_i8mm( + const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, + uint8_t *dst8, int dst8_stride, int w, int h, const int16_t *x_filter_ptr, + const uint16_t fwd_offset, const uint16_t bck_offset) { + assert(w % 4 == 0); + assert(h % 4 == 0); - // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix - // (filter), destructively accumulating into the destination register. - int32x4_t sum = vusmmlaq_s32(round_offset, permuted_samples, x_filter); + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t round_offset_shim = vdupq_n_s32( + (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); - // We halved the convolution filter values so -1 from the right shift. - return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); -} + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); -static inline uint16x8_t convolve6_8_x(uint8x16_t samples, - const int8x16_t x_filter, - const uint8x16x2_t permute_tbl, - const int32x4_t round_offset) { - // Permute samples ready for matrix multiply. - // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } - // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } - uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), - vqtbl1q_u8(samples, permute_tbl.val[1]) }; + do { + const uint8_t *s = src; + uint16_t *d = dst; + uint8_t *d_u8 = dst8; + int width = w; - // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix - // (filter), destructively accumulating into the destination register. - int32x4_t sum0123 = vusmmlaq_s32(round_offset, permuted_samples[0], x_filter); - int32x4_t sum4567 = vusmmlaq_s32(round_offset, permuted_samples[1], x_filter); + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - // Narrow and re-pack. - // We halved the convolution filter values so -1 from the right shift. 
- int16x8_t res = vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), - vshrn_n_s32(sum4567, ROUND0_BITS - 1)); - return vreinterpretq_u16_s16(res); + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + h -= 4; + } while (h != 0); } static inline void dist_wtd_convolve_x_avg_6tap_neon_i8mm( @@ -820,14 +851,23 @@ void av1_dist_wtd_convolve_x_neon_i8mm( const int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); + src -= (SUBPEL_TAPS / 2 - 1); + if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { - dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( - src, src_stride, dst8, dst8_stride, w, h, filter_params_x, - subpel_x_qn, conv_params); - } else { - src -= (SUBPEL_TAPS / 2 - 1); + if (filter_taps < 8) { + dist_wtd_convolve_x_dist_wtd_avg_6tap_neon_i8mm( + src + 1, src_stride, conv_params->dst, conv_params->dst_stride, + dst8, dst8_stride, w, h, x_filter_ptr, conv_params->fwd_offset, + conv_params->bck_offset); + return; + } + dist_wtd_convolve_x_dist_wtd_avg_8tap_neon_i8mm( + src, src_stride, conv_params->dst, conv_params->dst_stride, dst8, + dst8_stride, w, h, x_filter_ptr, conv_params->fwd_offset, + conv_params->bck_offset); + } else { if (filter_taps < 8) { dist_wtd_convolve_x_avg_6tap_neon_i8mm( src + 1, src_stride, conv_params->dst, conv_params->dst_stride, @@ -840,8 +880,6 @@ void av1_dist_wtd_convolve_x_neon_i8mm( dst8_stride, w, h, x_filter_ptr); } } else { - src -= (SUBPEL_TAPS / 2 - 1); - if (filter_taps < 8) { dist_wtd_convolve_x_6tap_neon_i8mm(src + 1, src_stride, conv_params->dst, conv_params->dst_stride, w, h, -- GitLab From 7230cd840a6a7d168e047a101dd19b3ef05b490b Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Tue, 13 Aug 2024 18:36:15 -0700 Subject: [PATCH 375/391] move av1_fdwt8x8_uint8_input_c to rtcd This looks like it was the intent given the name. The function was added in: 2dbdbc9de1 Add wavelet energy based q modulation Other functions in dwt.c may also warrant adding to rtcd given the 8x8 block size. Bug: aomedia:3416 Change-Id: I215d360c28d8b9f964c7cd73e0115bd0f61f3193 --- av1/common/av1_rtcd_defs.pl | 1 + av1/encoder/dwt.c | 2 +- av1/encoder/dwt.h | 3 --- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 1963751fab..94a5171080 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -254,6 +254,7 @@ specialize qw/av1_resize_and_extend_frame ssse3 neon/; # Encoder functions below this point. 
# if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + add_proto qw/void av1_fdwt8x8_uint8_input/, "const uint8_t *input, tran_low_t *output, int stride, int hbd"; # ENCODEMB INVOKE add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, diff --git a/av1/encoder/dwt.c b/av1/encoder/dwt.c index 84b3b7515a..f7c1778d61 100644 --- a/av1/encoder/dwt.c +++ b/av1/encoder/dwt.c @@ -128,7 +128,7 @@ static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, int hbd) { tran_low_t output[64]; - av1_fdwt8x8_uint8_input_c(input, output, stride, hbd); + av1_fdwt8x8_uint8_input(input, output, stride, hbd); return haar_ac_sad(output, 8, 8, 8); } diff --git a/av1/encoder/dwt.h b/av1/encoder/dwt.h index 8ba6c02889..0ebbfc61bf 100644 --- a/av1/encoder/dwt.h +++ b/av1/encoder/dwt.h @@ -17,9 +17,6 @@ #define DWT_MAX_LENGTH 64 -void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output, - int stride, int hbd); - int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride, int hbd, int num_8x8_rows, int num_8x8_cols); -- GitLab From 39f84da183f47495fb11cd1088bacbb0ea144bc6 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Thu, 15 Aug 2024 12:01:42 -0700 Subject: [PATCH 376/391] cpu_used_firstpass_test.cc: Include gtest/gtest.h Fix the ClangTidy misc-include-cleaner warning: no header providing "GTEST_SKIP" is directly included Change-Id: I163abac834f21bb3aa18013418171b16200de412 --- test/cpu_used_firstpass_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/test/cpu_used_firstpass_test.cc b/test/cpu_used_firstpass_test.cc index e1d389f238..7ed28e4603 100644 --- a/test/cpu_used_firstpass_test.cc +++ b/test/cpu_used_firstpass_test.cc @@ -11,6 +11,7 @@ #include <cstdlib> +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" -- GitLab From 8ae33b7d7bca9f60a049e8d6398fd32ed4ed7d75 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 11:41:49 -0700 Subject: [PATCH 377/391] cdef_block_avx2.c: make shuffle_reg_256bit[] static And localize shuffle_reg_256bit[] to the function where it is used. Bug: aomedia:3416 Change-Id: I5e8277f9201d49c6b79708e44efbe5852a6d86a7 --- av1/common/x86/cdef_block_avx2.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/av1/common/x86/cdef_block_avx2.c b/av1/common/x86/cdef_block_avx2.c index a74b39612c..4a8d249a3b 100644 --- a/av1/common/x86/cdef_block_avx2.c +++ b/av1/common/x86/cdef_block_avx2.c @@ -13,11 +13,6 @@ #define SIMD_FUNC(name) name##_avx2 #include "av1/common/cdef_block_simd.h" -// Mask used to shuffle the elements present in 256bit register. -const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504, - 0x0f0e0100, 0x0b0a0d0c, 0x07060908, - 0x03020504, 0x0f0e0100 }; - /* partial A is a 16-bit vector of the form: [x8 - - x1 | x16 - - x9] and partial B has the form: [0 y1 - y7 | 0 y9 - y15]. @@ -28,6 +23,10 @@ static inline __m256i fold_mul_and_sum_avx2(__m256i *partiala, __m256i *partialb, const __m256i *const1, const __m256i *const2) { + // Mask used to shuffle the elements present in 256bit register. + static const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504, + 0x0f0e0100, 0x0b0a0d0c, 0x07060908, + 0x03020504, 0x0f0e0100 }; __m256i tmp; /* Reverse partial B. 
*/ *partialb = _mm256_shuffle_epi8( -- GitLab From a1ac379101f0471fb94daad1d7e792046b003464 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 11:45:36 -0700 Subject: [PATCH 378/391] pass2_strategy.c: make some tables static And localize layer_fraction[] and smooth_filt[] to the only function where they are used. Bug: aomedia:3416 Change-Id: I9ff05bef2a648fdec401c1bec60f6cea4e29354d --- av1/encoder/pass2_strategy.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c index c9a766b35a..beff033c65 100644 --- a/av1/encoder/pass2_strategy.c +++ b/av1/encoder/pass2_strategy.c @@ -894,13 +894,14 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, } // Allocate bits to each frame in a GF / ARF group -double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60, - 0.60, 1.0, 1.0 }; static void allocate_gf_group_bits(GF_GROUP *gf_group, PRIMARY_RATE_CONTROL *const p_rc, RATE_CONTROL *const rc, int64_t gf_group_bits, int gf_arf_bits, int key_frame, int use_arf) { + static const double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, + 0.60, 0.60, 1.0, + 1.0 }; int64_t total_group_bits = gf_group_bits; int base_frame_bits; const int gf_group_size = gf_group->size; @@ -1085,15 +1086,16 @@ static int is_shorter_gf_interval_better( #define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2) #define WINDOW_SIZE 7 #define HALF_WIN (WINDOW_SIZE / 2) -// A 7-tap gaussian smooth filter -const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383, - 0.242, 0.061, 0.006 }; // Smooth filter intra_error and coded_error in firstpass stats. // If stats[i].is_flash==1, the ith element should not be used in the filtering. static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx, int last_idx, double *filt_intra_err, double *filt_coded_err) { + // A 7-tap gaussian smooth filter + static const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, + 0.383, 0.242, 0.061, + 0.006 }; int i, j; for (i = start_idx; i <= last_idx; i++) { double total_wt = 0; -- GitLab From c7ff908e67abaa3d339e99532e3212705e546dbc Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 11:43:19 -0700 Subject: [PATCH 379/391] encodemb.c: make some globals static Bug: aomedia:3416 Change-Id: Id709a09bab2d11f1a27811a18afa3e44a10e63b0 --- av1/encoder/encodemb.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c index a91506b043..a300f88d70 100644 --- a/av1/encoder/encodemb.c +++ b/av1/encoder/encodemb.c @@ -106,15 +106,16 @@ int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, // TODO(yjshen): These settings are tuned by experiments. They may still be // optimized for better performance. // (1) Coefficients which are large enough will ALWAYS be kept. -const tran_low_t DROPOUT_COEFF_MAX = 2; // Max dropout-able coefficient. +static const tran_low_t DROPOUT_COEFF_MAX = 2; // Max dropout-able coefficient. // (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is // NOT required. For example, `5 0 0 0 7` is treated as two continuous // coefficients if three zeros do not fulfill the dropout condition. -const int DROPOUT_CONTINUITY_MAX = 2; // Max dropout-able continuous coeff. +static const int DROPOUT_CONTINUITY_MAX = + 2; // Max dropout-able continuous coeff. 
// (3) Dropout operation is NOT applicable to blocks with large or small // quantization index. -const int DROPOUT_Q_MAX = 128; -const int DROPOUT_Q_MIN = 16; +static const int DROPOUT_Q_MAX = 128; +static const int DROPOUT_Q_MIN = 16; // (4) Recall that dropout optimization will forcibly set some quantized // coefficients to zero. The key logic on determining whether a coefficient // should be dropped is to check the number of continuous zeros before AND @@ -124,13 +125,20 @@ const int DROPOUT_Q_MIN = 16; // the multiplier. Intuitively, larger block requires more zeros and larger // quantization index also requires more zeros (more information is lost // when using larger quantization index). -const int DROPOUT_BEFORE_BASE_MAX = 32; // Max base number for leading zeros. -const int DROPOUT_BEFORE_BASE_MIN = 16; // Min base number for leading zeros. -const int DROPOUT_AFTER_BASE_MAX = 32; // Max base number for trailing zeros. -const int DROPOUT_AFTER_BASE_MIN = 16; // Min base number for trailing zeros. -const int DROPOUT_MULTIPLIER_MAX = 8; // Max multiplier on number of zeros. -const int DROPOUT_MULTIPLIER_MIN = 2; // Min multiplier on number of zeros. -const int DROPOUT_MULTIPLIER_Q_BASE = 32; // Base Q to compute multiplier. +static const int DROPOUT_BEFORE_BASE_MAX = + 32; // Max base number for leading zeros. +static const int DROPOUT_BEFORE_BASE_MIN = + 16; // Min base number for leading zeros. +static const int DROPOUT_AFTER_BASE_MAX = + 32; // Max base number for trailing zeros. +static const int DROPOUT_AFTER_BASE_MIN = + 16; // Min base number for trailing zeros. +static const int DROPOUT_MULTIPLIER_MAX = + 8; // Max multiplier on number of zeros. +static const int DROPOUT_MULTIPLIER_MIN = + 2; // Min multiplier on number of zeros. +static const int DROPOUT_MULTIPLIER_Q_BASE = + 32; // Base Q to compute multiplier. void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, int qindex) { @@ -245,12 +253,12 @@ void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, // TODO(yjshen): These settings are hard-coded and look okay for now. They // should be made configurable later. // Blocks of key frames ONLY. -const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +static const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; // Blocks of intra frames (key frames EXCLUSIVE). -const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +static const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; // Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default // if trellis optimization is on for inter frames.) -const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +static const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; enum { QUANT_FUNC_LOWBD = 0, -- GitLab From 778f07bfd99326ce5c678e4edc50d9767fbc5931 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 11:44:10 -0700 Subject: [PATCH 380/391] intra_mode_search.c: make prune_intra_y_mode() static This function is unused outside of this file. 
Bug: aomedia:3416 Change-Id: Ie4d03efe230417fcd5fe485dfcdfdb90fb471509 --- av1/encoder/intra_mode_search.c | 18 +++++++++++++++--- av1/encoder/intra_mode_search.h | 16 ---------------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c index f5810de9c6..c41920df60 100644 --- a/av1/encoder/intra_mode_search.c +++ b/av1/encoder/intra_mode_search.c @@ -432,9 +432,21 @@ static inline int get_model_rd_index_for_pruning( return model_rd_index_for_pruning; } -int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, - int64_t top_intra_model_rd[], int max_model_cnt_allowed, - int model_rd_index_for_pruning) { +/*! \brief prune luma intra mode based on the model rd. + * \param[in] this_model_rd model rd for current mode. + * \param[in] best_model_rd Best model RD seen for this block so + * far. + * \param[in] top_intra_model_rd Top intra model RD seen for this + * block so far. + * \param[in] max_model_cnt_allowed The maximum number of top intra + * model RD allowed. + * \param[in] model_rd_index_for_pruning Index of the candidate used for + * pruning based on model rd. + */ +static int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, + int64_t top_intra_model_rd[], + int max_model_cnt_allowed, + int model_rd_index_for_pruning) { const double thresh_best = 1.50; const double thresh_top = 1.00; for (int i = 0; i < max_model_cnt_allowed; i++) { diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h index c03246a555..e24c0a6074 100644 --- a/av1/encoder/intra_mode_search.h +++ b/av1/encoder/intra_mode_search.h @@ -306,22 +306,6 @@ static inline void init_intra_mode_search_state( */ void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi, int reorder_delta_angle_eval); - -/*! \brief prune luma intra mode based on the model rd. - * \param[in] this_model_rd model rd for current mode. - * \param[in] best_model_rd Best model RD seen for this block so - * far. - * \param[in] top_intra_model_rd Top intra model RD seen for this - * block so far. - * \param[in] max_model_cnt_allowed The maximum number of top intra - * model RD allowed. - * \param[in] model_rd_index_for_pruning Index of the candidate used for - * pruning based on model rd. - */ -int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, - int64_t top_intra_model_rd[], int max_model_cnt_allowed, - int model_rd_index_for_pruning); - #ifdef __cplusplus } // extern "C" #endif -- GitLab From 76316d4013d907a7f63a9b10048d0f58d4b47474 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 16:21:14 -0700 Subject: [PATCH 381/391] remove redundant `&& __GNUC__` preproc check `#if defined(__GNUC__)` is enough if a specific version isn't being looked for. 
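For reference: whenever `__GNUC__` is defined it expands to the compiler's major version, a nonzero integer, so `defined(__GNUC__) && __GNUC__` can never evaluate differently from `defined(__GNUC__)`. A version comparison only earns its keep when gating on a minimum release. An illustrative sketch (the 4.6 cutoff and macro name are made up for the example):

/* Plain feature gate: any compiler claiming GCC compatibility. */
#if defined(__GNUC__)
#define LOCAL_MAYBE_UNUSED __attribute__((unused))
#else
#define LOCAL_MAYBE_UNUSED
#endif

/* Version gate: only needed when a specific release is required. */
#if defined(__GNUC__) && \
    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
/* Code that relies on a GCC >= 4.6 feature goes here. */
#endif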
Bug: aomedia:356832974 Change-Id: I3fcbecf9d547c6a2d89d7b5456e83ee08ddc6f5e --- aom/aom_codec.h | 6 +++--- aom_ports/mem.h | 2 +- aom_ports/x86.h | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/aom/aom_codec.h b/aom/aom_codec.h index de22d7fb03..1e589ca574 100644 --- a/aom/aom_codec.h +++ b/aom/aom_codec.h @@ -102,7 +102,7 @@ extern "C" { /*!\brief Decorator indicating a function is deprecated */ #ifndef AOM_DEPRECATED -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) #define AOM_DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) #define AOM_DEPRECATED @@ -112,7 +112,7 @@ extern "C" { #endif /* AOM_DEPRECATED */ #ifndef AOM_DECLSPEC_DEPRECATED -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) #define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */ #elif defined(_MSC_VER) /*!\brief \copydoc #AOM_DEPRECATED */ @@ -132,7 +132,7 @@ extern "C" { /*!\brief Decorator indicating that given struct/union/enum is packed */ #ifndef ATTRIBUTE_PACKED -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) #define ATTRIBUTE_PACKED __attribute__((packed)) #elif defined(_MSC_VER) #define ATTRIBUTE_PACKED diff --git a/aom_ports/mem.h b/aom_ports/mem.h index fd33290330..df736a1846 100644 --- a/aom_ports/mem.h +++ b/aom_ports/mem.h @@ -15,7 +15,7 @@ #include "aom/aom_integer.h" #include "config/aom_config.h" -#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C) +#if defined(__GNUC__) || defined(__SUNPRO_C) #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n))) #elif defined(_MSC_VER) #define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val diff --git a/aom_ports/x86.h b/aom_ports/x86.h index 1d2acefc59..3d27a2e83a 100644 --- a/aom_ports/x86.h +++ b/aom_ports/x86.h @@ -43,7 +43,7 @@ typedef enum { AOM_CPU_LAST } aom_cpu_t; -#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__) +#if defined(__GNUC__) || defined(__ANDROID__) #if AOM_ARCH_X86_64 #define cpuid(func, func2, ax, bx, cx, dx) \ __asm__ __volatile__("cpuid \n\t" \ @@ -249,7 +249,7 @@ static inline int x86_simd_caps(void) { // x86_readtsc64 to read the timestamp counter in a 64-bit integer. The // out-of-order leakage that can occur is minimal compared to total runtime. static inline unsigned int x86_readtsc(void) { -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) unsigned int tsc; __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :); return tsc; @@ -267,7 +267,7 @@ static inline unsigned int x86_readtsc(void) { } // 64-bit CPU cycle counter static inline uint64_t x86_readtsc64(void) { -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) uint32_t hi, lo; __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); return ((uint64_t)hi << 32) | lo; @@ -286,7 +286,7 @@ static inline uint64_t x86_readtsc64(void) { // 32-bit CPU cycle counter with a partial fence against out-of-order execution. 
static inline unsigned int x86_readtscp(void) { -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) unsigned int tscp; __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); return tscp; @@ -331,7 +331,7 @@ static inline unsigned int x86_tsc_end(void) { return v; } -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) #define x86_pause_hint() __asm__ __volatile__("pause \n\t") #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) #define x86_pause_hint() asm volatile("pause \n\t") @@ -343,7 +343,7 @@ static inline unsigned int x86_tsc_end(void) { #endif #endif -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) static void x87_set_control_word(unsigned short mode) { __asm__ __volatile__("fldcw %0" : : "m"(*&mode)); } -- GitLab From 0d5ebd416ed9ddc5291a6c2eeb745fa4758fa79b Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 12:08:46 -0700 Subject: [PATCH 382/391] remove aom_convolve8() from rtcd aom_convolve8_c() is a synonym for aom_scaled_2d_c() in aom_convolve.c; there are no optimized versions of this function, just aom_scaled_2d(). Bug: aomedia:3416 Change-Id: I7088c24e778eeaca5e49f4f8fdae06e0c15d7fc4 --- aom_dsp/aom_convolve.c | 10 +--------- aom_dsp/aom_dsp_rtcd_defs.pl | 1 - av1/common/resize.h | 2 +- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c index e139ba1fb8..99594de7f4 100644 --- a/aom_dsp/aom_convolve.c +++ b/aom_dsp/aom_convolve.c @@ -111,7 +111,7 @@ void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, w, h); } -void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, +void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { @@ -145,14 +145,6 @@ void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, y0_q4, y_step_q4, w, h); } -void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *filter, - int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - aom_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, - y0_q4, y_step_q4, w, h); -} - void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { for (int r = h; r > 0; --r) { diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 1a68f1b7a6..ff6be37591 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -492,7 +492,6 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { # # Sub Pixel Filters # -add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h"; add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; diff --git a/av1/common/resize.h b/av1/common/resize.h index 
c8939bf786..489ad81f51 100644 --- a/av1/common/resize.h +++ b/av1/common/resize.h @@ -115,7 +115,7 @@ static inline int av1_superres_scaled(const AV1_COMMON *cm) { } // The optimized scaler av1_resize_and_extend_frame() can only handle scaling -// ratios >= 1/4 and <= 16. See comment in aom_convolve8_c() for detail. +// ratios >= 1/4 and <= 16. See comment in aom_scaled_2d_c() for detail. // Visual assessment shows that if the scaling ratio or its reciprocal is not a // multiple of 1/16, there are some artifacts in the output of the optimized // scaler, especially on lines, due to non-exact ratio representation. SSSE3 -- GitLab From 93a76ca1b4f7cc2c1ef653fc06d5326c74e85682 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 12:48:26 -0700 Subject: [PATCH 383/391] yv12extend.c: remove 2 unused functions - aom_extend_frame_inner_borders; the last (commented out) reference was removed in: aeee77c483 Multi-thread recon frame padding - aom_extend_frame_borders_y; the last (commented out) reference was removed in: 1c318d528f Remove some commented out code Bug: aomedia:3416 Change-Id: I433efc796eaabc7154ae7366a54b00e9484e7930 --- aom_scale/aom_scale_rtcd.pl | 4 ---- aom_scale/generic/yv12extend.c | 29 ----------------------------- 2 files changed, 33 deletions(-) diff --git a/aom_scale/aom_scale_rtcd.pl b/aom_scale/aom_scale_rtcd.pl index 273e3f9cc1..bcdb8984d1 100644 --- a/aom_scale/aom_scale_rtcd.pl +++ b/aom_scale/aom_scale_rtcd.pl @@ -39,8 +39,4 @@ add_proto qw/void aom_yv12_partial_coloc_copy_v/, "const struct yv12_buffer_conf add_proto qw/void aom_extend_frame_borders_plane_row/, "const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end"; add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, int num_planes"; - -add_proto qw/void aom_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf, const int num_planes"; - -add_proto qw/void aom_extend_frame_borders_y/, "struct yv12_buffer_config *ybf"; 1; diff --git a/aom_scale/generic/yv12extend.c b/aom_scale/generic/yv12extend.c index e40bba320e..be5d854dbc 100644 --- a/aom_scale/generic/yv12extend.c +++ b/aom_scale/generic/yv12extend.c @@ -222,35 +222,6 @@ void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) { extend_frame(ybf, ybf->border, num_planes); } -void aom_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf, - const int num_planes) { - const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS) - ? 
AOMINNERBORDERINPIXELS - : ybf->border; - extend_frame(ybf, inner_bw, num_planes); -} - -void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) { - int ext_size = ybf->border; - assert(ybf->y_height - ybf->y_crop_height < 16); - assert(ybf->y_width - ybf->y_crop_width < 16); - assert(ybf->y_height - ybf->y_crop_height >= 0); - assert(ybf->y_width - ybf->y_crop_width >= 0); -#if CONFIG_AV1_HIGHBITDEPTH - if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { - extend_plane_high( - ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, - ext_size, ext_size, ext_size + ybf->y_height - ybf->y_crop_height, - ext_size + ybf->y_width - ybf->y_crop_width, 0, ybf->y_crop_height); - return; - } -#endif - extend_plane( - ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, - ext_size, ext_size, ext_size + ybf->y_height - ybf->y_crop_height, - ext_size + ybf->y_width - ybf->y_crop_width, 0, ybf->y_crop_height); -} - #if CONFIG_AV1_HIGHBITDEPTH static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); -- GitLab From 132707ac7e262f4b67c7c725b872e4dc18bb728e Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 14:03:59 -0700 Subject: [PATCH 384/391] remove masked_sad*x4d*() These functions are unused outside of the tests. Bug: aomedia:3416 Change-Id: I748c34d3ea9cfe944dcd48031ae092fe2b1a0680 --- aom_dsp/aom_dsp.cmake | 2 - aom_dsp/aom_dsp_rtcd_defs.pl | 28 -- aom_dsp/arm/masked_sad4d_neon.c | 562 ------------------------------- aom_dsp/sad_av1.c | 15 - aom_dsp/x86/masked_sad4d_ssse3.c | 271 --------------- test/masked_sad_test.cc | 115 ------- 6 files changed, 993 deletions(-) delete mode 100644 aom_dsp/arm/masked_sad4d_neon.c delete mode 100644 aom_dsp/x86/masked_sad4d_ssse3.c diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index 46e6da3129..e5f9698bf0 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake @@ -263,7 +263,6 @@ if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3 "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/masked_sad4d_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c" @@ -283,7 +282,6 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/arm/sadxd_neon.c" "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/masked_sad_neon.c" - "${AOM_ROOT}/aom_dsp/arm/masked_sad4d_neon.c" "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c" diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index ff6be37591..a7f74eee30 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -1043,7 +1043,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; add_proto qw/void/, "aom_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; - add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, 
const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]"; } specialize qw/aom_sad128x128x4d avx2 sse2 neon neon_dotprod/; @@ -1119,33 +1118,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_sad8x32x3d neon/; specialize qw/aom_sad4x16x3d neon/; - specialize qw/aom_masked_sad128x128x4d ssse3 neon/; - specialize qw/aom_masked_sad128x64x4d ssse3 neon/; - specialize qw/aom_masked_sad64x128x4d ssse3 neon/; - specialize qw/aom_masked_sad64x64x4d ssse3 neon/; - specialize qw/aom_masked_sad64x32x4d ssse3 neon/; - specialize qw/aom_masked_sad64x16x4d ssse3 neon/; - specialize qw/aom_masked_sad32x64x4d ssse3 neon/; - specialize qw/aom_masked_sad32x32x4d ssse3 neon/; - specialize qw/aom_masked_sad32x16x4d ssse3 neon/; - specialize qw/aom_masked_sad32x8x4d ssse3 neon/; - specialize qw/aom_masked_sad16x64x4d ssse3 neon/; - specialize qw/aom_masked_sad16x32x4d ssse3 neon/; - specialize qw/aom_masked_sad16x16x4d ssse3 neon/; - specialize qw/aom_masked_sad16x8x4d ssse3 neon/; - - specialize qw/aom_masked_sad8x16x4d ssse3 neon/; - specialize qw/aom_masked_sad8x8x4d ssse3 neon/; - specialize qw/aom_masked_sad8x4x4d ssse3 neon/; - specialize qw/aom_masked_sad4x16x4d ssse3 neon/; - specialize qw/aom_masked_sad4x8x4d ssse3 neon/; - specialize qw/aom_masked_sad4x4x4d ssse3 neon/; - - specialize qw/aom_masked_sad4x16x4d ssse3 neon/; - specialize qw/aom_masked_sad16x4x4d ssse3 neon/; - specialize qw/aom_masked_sad8x32x4d ssse3 neon/; - specialize qw/aom_masked_sad32x8x4d ssse3 neon/; - specialize qw/aom_masked_sad64x16x4d ssse3 neon/; # # Multi-block SAD, comparing a reference to N independent blocks # diff --git a/aom_dsp/arm/masked_sad4d_neon.c b/aom_dsp/arm/masked_sad4d_neon.c deleted file mode 100644 index f680c6faa0..0000000000 --- a/aom_dsp/arm/masked_sad4d_neon.c +++ /dev/null @@ -1,562 +0,0 @@ -/* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved. - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <arm_neon.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" -#include "aom/aom_integer.h" -#include "aom_dsp/blend.h" -#include "mem_neon.h" -#include "sum_neon.h" - -static inline uint16x8_t masked_sad_16x1_neon(uint16x8_t sad, - const uint8x16_t s0, - const uint8x16_t a0, - const uint8x16_t b0, - const uint8x16_t m0) { - uint8x16_t m0_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0); - uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m0), vget_low_u8(a0)); - uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m0), vget_high_u8(a0)); - blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m0_inv), vget_low_u8(b0)); - blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m0_inv), vget_high_u8(b0)); - - uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS); - uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS); - uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi); - return vpadalq_u8(sad, vabdq_u8(blend_u8, s0)); -} - -static inline void masked_inv_sadwxhx4d_large_neon( - const uint8_t *src, int src_stride, const uint8_t *const ref[4], - int ref_stride, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, uint32_t res[4], int width, int height, int h_overflow) { - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - int h_limit = height > h_overflow ? h_overflow : height; - - int ref_offset = 0; - int i = 0; - do { - uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - do { - int j = 0; - do { - uint8x16_t s0 = vld1q_u8(src + j); - uint8x16_t p0 = vld1q_u8(second_pred + j); - uint8x16_t m0 = vld1q_u8(mask + j); - sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0, - vld1q_u8(ref[0] + ref_offset + j), m0); - sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0, - vld1q_u8(ref[1] + ref_offset + j), m0); - sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0, - vld1q_u8(ref[2] + ref_offset + j), m0); - sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0, - vld1q_u8(ref[3] + ref_offset + j), m0); - - uint8x16_t s1 = vld1q_u8(src + j + 16); - uint8x16_t p1 = vld1q_u8(second_pred + j + 16); - uint8x16_t m1 = vld1q_u8(mask + j + 16); - sum_hi[0] = masked_sad_16x1_neon( - sum_hi[0], s1, p1, vld1q_u8(ref[0] + ref_offset + j + 16), m1); - sum_hi[1] = masked_sad_16x1_neon( - sum_hi[1], s1, p1, vld1q_u8(ref[1] + ref_offset + j + 16), m1); - sum_hi[2] = masked_sad_16x1_neon( - sum_hi[2], s1, p1, vld1q_u8(ref[2] + ref_offset + j + 16), m1); - sum_hi[3] = masked_sad_16x1_neon( - sum_hi[3], s1, p1, vld1q_u8(ref[3] + ref_offset + j + 16), m1); - - j += 32; - } while (j < width); - - src += src_stride; - ref_offset += ref_stride; - second_pred += width; - mask += mask_stride; - } while (++i < h_limit); - - sum[0] = vpadalq_u16(sum[0], sum_lo[0]); - sum[0] = vpadalq_u16(sum[0], sum_hi[0]); - sum[1] = vpadalq_u16(sum[1], sum_lo[1]); - sum[1] = vpadalq_u16(sum[1], sum_hi[1]); - sum[2] = vpadalq_u16(sum[2], sum_lo[2]); - sum[2] = vpadalq_u16(sum[2], sum_hi[2]); - sum[3] = vpadalq_u16(sum[3], sum_lo[3]); - sum[3] = vpadalq_u16(sum[3], sum_hi[3]); - - h_limit += h_overflow; - } while (i < height); - - vst1q_u32(res, horizontal_add_4d_u32x4(sum)); -} - -static inline void masked_inv_sad128xhx4d_neon( - const uint8_t *src, int src_stride, const uint8_t *const ref[4], - int ref_stride, const uint8_t *second_pred, const uint8_t *mask, - int 
mask_stride, uint32_t res[4], int h) { - masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred, - mask, mask_stride, res, 128, h, 32); -} - -static inline void masked_inv_sad64xhx4d_neon( - const uint8_t *src, int src_stride, const uint8_t *const ref[4], - int ref_stride, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, uint32_t res[4], int h) { - masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred, - mask, mask_stride, res, 64, h, 64); -} - -static inline void masked_sadwxhx4d_large_neon( - const uint8_t *src, int src_stride, const uint8_t *const ref[4], - int ref_stride, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, uint32_t res[4], int width, int height, int h_overflow) { - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - int h_limit = height > h_overflow ? h_overflow : height; - - int ref_offset = 0; - int i = 0; - do { - uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - do { - int j = 0; - do { - uint8x16_t s0 = vld1q_u8(src + j); - uint8x16_t p0 = vld1q_u8(second_pred + j); - uint8x16_t m0 = vld1q_u8(mask + j); - sum_lo[0] = masked_sad_16x1_neon( - sum_lo[0], s0, vld1q_u8(ref[0] + ref_offset + j), p0, m0); - sum_lo[1] = masked_sad_16x1_neon( - sum_lo[1], s0, vld1q_u8(ref[1] + ref_offset + j), p0, m0); - sum_lo[2] = masked_sad_16x1_neon( - sum_lo[2], s0, vld1q_u8(ref[2] + ref_offset + j), p0, m0); - sum_lo[3] = masked_sad_16x1_neon( - sum_lo[3], s0, vld1q_u8(ref[3] + ref_offset + j), p0, m0); - - uint8x16_t s1 = vld1q_u8(src + j + 16); - uint8x16_t p1 = vld1q_u8(second_pred + j + 16); - uint8x16_t m1 = vld1q_u8(mask + j + 16); - sum_hi[0] = masked_sad_16x1_neon( - sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + j + 16), p1, m1); - sum_hi[1] = masked_sad_16x1_neon( - sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + j + 16), p1, m1); - sum_hi[2] = masked_sad_16x1_neon( - sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + j + 16), p1, m1); - sum_hi[3] = masked_sad_16x1_neon( - sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + j + 16), p1, m1); - - j += 32; - } while (j < width); - - src += src_stride; - ref_offset += ref_stride; - second_pred += width; - mask += mask_stride; - } while (++i < h_limit); - - sum[0] = vpadalq_u16(sum[0], sum_lo[0]); - sum[0] = vpadalq_u16(sum[0], sum_hi[0]); - sum[1] = vpadalq_u16(sum[1], sum_lo[1]); - sum[1] = vpadalq_u16(sum[1], sum_hi[1]); - sum[2] = vpadalq_u16(sum[2], sum_lo[2]); - sum[2] = vpadalq_u16(sum[2], sum_hi[2]); - sum[3] = vpadalq_u16(sum[3], sum_lo[3]); - sum[3] = vpadalq_u16(sum[3], sum_hi[3]); - - h_limit += h_overflow; - } while (i < height); - - vst1q_u32(res, horizontal_add_4d_u32x4(sum)); -} - -static inline void masked_sad128xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], - int ref_stride, - const uint8_t *second_pred, - const uint8_t *mask, int mask_stride, - uint32_t res[4], int h) { - masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred, - mask, mask_stride, res, 128, h, 32); -} - -static inline void masked_sad64xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], - int ref_stride, - const uint8_t *second_pred, - const uint8_t *mask, int mask_stride, - uint32_t res[4], int h) { - masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred, - mask, mask_stride, res, 64, h, 64); -} - 
-static inline void masked_inv_sad32xhx4d_neon( - const uint8_t *src, int src_stride, const uint8_t *const ref[4], - int ref_stride, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, uint32_t res[4], int h) { - uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - int ref_offset = 0; - int i = h; - do { - uint8x16_t s0 = vld1q_u8(src); - uint8x16_t p0 = vld1q_u8(second_pred); - uint8x16_t m0 = vld1q_u8(mask); - sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0, - vld1q_u8(ref[0] + ref_offset), m0); - sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0, - vld1q_u8(ref[1] + ref_offset), m0); - sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0, - vld1q_u8(ref[2] + ref_offset), m0); - sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0, - vld1q_u8(ref[3] + ref_offset), m0); - - uint8x16_t s1 = vld1q_u8(src + 16); - uint8x16_t p1 = vld1q_u8(second_pred + 16); - uint8x16_t m1 = vld1q_u8(mask + 16); - sum_hi[0] = masked_sad_16x1_neon(sum_hi[0], s1, p1, - vld1q_u8(ref[0] + ref_offset + 16), m1); - sum_hi[1] = masked_sad_16x1_neon(sum_hi[1], s1, p1, - vld1q_u8(ref[1] + ref_offset + 16), m1); - sum_hi[2] = masked_sad_16x1_neon(sum_hi[2], s1, p1, - vld1q_u8(ref[2] + ref_offset + 16), m1); - sum_hi[3] = masked_sad_16x1_neon(sum_hi[3], s1, p1, - vld1q_u8(ref[3] + ref_offset + 16), m1); - - src += src_stride; - ref_offset += ref_stride; - second_pred += 32; - mask += mask_stride; - } while (--i != 0); - - vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi)); -} - -static inline void masked_sad32xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], - int ref_stride, - const uint8_t *second_pred, - const uint8_t *mask, int mask_stride, - uint32_t res[4], int h) { - uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - int ref_offset = 0; - int i = h; - do { - uint8x16_t s0 = vld1q_u8(src); - uint8x16_t p0 = vld1q_u8(second_pred); - uint8x16_t m0 = vld1q_u8(mask); - sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, - vld1q_u8(ref[0] + ref_offset), p0, m0); - sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, - vld1q_u8(ref[1] + ref_offset), p0, m0); - sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, - vld1q_u8(ref[2] + ref_offset), p0, m0); - sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, - vld1q_u8(ref[3] + ref_offset), p0, m0); - - uint8x16_t s1 = vld1q_u8(src + 16); - uint8x16_t p1 = vld1q_u8(second_pred + 16); - uint8x16_t m1 = vld1q_u8(mask + 16); - sum_hi[0] = masked_sad_16x1_neon( - sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + 16), p1, m1); - sum_hi[1] = masked_sad_16x1_neon( - sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + 16), p1, m1); - sum_hi[2] = masked_sad_16x1_neon( - sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + 16), p1, m1); - sum_hi[3] = masked_sad_16x1_neon( - sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + 16), p1, m1); - - src += src_stride; - ref_offset += ref_stride; - second_pred += 32; - mask += mask_stride; - } while (--i != 0); - - vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi)); -} - -static inline void masked_inv_sad16xhx4d_neon( - const uint8_t *src, int src_stride, const uint8_t *const ref[4], - int ref_stride, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, uint32_t res[4], int h) { - uint16x8_t sum_u16[4] = { vdupq_n_u16(0), 
vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - uint32x4_t sum_u32[4]; - - int ref_offset = 0; - int i = h; - do { - uint8x16_t s0 = vld1q_u8(src); - uint8x16_t p0 = vld1q_u8(second_pred); - uint8x16_t m0 = vld1q_u8(mask); - sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0, p0, - vld1q_u8(ref[0] + ref_offset), m0); - sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0, p0, - vld1q_u8(ref[1] + ref_offset), m0); - sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0, p0, - vld1q_u8(ref[2] + ref_offset), m0); - sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0, p0, - vld1q_u8(ref[3] + ref_offset), m0); - - src += src_stride; - ref_offset += ref_stride; - second_pred += 16; - mask += mask_stride; - } while (--i != 0); - - sum_u32[0] = vpaddlq_u16(sum_u16[0]); - sum_u32[1] = vpaddlq_u16(sum_u16[1]); - sum_u32[2] = vpaddlq_u16(sum_u16[2]); - sum_u32[3] = vpaddlq_u16(sum_u16[3]); - - vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); -} - -static inline void masked_sad16xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], - int ref_stride, - const uint8_t *second_pred, - const uint8_t *mask, int mask_stride, - uint32_t res[4], int h) { - uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - uint32x4_t sum_u32[4]; - - int ref_offset = 0; - int i = h; - do { - uint8x16_t s0 = vld1q_u8(src); - uint8x16_t p0 = vld1q_u8(second_pred); - uint8x16_t m0 = vld1q_u8(mask); - sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0, - vld1q_u8(ref[0] + ref_offset), p0, m0); - sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0, - vld1q_u8(ref[1] + ref_offset), p0, m0); - sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0, - vld1q_u8(ref[2] + ref_offset), p0, m0); - sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0, - vld1q_u8(ref[3] + ref_offset), p0, m0); - - src += src_stride; - ref_offset += ref_stride; - second_pred += 16; - mask += mask_stride; - } while (--i != 0); - - sum_u32[0] = vpaddlq_u16(sum_u16[0]); - sum_u32[1] = vpaddlq_u16(sum_u16[1]); - sum_u32[2] = vpaddlq_u16(sum_u16[2]); - sum_u32[3] = vpaddlq_u16(sum_u16[3]); - - vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); -} - -static inline uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, const uint8x8_t s0, - const uint8x8_t a0, - const uint8x8_t b0, - const uint8x8_t m0) { - uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0); - uint16x8_t blend_u16 = vmull_u8(m0, a0); - blend_u16 = vmlal_u8(blend_u16, m0_inv, b0); - - uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS); - return vabal_u8(sad, blend_u8, s0); -} - -static inline void masked_inv_sad8xhx4d_neon( - const uint8_t *src, int src_stride, const uint8_t *const ref[4], - int ref_stride, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, uint32_t res[4], int h) { - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - int ref_offset = 0; - int i = h; - do { - uint8x8_t s0 = vld1_u8(src); - uint8x8_t p0 = vld1_u8(second_pred); - uint8x8_t m0 = vld1_u8(mask); - sum[0] = - masked_sad_8x1_neon(sum[0], s0, p0, vld1_u8(ref[0] + ref_offset), m0); - sum[1] = - masked_sad_8x1_neon(sum[1], s0, p0, vld1_u8(ref[1] + ref_offset), m0); - sum[2] = - masked_sad_8x1_neon(sum[2], s0, p0, vld1_u8(ref[2] + ref_offset), m0); - sum[3] = - masked_sad_8x1_neon(sum[3], s0, p0, vld1_u8(ref[3] + ref_offset), m0); - - src += src_stride; - ref_offset += ref_stride; - second_pred += 8; - mask += mask_stride; - } while (--i != 0); - - vst1q_u32(res, horizontal_add_4d_u16x8(sum)); -} 
- -static inline void masked_sad8xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], - int ref_stride, - const uint8_t *second_pred, - const uint8_t *mask, int mask_stride, - uint32_t res[4], int h) { - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - int ref_offset = 0; - int i = h; - do { - uint8x8_t s0 = vld1_u8(src); - uint8x8_t p0 = vld1_u8(second_pred); - uint8x8_t m0 = vld1_u8(mask); - - sum[0] = - masked_sad_8x1_neon(sum[0], s0, vld1_u8(ref[0] + ref_offset), p0, m0); - sum[1] = - masked_sad_8x1_neon(sum[1], s0, vld1_u8(ref[1] + ref_offset), p0, m0); - sum[2] = - masked_sad_8x1_neon(sum[2], s0, vld1_u8(ref[2] + ref_offset), p0, m0); - sum[3] = - masked_sad_8x1_neon(sum[3], s0, vld1_u8(ref[3] + ref_offset), p0, m0); - - src += src_stride; - ref_offset += ref_stride; - second_pred += 8; - mask += mask_stride; - } while (--i != 0); - - vst1q_u32(res, horizontal_add_4d_u16x8(sum)); -} - -static inline void masked_inv_sad4xhx4d_neon( - const uint8_t *src, int src_stride, const uint8_t *const ref[4], - int ref_stride, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, uint32_t res[4], int h) { - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - int ref_offset = 0; - int i = h / 2; - do { - uint8x8_t s = load_unaligned_u8(src, src_stride); - uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); - uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); - uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); - uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride); - uint8x8_t p0 = vld1_u8(second_pred); - uint8x8_t m0 = load_unaligned_u8(mask, mask_stride); - - sum[0] = masked_sad_8x1_neon(sum[0], s, p0, r0, m0); - sum[1] = masked_sad_8x1_neon(sum[1], s, p0, r1, m0); - sum[2] = masked_sad_8x1_neon(sum[2], s, p0, r2, m0); - sum[3] = masked_sad_8x1_neon(sum[3], s, p0, r3, m0); - - src += 2 * src_stride; - ref_offset += 2 * ref_stride; - second_pred += 2 * 4; - mask += 2 * mask_stride; - } while (--i != 0); - - vst1q_u32(res, horizontal_add_4d_u16x8(sum)); -} - -static inline void masked_sad4xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], - int ref_stride, - const uint8_t *second_pred, - const uint8_t *mask, int mask_stride, - uint32_t res[4], int h) { - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - int ref_offset = 0; - int i = h / 2; - do { - uint8x8_t s = load_unaligned_u8(src, src_stride); - uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); - uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); - uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); - uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride); - uint8x8_t p0 = vld1_u8(second_pred); - uint8x8_t m0 = load_unaligned_u8(mask, mask_stride); - - sum[0] = masked_sad_8x1_neon(sum[0], s, r0, p0, m0); - sum[1] = masked_sad_8x1_neon(sum[1], s, r1, p0, m0); - sum[2] = masked_sad_8x1_neon(sum[2], s, r2, p0, m0); - sum[3] = masked_sad_8x1_neon(sum[3], s, r3, p0, m0); - - src += 2 * src_stride; - ref_offset += 2 * ref_stride; - second_pred += 2 * 4; - mask += 2 * mask_stride; - } while (--i != 0); - - vst1q_u32(res, horizontal_add_4d_u16x8(sum)); -} - -#define MASKED_SAD4D_WXH_NEON(w, h) \ - void aom_masked_sad##w##x##h##x4d_neon( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t 
*second_pred, const uint8_t *msk, \ - int msk_stride, int invert_mask, uint32_t res[4]) { \ - if (invert_mask) { \ - masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \ - second_pred, msk, msk_stride, res, h); \ - } else { \ - masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, second_pred, \ - msk, msk_stride, res, h); \ - } \ - } - -MASKED_SAD4D_WXH_NEON(4, 8) -MASKED_SAD4D_WXH_NEON(4, 4) - -MASKED_SAD4D_WXH_NEON(8, 16) -MASKED_SAD4D_WXH_NEON(8, 8) -MASKED_SAD4D_WXH_NEON(8, 4) - -MASKED_SAD4D_WXH_NEON(16, 32) -MASKED_SAD4D_WXH_NEON(16, 16) -MASKED_SAD4D_WXH_NEON(16, 8) - -MASKED_SAD4D_WXH_NEON(32, 64) -MASKED_SAD4D_WXH_NEON(32, 32) -MASKED_SAD4D_WXH_NEON(32, 16) - -MASKED_SAD4D_WXH_NEON(64, 128) -MASKED_SAD4D_WXH_NEON(64, 64) -MASKED_SAD4D_WXH_NEON(64, 32) - -MASKED_SAD4D_WXH_NEON(128, 128) -MASKED_SAD4D_WXH_NEON(128, 64) - -#if !CONFIG_REALTIME_ONLY -MASKED_SAD4D_WXH_NEON(4, 16) -MASKED_SAD4D_WXH_NEON(16, 4) -MASKED_SAD4D_WXH_NEON(8, 32) -MASKED_SAD4D_WXH_NEON(32, 8) -MASKED_SAD4D_WXH_NEON(16, 64) -MASKED_SAD4D_WXH_NEON(64, 16) -#endif diff --git a/aom_dsp/sad_av1.c b/aom_dsp/sad_av1.c index f016f56a9f..c16a237355 100644 --- a/aom_dsp/sad_av1.c +++ b/aom_dsp/sad_av1.c @@ -49,21 +49,6 @@ static inline unsigned int masked_sad(const uint8_t *src, int src_stride, else \ return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \ msk_stride, m, n); \ - } \ - void aom_masked_sad##m##x##n##x4d_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int invert_mask, unsigned sads[4]) { \ - if (!invert_mask) \ - for (int i = 0; i < 4; i++) { \ - sads[i] = masked_sad(src, src_stride, ref[i], ref_stride, second_pred, \ - m, msk, msk_stride, m, n); \ - } \ - else \ - for (int i = 0; i < 4; i++) { \ - sads[i] = masked_sad(src, src_stride, second_pred, m, ref[i], \ - ref_stride, msk, msk_stride, m, n); \ - } \ } /* clang-format off */ diff --git a/aom_dsp/x86/masked_sad4d_ssse3.c b/aom_dsp/x86/masked_sad4d_ssse3.c deleted file mode 100644 index 2e69f957d5..0000000000 --- a/aom_dsp/x86/masked_sad4d_ssse3.c +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved. - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdio.h> -#include <tmmintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/blend.h" -#include "aom/aom_integer.h" -#include "aom_dsp/x86/synonyms.h" - -#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" - -#define MASK_SAD16XH_ONE_REF(idx) \ - a = _mm_loadu_si128((const __m128i *)&ref##idx[x]); \ - data_l = _mm_unpacklo_epi8(a, b); \ - mask_l = _mm_unpacklo_epi8(m, m_inv); \ - pred_l = _mm_maddubs_epi16(data_l, mask_l); \ - pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ - \ - data_r = _mm_unpackhi_epi8(a, b); \ - mask_r = _mm_unpackhi_epi8(m, m_inv); \ - pred_r = _mm_maddubs_epi16(data_r, mask_r); \ - pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ - \ - pred = _mm_packus_epi16(pred_l, pred_r); \ - res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); - -static inline void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int width, int height, int inv_mask, - unsigned sad_array[4]) { - int x, y; - __m128i a; - __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred; - const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - __m128i res0 = _mm_setzero_si128(); - __m128i res1 = _mm_setzero_si128(); - __m128i res2 = _mm_setzero_si128(); - __m128i res3 = _mm_setzero_si128(); - const uint8_t *ref0 = a_ptr[0]; - const uint8_t *ref1 = a_ptr[1]; - const uint8_t *ref2 = a_ptr[2]; - const uint8_t *ref3 = a_ptr[3]; - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x += 16) { - const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); - const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); - const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]); - __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); - __m128i m = inv_mask ? m_inv : m_copy; - m_inv = inv_mask ? 
m_copy : m_inv; - - MASK_SAD16XH_ONE_REF(0) - MASK_SAD16XH_ONE_REF(1) - MASK_SAD16XH_ONE_REF(2) - MASK_SAD16XH_ONE_REF(3) - } - - src_ptr += src_stride; - ref0 += a_stride; - ref1 += a_stride; - ref2 += a_stride; - ref3 += a_stride; - b_ptr += b_stride; - m_ptr += m_stride; - } - res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), - _mm_unpackhi_epi32(res0, res1)); - res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), - _mm_unpackhi_epi32(res2, res3)); - - res0 = _mm_unpacklo_epi64(res0, res2); - _mm_storeu_si128((__m128i *)sad_array, res0); -} - -#define MASK_SAD8XH_ONE_REF(idx) \ - const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx); \ - const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \ - data_l = _mm_unpacklo_epi8(a##idx##0, b0); \ - mask_l = _mm_unpacklo_epi8(m, m_inv); \ - pred_l = _mm_maddubs_epi16(data_l, mask_l); \ - pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ - \ - data_r = _mm_unpacklo_epi8(a##idx##1, b1); \ - mask_r = _mm_unpackhi_epi8(m, m_inv); \ - pred_r = _mm_maddubs_epi16(data_r, mask_r); \ - pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ - \ - pred = _mm_packus_epi16(pred_l, pred_r); \ - res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); - -static void masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int height, int inv_mask, - unsigned sad_array[4]) { - const uint8_t *ref0 = ref_array[0]; - const uint8_t *ref1 = ref_array[1]; - const uint8_t *ref2 = ref_array[2]; - const uint8_t *ref3 = ref_array[3]; - __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred; - __m128i res0 = _mm_setzero_si128(); - __m128i res1 = _mm_setzero_si128(); - __m128i res2 = _mm_setzero_si128(); - __m128i res3 = _mm_setzero_si128(); - const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - - for (int y = 0; y < height; y += 2) { - const __m128i src = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)src_ptr), - _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride))); - const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr); - const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride)); - const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr); - const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride)); - __m128i m_copy = _mm_unpacklo_epi64(m0, m1); - __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); - __m128i m = inv_mask ? m_inv : m_copy; - m_inv = inv_mask ? 
m_copy : m_inv; - - MASK_SAD8XH_ONE_REF(0) - MASK_SAD8XH_ONE_REF(1) - MASK_SAD8XH_ONE_REF(2) - MASK_SAD8XH_ONE_REF(3) - - ref0 += 2 * a_stride; - ref1 += 2 * a_stride; - ref2 += 2 * a_stride; - ref3 += 2 * a_stride; - src_ptr += 2 * src_stride; - b_ptr += 2 * b_stride; - m_ptr += 2 * m_stride; - } - res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), - _mm_unpackhi_epi32(res0, res1)); - res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), - _mm_unpackhi_epi32(res2, res3)); - res0 = _mm_unpacklo_epi64(res0, res2); - _mm_storeu_si128((__m128i *)sad_array, res0); -} - -#define MASK_SAD4XH_ONE_REF(idx) \ - a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)ref##idx), \ - _mm_cvtsi32_si128(*(int *)&ref##idx[a_stride])); \ - data = _mm_unpacklo_epi8(a, b); \ - mask = _mm_unpacklo_epi8(m, m_inv); \ - pred = _mm_maddubs_epi16(data, mask); \ - pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \ - \ - pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ - res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); - -static void masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int height, int inv_mask, - unsigned sad_array[4]) { - const uint8_t *ref0 = ref_array[0]; - const uint8_t *ref1 = ref_array[1]; - const uint8_t *ref2 = ref_array[2]; - const uint8_t *ref3 = ref_array[3]; - __m128i data, pred, mask; - __m128i res0 = _mm_setzero_si128(); - __m128i res1 = _mm_setzero_si128(); - __m128i res2 = _mm_setzero_si128(); - __m128i res3 = _mm_setzero_si128(); - __m128i a; - const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - - for (int y = 0; y < height; y += 2) { - const __m128i src = - _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr), - _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride])); - const __m128i b = - _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr), - _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride])); - const __m128i m_copy = - _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr), - _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride])); - - __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); - __m128i m = inv_mask ? m_inv : m_copy; - m_inv = inv_mask ? 
m_copy : m_inv; - - MASK_SAD4XH_ONE_REF(0) - MASK_SAD4XH_ONE_REF(1) - MASK_SAD4XH_ONE_REF(2) - MASK_SAD4XH_ONE_REF(3) - - ref0 += 2 * a_stride; - ref1 += 2 * a_stride; - ref2 += 2 * a_stride; - ref3 += 2 * a_stride; - src_ptr += 2 * src_stride; - b_ptr += 2 * b_stride; - m_ptr += 2 * m_stride; - } - res0 = _mm_unpacklo_epi32(res0, res1); - res2 = _mm_unpacklo_epi32(res2, res3); - res0 = _mm_unpacklo_epi64(res0, res2); - _mm_storeu_si128((__m128i *)sad_array, res0); -} - -#define MASKSADMXN_SSSE3(m, n) \ - void aom_masked_sad##m##x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \ - msk_stride, m, n, inv_mask, sad_array); \ - } - -#define MASKSAD8XN_SSSE3(n) \ - void aom_masked_sad8x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 8, \ - msk, msk_stride, n, inv_mask, sad_array); \ - } - -#define MASKSAD4XN_SSSE3(n) \ - void aom_masked_sad4x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 4, \ - msk, msk_stride, n, inv_mask, sad_array); \ - } - -MASKSADMXN_SSSE3(128, 128) -MASKSADMXN_SSSE3(128, 64) -MASKSADMXN_SSSE3(64, 128) -MASKSADMXN_SSSE3(64, 64) -MASKSADMXN_SSSE3(64, 32) -MASKSADMXN_SSSE3(32, 64) -MASKSADMXN_SSSE3(32, 32) -MASKSADMXN_SSSE3(32, 16) -MASKSADMXN_SSSE3(16, 32) -MASKSADMXN_SSSE3(16, 16) -MASKSADMXN_SSSE3(16, 8) -MASKSAD8XN_SSSE3(16) -MASKSAD8XN_SSSE3(8) -MASKSAD8XN_SSSE3(4) -MASKSAD4XN_SSSE3(8) -MASKSAD4XN_SSSE3(4) - -#if !CONFIG_REALTIME_ONLY -MASKSAD4XN_SSSE3(16) -MASKSADMXN_SSSE3(16, 4) -MASKSAD8XN_SSSE3(32) -MASKSADMXN_SSSE3(32, 8) -MASKSADMXN_SSSE3(16, 64) -MASKSADMXN_SSSE3(64, 16) -#endif // !CONFIG_REALTIME_ONLY diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc index 8f825c6429..1b9e95386f 100644 --- a/test/masked_sad_test.cc +++ b/test/masked_sad_test.cc @@ -85,29 +85,6 @@ class MaskedSADTest : public MaskedSADTestBase, }; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADTest); -class MaskedSADx4Test : public MaskedSADTestBase, - public ::testing::WithParamInterface<MaskedSADx4Param> { - public: - ~MaskedSADx4Test() override = default; - void SetUp() override { - maskedSAD_op_ = GET_PARAM(0); - ref_maskedSAD_op_ = GET_PARAM(1); - } - void runRef(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[], - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, - int msk_stride, int inv_mask, unsigned sads[], - int times) override; - void runTest(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[], - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, - int msk_stride, int inv_mask, unsigned sads[], - int times) override; - - protected: - MaskedSADx4Func maskedSAD_op_; - MaskedSADx4Func ref_maskedSAD_op_; -}; -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADx4Test); - void MaskedSADTest::runRef(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[], int ref_stride, const uint8_t *second_pred, const uint8_t 
*msk, @@ -136,34 +113,6 @@ void MaskedSADTest::runTest(const uint8_t *src_ptr, int src_stride, } } -void MaskedSADx4Test::runRef(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr[], int ref_stride, - const uint8_t *second_pred, const uint8_t *msk, - int msk_stride, int invert_mask, unsigned sads[], - int times) { - for (int repeat = 0; repeat < times; ++repeat) { - ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, - msk, msk_stride, invert_mask, sads); - } -} - -void MaskedSADx4Test::runTest(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr[], int ref_stride, - const uint8_t *second_pred, const uint8_t *msk, - int msk_stride, int invert_mask, unsigned sads[], - int times) { - if (times == 1) { - API_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr, - ref_stride, second_pred, msk, - msk_stride, invert_mask, sads)); - } else { - for (int repeat = 0; repeat < times; ++repeat) { - maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, msk, - msk_stride, invert_mask, sads); - } - } -} - void MaskedSADTestBase::runMaskedSADTest(int run_times) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const unsigned kBlockSize = MAX_SB_SIZE * MAX_SB_SIZE; @@ -245,10 +194,6 @@ TEST_P(MaskedSADTest, OperationCheck) { runMaskedSADTest(1); } TEST_P(MaskedSADTest, DISABLED_Speed) { runMaskedSADTest(2000000); } -TEST_P(MaskedSADx4Test, OperationCheck) { runMaskedSADTest(1); } - -TEST_P(MaskedSADx4Test, DISABLED_Speed) { runMaskedSADTest(2000000); } - #if CONFIG_AV1_HIGHBITDEPTH typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, @@ -376,36 +321,6 @@ const MaskedSADParam msad_test[] = { INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test)); -const MaskedSADx4Param msadx4_test[] = { - make_tuple(&aom_masked_sad4x4x4d_ssse3, &aom_masked_sad4x4x4d_c), - make_tuple(&aom_masked_sad4x8x4d_ssse3, &aom_masked_sad4x8x4d_c), - make_tuple(&aom_masked_sad8x4x4d_ssse3, &aom_masked_sad8x4x4d_c), - make_tuple(&aom_masked_sad8x8x4d_ssse3, &aom_masked_sad8x8x4d_c), - make_tuple(&aom_masked_sad8x16x4d_ssse3, &aom_masked_sad8x16x4d_c), - make_tuple(&aom_masked_sad16x8x4d_ssse3, &aom_masked_sad16x8x4d_c), - make_tuple(&aom_masked_sad16x16x4d_ssse3, &aom_masked_sad16x16x4d_c), - make_tuple(&aom_masked_sad16x32x4d_ssse3, &aom_masked_sad16x32x4d_c), - make_tuple(&aom_masked_sad32x16x4d_ssse3, &aom_masked_sad32x16x4d_c), - make_tuple(&aom_masked_sad32x32x4d_ssse3, &aom_masked_sad32x32x4d_c), - make_tuple(&aom_masked_sad32x64x4d_ssse3, &aom_masked_sad32x64x4d_c), - make_tuple(&aom_masked_sad64x32x4d_ssse3, &aom_masked_sad64x32x4d_c), - make_tuple(&aom_masked_sad64x64x4d_ssse3, &aom_masked_sad64x64x4d_c), - make_tuple(&aom_masked_sad64x128x4d_ssse3, &aom_masked_sad64x128x4d_c), - make_tuple(&aom_masked_sad128x64x4d_ssse3, &aom_masked_sad128x64x4d_c), - make_tuple(&aom_masked_sad128x128x4d_ssse3, &aom_masked_sad128x128x4d_c), -#if !CONFIG_REALTIME_ONLY - make_tuple(&aom_masked_sad4x16x4d_ssse3, &aom_masked_sad4x16x4d_c), - make_tuple(&aom_masked_sad16x4x4d_ssse3, &aom_masked_sad16x4x4d_c), - make_tuple(&aom_masked_sad8x32x4d_ssse3, &aom_masked_sad8x32x4d_c), - make_tuple(&aom_masked_sad32x8x4d_ssse3, &aom_masked_sad32x8x4d_c), - make_tuple(&aom_masked_sad16x64x4d_ssse3, &aom_masked_sad16x64x4d_c), - make_tuple(&aom_masked_sad64x16x4d_ssse3, &aom_masked_sad64x16x4d_c), -#endif -}; - -INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADx4Test, - ::testing::ValuesIn(msadx4_test)); - 
#if CONFIG_AV1_HIGHBITDEPTH const HighbdMaskedSADParam hbd_msad_test[] = { make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c), @@ -548,36 +463,6 @@ const MaskedSADParam msad_test[] = { INSTANTIATE_TEST_SUITE_P(NEON, MaskedSADTest, ::testing::ValuesIn(msad_test)); -const MaskedSADx4Param msadx4_test[] = { - make_tuple(&aom_masked_sad4x4x4d_neon, &aom_masked_sad4x4x4d_c), - make_tuple(&aom_masked_sad4x8x4d_neon, &aom_masked_sad4x8x4d_c), - make_tuple(&aom_masked_sad8x4x4d_neon, &aom_masked_sad8x4x4d_c), - make_tuple(&aom_masked_sad8x8x4d_neon, &aom_masked_sad8x8x4d_c), - make_tuple(&aom_masked_sad8x16x4d_neon, &aom_masked_sad8x16x4d_c), - make_tuple(&aom_masked_sad16x8x4d_neon, &aom_masked_sad16x8x4d_c), - make_tuple(&aom_masked_sad16x16x4d_neon, &aom_masked_sad16x16x4d_c), - make_tuple(&aom_masked_sad16x32x4d_neon, &aom_masked_sad16x32x4d_c), - make_tuple(&aom_masked_sad32x16x4d_neon, &aom_masked_sad32x16x4d_c), - make_tuple(&aom_masked_sad32x32x4d_neon, &aom_masked_sad32x32x4d_c), - make_tuple(&aom_masked_sad32x64x4d_neon, &aom_masked_sad32x64x4d_c), - make_tuple(&aom_masked_sad64x32x4d_neon, &aom_masked_sad64x32x4d_c), - make_tuple(&aom_masked_sad64x64x4d_neon, &aom_masked_sad64x64x4d_c), - make_tuple(&aom_masked_sad64x128x4d_neon, &aom_masked_sad64x128x4d_c), - make_tuple(&aom_masked_sad128x64x4d_neon, &aom_masked_sad128x64x4d_c), - make_tuple(&aom_masked_sad128x128x4d_neon, &aom_masked_sad128x128x4d_c), -#if !CONFIG_REALTIME_ONLY - make_tuple(&aom_masked_sad4x16x4d_neon, &aom_masked_sad4x16x4d_c), - make_tuple(&aom_masked_sad16x4x4d_neon, &aom_masked_sad16x4x4d_c), - make_tuple(&aom_masked_sad8x32x4d_neon, &aom_masked_sad8x32x4d_c), - make_tuple(&aom_masked_sad32x8x4d_neon, &aom_masked_sad32x8x4d_c), - make_tuple(&aom_masked_sad16x64x4d_neon, &aom_masked_sad16x64x4d_c), - make_tuple(&aom_masked_sad64x16x4d_neon, &aom_masked_sad64x16x4d_c), -#endif -}; - -INSTANTIATE_TEST_SUITE_P(NEON, MaskedSADx4Test, - ::testing::ValuesIn(msadx4_test)); - #if CONFIG_AV1_HIGHBITDEPTH const MaskedSADParam hbd_msad_neon_test[] = { make_tuple(&aom_highbd_masked_sad4x4_neon, &aom_highbd_masked_sad4x4_c), -- GitLab From 9c7f28d2a6c01017bbbb55377229f40f6b533270 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 13:34:18 -0700 Subject: [PATCH 385/391] binary_codes_reader.c: make 2 functions static aom_read_primitive_quniform_() and aom_read_primitive_subexpfin() are only used in this file. 
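Background on the code being made static: the quasi-uniform code spends l-1 bits on the first m = 2^l - n symbols and l bits on the remaining n - m, where l = floor(log2(n)) + 1. A self-contained sketch of the matching bit-cost computation, with a local msb() standing in for the library's get_msb(); it mirrors count_primitive_quniform() from a later patch in this series:

#include <assert.h>

/* Index of the highest set bit, e.g. msb(5) == 2. */
static int msb(unsigned int n) {
  int b = -1;
  while (n) { n >>= 1; ++b; }
  return b;
}

/* Bits spent by the quasi-uniform code on v in [0, n-1]. */
static int quniform_bits(unsigned int n, unsigned int v) {
  if (n <= 1) return 0;
  const int l = msb(n) + 1;
  const unsigned int m = (1u << l) - n; /* count of short (l-1 bit) codewords */
  assert(v < n);
  return v < m ? l - 1 : l;
}

For n = 5 this gives two bits for symbols 0-2 and three bits for symbols 3-4, matching what read_primitive_quniform_() consumes above.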
Bug: aomedia:3416 Change-Id: I058e97c082117817b2aa908f5d0a7cd73c78e0bd --- aom_dsp/binary_codes_reader.c | 17 +++++++++++------ aom_dsp/binary_codes_reader.h | 7 ------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/aom_dsp/binary_codes_reader.c b/aom_dsp/binary_codes_reader.c index 3af1c18642..a87653278f 100644 --- a/aom_dsp/binary_codes_reader.c +++ b/aom_dsp/binary_codes_reader.c @@ -12,8 +12,13 @@ #include "aom_dsp/binary_codes_reader.h" #include "aom_dsp/recenter.h" -uint16_t aom_read_primitive_quniform_(aom_reader *r, - uint16_t n ACCT_STR_PARAM) { +#define read_primitive_quniform(r, n, ACCT_STR_NAME) \ + read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) +#define read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \ + read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME)) + +static uint16_t read_primitive_quniform_(aom_reader *r, + uint16_t n ACCT_STR_PARAM) { if (n <= 1) return 0; const int l = get_msb(n) + 1; const int m = (1 << l) - n; @@ -23,8 +28,8 @@ uint16_t aom_read_primitive_quniform_(aom_reader *r, // Decode finite subexponential code that for a symbol v in [0, n-1] with // parameter k -uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, - uint16_t k ACCT_STR_PARAM) { +static uint16_t read_primitive_subexpfin_(aom_reader *r, uint16_t n, + uint16_t k ACCT_STR_PARAM) { int i = 0; int mk = 0; @@ -33,7 +38,7 @@ uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, int a = (1 << b); if (n <= mk + 3 * a) { - return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; + return read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; } if (!aom_read_bit(r, ACCT_STR_NAME)) { @@ -51,5 +56,5 @@ uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, uint16_t ref ACCT_STR_PARAM) { return inv_recenter_finite_nonneg( - n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); + n, ref, read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); } diff --git a/aom_dsp/binary_codes_reader.h b/aom_dsp/binary_codes_reader.h index 8ef16eb11c..ad529a3214 100644 --- a/aom_dsp/binary_codes_reader.h +++ b/aom_dsp/binary_codes_reader.h @@ -24,16 +24,9 @@ extern "C" { #include "aom_dsp/bitreader.h" #include "aom_dsp/bitreader_buffer.h" -#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \ - aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \ - aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME)) #define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \ aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME)) -uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM); -uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, - uint16_t k ACCT_STR_PARAM); uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, uint16_t ref ACCT_STR_PARAM); -- GitLab From 02f67ff33995dfc47b5f6a6d4618d042ea1aa6f2 Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 12:13:53 -0700 Subject: [PATCH 386/391] binary_codes_writer.c: make some functions static aom_count_primitive_quniform() & aom_count_primitive_subexpfin(). 
+ delete unused aom_count_primitive_symmetric(); this was added, but never referenced in or after: 47748b5692 Adds binary code lib for coding various symbols Bug: aomedia:3416 Change-Id: I44d17c5a0d71e4d3f74a69fbb89f22adc2637338 --- aom_dsp/binary_codes_writer.c | 12 ++++-------- aom_dsp/binary_codes_writer.h | 3 --- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/aom_dsp/binary_codes_writer.c b/aom_dsp/binary_codes_writer.c index c7722283b1..169e80e40e 100644 --- a/aom_dsp/binary_codes_writer.c +++ b/aom_dsp/binary_codes_writer.c @@ -32,10 +32,6 @@ void aom_write_primitive_symmetric(aom_writer *w, int16_t v, } } -int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) { - return (v == 0 ? 1 : abs_bits + 2); -} - // Encodes a value v in [0, n-1] quasi-uniformly void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { if (n <= 1) return; @@ -49,7 +45,7 @@ void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { } } -int aom_count_primitive_quniform(uint16_t n, uint16_t v) { +static int count_primitive_quniform(uint16_t n, uint16_t v) { if (n <= 1) return 0; const int l = get_msb(n) + 1; const int m = (1 << l) - n; @@ -81,7 +77,7 @@ void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, } } -int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { +static int count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { int count = 0; int i = 0; int mk = 0; @@ -89,7 +85,7 @@ int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { int b = (i ? k + i - 1 : k); int a = (1 << b); if (n <= mk + 3 * a) { - count += aom_count_primitive_quniform(n - mk, v - mk); + count += count_primitive_quniform(n - mk, v - mk); break; } else { int t = (v >= mk + a); @@ -125,7 +121,7 @@ void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, uint16_t v) { - return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v)); + return count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v)); } int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, diff --git a/aom_dsp/binary_codes_writer.h b/aom_dsp/binary_codes_writer.h index 99524d2df2..a740f99cc3 100644 --- a/aom_dsp/binary_codes_writer.h +++ b/aom_dsp/binary_codes_writer.h @@ -50,9 +50,6 @@ void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, int16_t v); // Functions that counts bits for the above primitives -int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits); -int aom_count_primitive_quniform(uint16_t n, uint16_t v); -int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v); int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, uint16_t v); int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, -- GitLab From 68bc71348beb562d1a83b18d36ae875bc45a585e Mon Sep 17 00:00:00 2001 From: James Zern <jzern@google.com> Date: Thu, 15 Aug 2024 12:30:36 -0700 Subject: [PATCH 387/391] remove aom_dist_wtd_comp_avg_upsampled_pred*() This function was renamed from aom_jnt_comp_avg_upsampled_pred() in: 0c96c11e58 Rename jnt_comp convolves to dist_wtd_comp but had been unused since: c05147803f use_jnt_comp_avg should be 0 in motion search Bug: aomedia:3416 Change-Id: I5cc92a2165bea54a1354004bd213c56c2ab95edf --- av1/av1.cmake | 9 -- av1/common/av1_rtcd_defs.pl | 6 - av1/encoder/arm/reconinter_enc_neon.c | 13 -- 
av1/encoder/reconinter_enc.c | 24 ---- av1/encoder/x86/reconinter_enc_ssse3.c | 67 ----------- test/comp_avg_pred_test.cc | 160 ------------------------- 6 files changed, 279 deletions(-) delete mode 100644 av1/encoder/x86/reconinter_enc_ssse3.c diff --git a/av1/av1.cmake b/av1/av1.cmake index e67ac8dff4..cdb97afc3f 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -330,9 +330,6 @@ if(NOT CONFIG_EXCLUDE_SIMD_MISMATCH) "${AOM_ROOT}/av1/encoder/x86/ml_sse3.h") endif() -list(APPEND AOM_AV1_ENCODER_INTRIN_SSSE3 - "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_ssse3.c") - list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm") @@ -667,12 +664,6 @@ function(setup_av1_targets) "AOM_AV1_DECODER_INTRIN_SSSE3") endif() endif() - if(CONFIG_AV1_ENCODER) - if(AOM_AV1_ENCODER_INTRIN_SSSE3) - add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_encoder" - "AOM_AV1_ENCODER_INTRIN_SSSE3") - endif() - endif() endif() if(HAVE_SSE4_1) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 94a5171080..1254715f83 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -270,12 +270,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { int ref_stride, int subpel_search"; specialize qw/aom_comp_avg_upsampled_pred sse2 neon/; - add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search"; - specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3 neon/; - if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3, diff --git a/av1/encoder/arm/reconinter_enc_neon.c b/av1/encoder/arm/reconinter_enc_neon.c index 4ebb34cb08..87e91460ab 100644 --- a/av1/encoder/arm/reconinter_enc_neon.c +++ b/av1/encoder/arm/reconinter_enc_neon.c @@ -138,19 +138,6 @@ void aom_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, aom_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, width); } -void aom_dist_wtd_comp_avg_upsampled_pred_neon( - MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { - aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride, - subpel_search); - - aom_dist_wtd_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, - width, jcp_param); -} - #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c index 0396603ca1..4150ea4069 100644 --- a/av1/encoder/reconinter_enc.c +++ b/av1/encoder/reconinter_enc.c @@ -534,30 +534,6 @@ void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm, mask_stride, invert_mask); } -void aom_dist_wtd_comp_avg_upsampled_pred_c( - MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, 
int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { - int i, j; - const int fwd_offset = jcp_param->fwd_offset; - const int bck_offset = jcp_param->bck_offset; - - aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride, - subpel_search); - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; - tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); - comp_pred[j] = (uint8_t)tmp; - } - comp_pred += width; - pred += width; - } -} - #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, diff --git a/av1/encoder/x86/reconinter_enc_ssse3.c b/av1/encoder/x86/reconinter_enc_ssse3.c deleted file mode 100644 index f31c0eaa7e..0000000000 --- a/av1/encoder/x86/reconinter_enc_ssse3.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2021, Alliance for Open Media. All rights reserved. - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <emmintrin.h> // SSE2 -#include <tmmintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" -#include "config/av1_rtcd.h" - -#include "aom_dsp/x86/synonyms.h" - -static inline void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, - const __m128i *w, const __m128i *r, - void *const result) { - __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); - __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w); - __m128i round_lo = _mm_add_epi16(mult_lo, *r); - __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS); - - __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1); - __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w); - __m128i round_hi = _mm_add_epi16(mult_hi, *r); - __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS); - - xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); -} - -void aom_dist_wtd_comp_avg_upsampled_pred_ssse3( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { - int n; - int i; - aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); - /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ - assert(!(width * height & 15)); - n = width * height >> 4; - - const int8_t w0 = (int8_t)jcp_param->fwd_offset; - const int8_t w1 = (int8_t)jcp_param->bck_offset; - const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, - w1, w0, w1, w0); - const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1); - const __m128i r = _mm_set1_epi16(round); - - for (i = 0; i < n; i++) { - __m128i p0 = xx_loadu_128(comp_pred); - __m128i p1 = xx_loadu_128(pred); - - compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); - - comp_pred += 16; - pred += 16; - } -} 
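Both removed kernels compute the same per-pixel distance-weighted blend: comp_pred scaled by fwd_offset plus pred scaled by bck_offset, rounded and shifted down by DIST_PRECISION_BITS. A minimal scalar sketch of that arithmetic (assuming DIST_PRECISION_BITS == 4, i.e. weight pairs such as (9, 7) that sum to 16; the helper name is illustrative, not libaom API):

  #include <stdint.h>
  #include <stdio.h>

  #define DIST_PRECISION_BITS 4 /* assumed; each (fwd, bck) pair sums to 16 */
  #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

  /* Per-pixel blend as in the removed C kernel: the forward weight applies
     to comp_pred and the backward weight to pred. */
  static uint8_t dist_wtd_avg(uint8_t comp, uint8_t pred, int fwd_offset,
                              int bck_offset) {
    const int tmp = pred * bck_offset + comp * fwd_offset;
    return (uint8_t)ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
  }

  int main(void) {
    /* (100 * 7 + 200 * 9 + 8) >> 4 = 156 */
    printf("%d\n", dist_wtd_avg(200, 100, /*fwd_offset=*/9, /*bck_offset=*/7));
    return 0;
  }

The deleted SSSE3 kernel vectorized exactly this: it interleaved comp_pred and pred bytes so that _mm_maddubs_epi16 computes each (fwd, bck) multiply-add pair in a single instruction.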
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index 18c077b1ae..02e0210ad5 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -47,9 +47,6 @@ typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred, typedef std::tuple<distwtdcompavg_func, BLOCK_SIZE> AV1DistWtdCompAvgParam; -typedef std::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE> - AV1DistWtdCompAvgUpsampledParam; - typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam; #if CONFIG_AV1_HIGHBITDEPTH @@ -92,14 +89,6 @@ BuildParams(highbddistwtdcompavgupsampled_func filter) { } #endif // HAVE_SSSE3 -#if HAVE_SSSE3 || HAVE_NEON -::testing::internal::ParamGenerator<AV1DistWtdCompAvgUpsampledParam> -BuildParams(distwtdcompavgupsampled_func filter) { - return ::testing::Combine(::testing::Values(filter), - ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); -} -#endif // HAVE_SSSE3 || HAVE_NEON - class AV1DistWtdCompAvgTest : public ::testing::TestWithParam<AV1DistWtdCompAvgParam> { public: @@ -205,135 +194,6 @@ class AV1DistWtdCompAvgTest GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DistWtdCompAvgTest); -class AV1DistWtdCompAvgUpsampledTest - : public ::testing::TestWithParam<AV1DistWtdCompAvgUpsampledParam> { - public: - ~AV1DistWtdCompAvgUpsampledTest() override = default; - void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); } - - protected: - void RunCheckOutput(distwtdcompavgupsampled_func test_impl) { - const int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(1); - - uint8_t pred8[kMaxSize * kMaxSize]; - uint8_t ref8[kMaxSize * kMaxSize]; - DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand8(); - ref8[i * w + j] = rnd_.Rand8(); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - int sub_x_q3, sub_y_q3; - int subpel_search; - for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; - ++subpel_search) { - for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) { - for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { - for (int ii = 0; ii < 2; ii++) { - for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; - dist_wtd_comp_params.bck_offset = - quant_dist_lookup_table[jj][1 - ii]; - - const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); - const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); - - aom_dist_wtd_comp_avg_upsampled_pred_c( - nullptr, nullptr, 0, 0, nullptr, output, - pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, - sub_y_q3, ref8 + offset_r * w + offset_c, in_w, - &dist_wtd_comp_params, subpel_search); - test_impl(nullptr, nullptr, 0, 0, nullptr, output2, - pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, - sub_y_q3, ref8 + offset_r * w + offset_c, in_w, - &dist_wtd_comp_params, subpel_search); - - for (int i = 0; i < in_h; ++i) { - for (int j = 0; j < in_w; ++j) { - int idx = i * in_w + j; - ASSERT_EQ(output[idx], output2[idx]) - << "Mismatch at unit tests for " - "AV1DistWtdCompAvgUpsampledTest\n" - << in_w << "x" << in_h << " Pixel mismatch at index " - << idx << " = (" << i << ", " << j - << "), sub pixel offset = (" << sub_y_q3 << ", " - << sub_x_q3 << ")"; - } - } - } - } - } - } - } - } - void RunSpeedTest(distwtdcompavgupsampled_func test_impl) { - const 
int w = kMaxSize, h = kMaxSize; - const int block_idx = GET_PARAM(1); - - uint8_t pred8[kMaxSize * kMaxSize]; - uint8_t ref8[kMaxSize * kMaxSize]; - DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); - - for (int i = 0; i < h; ++i) - for (int j = 0; j < w; ++j) { - pred8[i * w + j] = rnd_.Rand8(); - ref8[i * w + j] = rnd_.Rand8(); - } - const int in_w = block_size_wide[block_idx]; - const int in_h = block_size_high[block_idx]; - - DIST_WTD_COMP_PARAMS dist_wtd_comp_params; - dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; - - int sub_x_q3 = 0; - int sub_y_q3 = 0; - - const int num_loops = 1000000000 / (in_w + in_h); - aom_usec_timer timer; - aom_usec_timer_start(&timer); - int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. - - for (int i = 0; i < num_loops; ++i) - aom_dist_wtd_comp_avg_upsampled_pred_c( - nullptr, nullptr, 0, 0, nullptr, output, pred8, in_w, in_h, sub_x_q3, - sub_y_q3, ref8, in_w, &dist_wtd_comp_params, subpel_search); - - aom_usec_timer_mark(&timer); - const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h, - 1000.0 * elapsed_time / num_loops); - - aom_usec_timer timer1; - aom_usec_timer_start(&timer1); - - for (int i = 0; i < num_loops; ++i) - test_impl(nullptr, nullptr, 0, 0, nullptr, output2, pred8, in_w, in_h, - sub_x_q3, sub_y_q3, ref8, in_w, &dist_wtd_comp_params, - subpel_search); - - aom_usec_timer_mark(&timer1); - const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); - printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h, - 1000.0 * elapsed_time1 / num_loops); - } - - libaom_test::ACMRandom rnd_; -}; // class AV1DistWtdCompAvgUpsampledTest - -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DistWtdCompAvgUpsampledTest); - class DistWtdCompAvgTest : public ::testing::WithParamInterface<DistWtdCompAvgParam>, public ::testing::Test { @@ -790,26 +650,6 @@ INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DistWtdCompAvgTest, BuildParams(aom_dist_wtd_comp_avg_pred_ssse3)); #endif -TEST_P(AV1DistWtdCompAvgUpsampledTest, DISABLED_Speed) { - RunSpeedTest(GET_PARAM(0)); -} - -TEST_P(AV1DistWtdCompAvgUpsampledTest, CheckOutput) { - RunCheckOutput(GET_PARAM(0)); -} - -#if HAVE_SSSE3 -INSTANTIATE_TEST_SUITE_P( - SSSE3, AV1DistWtdCompAvgUpsampledTest, - BuildParams(aom_dist_wtd_comp_avg_upsampled_pred_ssse3)); -#endif - -#if HAVE_NEON -INSTANTIATE_TEST_SUITE_P( - NEON, AV1DistWtdCompAvgUpsampledTest, - BuildParams(aom_dist_wtd_comp_avg_upsampled_pred_neon)); -#endif // HAVE_NEON - TEST_P(DistWtdCompAvgTest, MaxRef) { FillConstant(reference_data_, reference_stride_, mask_); FillConstant(second_pred_, width_, 0); -- GitLab From 9d8efc487a160c53280f2b05185d5239197c6c3c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Wed, 21 Aug 2024 11:47:03 -0700 Subject: [PATCH 388/391] Update AUTHORS, CHANGELOG, and version for v3.10.0 Bug: 361339153 Change-Id: I660191db35bbc1a6ce2415f550f0c059b505b743 --- AUTHORS | 1 + CHANGELOG | 23 ++++++++++++++++++++++- CMakeLists.txt | 6 +++--- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/AUTHORS b/AUTHORS index 32fe11ab45..4907baf1eb 100644 --- a/AUTHORS +++ b/AUTHORS @@ -61,6 +61,7 @@ David Michael Barr <b@rr-dav.id.au> David Turner <david.turner@argondesign.com> Deb Mukherjee 
<debargha@google.com> Deepa K G <deepa.kg@ittiam.com> +Denis Nikitin <denik@google.com> Di Chen <chendixi@google.com> Diksha Singh <diksha.singh@ittiam.com> Dim Temp <dimtemp0@gmail.com> diff --git a/CHANGELOG b/CHANGELOG index eb4d0b1052..0c3769e65d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,28 @@ -yyyy-mm-dd v3.10.0 +2024-08-21 v3.10.0 + This release includes new codec interfaces, compression efficiency and + perceptual improvements, speedup and memory optimizations and many bug + fixes. This release is ABI compatible with the last release. + The definitions of the internal macros AOM_INLINE and AOM_FORCE_INLINE have been removed from the public header aom/aom_integer.h. + - New Features + * New codec controls: + * AV1E_SET_AUTO_TILES + * AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC + * AV1E_SET_POSTENCODE_DROP_RTC + * AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR + * New key-value pair for aom_codec_set_option(): + * "auto-tiles": equivalent to the new codec control + AV1E_SET_AUTO_TILES. + + - Deprecated Features + * Deprecated codec control: + * AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR: Use the new codec control + AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR instead. + * The sframe_mode field in the aom_codec_enc_cfg_t struct is not + implemented. + 2024-06-07 v3.8.3 This release includes several bug fixes. This release is ABI compatible with the last release. See diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f9a1b3958..b21b3f9245 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,9 +58,9 @@ endif() # passed to libtool. # # We set SO_FILE_VERSION = [c-a].a.r -set(LT_CURRENT 12) -set(LT_REVISION 1) -set(LT_AGE 9) +set(LT_CURRENT 13) +set(LT_REVISION 0) +set(LT_AGE 10) math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}") set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}") unset(LT_CURRENT) -- GitLab From 4b8391e46030cb58057ad186ccf7bbde81fbcaf7 Mon Sep 17 00:00:00 2001 From: Marco Paniconi <marpan@google.com> Date: Mon, 19 Aug 2024 11:19:15 -0700 Subject: [PATCH 389/391] rtc: Bugfix for active_maps with sb_size=128 The scene detection processes the frame over 64x64 blocks, so for the active/inactive check the parameters (sh, num_4x4) should always be those for 64x64 blocks, regardless of seq_params->sb_size. This was causing a quality and speed regression for active_maps with sb_size=128. Change-Id: I216f034f1e39601c811015694b07a3f5e8262399 (cherry picked from commit 35c90ab674cab2c61bc9dd616ec2aaca2ed2b898) --- av1/encoder/ratectrl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index f1891bfef7..0c6a008d2d 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c @@ -3044,11 +3044,13 @@ void av1_set_rtc_reference_structure_one_layer(AV1_COMP *cpi, int gf_update) { cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7); } +// Returns whether the 64x64 block is active or inactive: used +// by the scene detection, which is over 64x64 blocks.
static int set_block_is_active(unsigned char *const active_map_4x4, int mi_cols, - int mi_rows, int sbi_col, int sbi_row, int sh, - int num_4x4) { - int r = sbi_row << sh; - int c = sbi_col << sh; + int mi_rows, int sbi_col, int sbi_row) { + int num_4x4 = 16; + int r = sbi_row << 4; + int c = sbi_col << 4; const int row_max = AOMMIN(num_4x4, mi_rows - r); const int col_max = AOMMIN(num_4x4, mi_cols - c); // Active map is set for 16x16 blocks, so only need to @@ -3241,8 +3243,6 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; const int mi_cols = mi_params->mi_cols; const int mi_rows = mi_params->mi_rows; - int sh = (cm->seq_params->sb_size == BLOCK_128X128) ? 5 : 4; - int num_4x4 = (cm->seq_params->sb_size == BLOCK_128X128) ? 32 : 16; unsigned char *const active_map_4x4 = cpi->active_map.map; // Avoid bottom and right border. for (int sbi_row = 0; sbi_row < sb_rows - border; ++sbi_row) { @@ -3250,7 +3250,7 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, int block_is_active = 1; if (cpi->active_map.enabled && rc->percent_blocks_inactive > 0) { block_is_active = set_block_is_active(active_map_4x4, mi_cols, mi_rows, - sbi_col, sbi_row, sh, num_4x4); + sbi_col, sbi_row); } if (block_is_active) { tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, -- GitLab From a2e1a6cf4786af9060d1f40b8d169b76a841226d Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Mon, 26 Aug 2024 11:50:20 -0700 Subject: [PATCH 390/391] Update CHANGELOG for libaom v3.10.0 Bug: 361339153 Change-Id: Icca331d511c73185067453a9a43a09848a94ef7c --- CHANGELOG | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 0c3769e65d..5599845b15 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -2024-08-21 v3.10.0 +2024-08-26 v3.10.0 This release includes new codec interfaces, compression efficiency and perceptual improvements, speedup and memory optimizations and many bug fixes. This release is ABI compatible with the last release. @@ -10,7 +10,7 @@ * New codec controls: * AV1E_SET_AUTO_TILES * AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC - * AV1E_SET_POSTENCODE_DROP_RTC + * AV1E_SET_POSTENCODE_DROP_RTC: Post encode frame drop feature. * AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR * New key-value pair for aom_codec_set_option(): * "auto-tiles": equivalent to the new codec control @@ -23,6 +23,28 @@ * The sframe_mode field in the aom_codec_enc_cfg_t struct is not implemented. + - Compression Efficiency Improvements + * BD-rate gain of 0.7 - 1.3% (by enabling global motion tool) for + speed 5 and speed 6 with ~5% encode time increase. + * RTC speed 11 video: ~3-5% BD-rate gain for VGA and QVGA. + + - Perceptual Quality Improvements + * RTC quality improvements for slide changes and scrolling content. + + - Speedup and Memory Optimizations + * RTC screen content speedups: + * ~2x speedup for high motion content for speed 11. + * ~2x speedup on key frame coding for speed >= 10. + + - Other Improvements + * Reduce bit rate overshoot on slide content. + + - Bug Fixes + * rtc: Bug fix for active_maps with sb_size=128. + * b:343429036: rtc: Fix source_sad setting near boundary. + * Fix to QP for temporal enhancement after key frame. + * b:343429192: rtc: Condition QP adjustment on rc->q_1/2_frame > 0. + 2024-06-07 v3.8.3 This release includes several bug fixes. This release is ABI compatible with the last release. 
See -- GitLab From c2fe6bf370f7c14fbaf12884b76244a3cfd7c5fc Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang <wtc@google.com> Date: Tue, 27 Aug 2024 11:10:42 -0700 Subject: [PATCH 391/391] Update CHANGELOG with Arm optimizations in v3.10.0 Bug: 361339153 Change-Id: I615706c3e84c5ed4c51df8eeddc7ba3bf93089b7 --- CHANGELOG | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 5599845b15..8ffa7d22e3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -2024-08-26 v3.10.0 +2024-08-27 v3.10.0 This release includes new codec interfaces, compression efficiency and perceptual improvements, speedup and memory optimizations and many bug fixes. This release is ABI compatible with the last release. @@ -35,6 +35,36 @@ * RTC screen content speedups: * ~2x speedup for high motion content for speed 11. * ~2x speedup on key frame coding for speed >= 10. + * Arm: Significant uplifts in speed in this release (vs v3.9.1) have + come from tuning the various convolutions according to filter size + (doing 8-tap when only 2-tap is required is inefficient) and also + deploying Armv8.6 USMMLA instructions in 6-tap and 12-tap standard + bitdepth convolutions. + * Standard bitdepth RTC: + * speed 5: +5% + * speed 6: +4% + * speed 7: +5% + * speed 8: +4% + * speed 9: +6% + * speed 10: +6% + * Standard bitdepth VoD: + * speed 0: +9% + * speed 1: +12% + * speed 2: +9% + * speed 3: +3% + * speed 4: +3% + * speed 5: -9% (expected due to global motion changes) + * speed 6: -3% (expected due to global motion changes) + * High bitdepth VoD: + * speed 0: +4% + * speed 1: +19% + * speed 2: +23% + * speed 3: +1% + * speed 4: +1% + * speed 5: -8% (expected due to global motion changes) + * speed 6: -3% (expected due to global motion changes) + * Standard bitdepth 2x1 horizontal super-resolution/scaling + encoding: +101% - Other Improvements * Reduce bit rate overshoot on slide content. -- GitLab
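The filter-size tuning described above comes down to dispatching on the effective number of filter taps instead of always running the widest kernel. A hedged scalar sketch of the idea (helper names are hypothetical, not libaom's actual dispatch; FILTER_BITS == 7 matches libaom's interpolation filters, which sum to 128):

  #include <stdint.h>
  #include <stdio.h>

  #define FILTER_BITS 7 /* interpolation filters sum to 1 << FILTER_BITS */

  static uint8_t clip_u8(int v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  /* An 8-tap filter whose outer coefficients are zero is effectively a
     6-, 4- or 2-tap filter; detecting that allows a much cheaper kernel. */
  static int effective_taps(const int16_t f[8]) {
    if (f[0] != 0 || f[7] != 0) return 8;
    if (f[1] != 0 || f[6] != 0) return 6;
    if (f[2] != 0 || f[5] != 0) return 4;
    return 2;
  }

  /* Horizontal convolution of one row; src points at the pixel under tap 0,
     and the known-zero outer taps are skipped entirely. */
  static void convolve_row(const uint8_t *src, uint8_t *dst, int w,
                           const int16_t f[8]) {
    const int taps = effective_taps(f);
    const int off = (8 - taps) / 2;
    for (int x = 0; x < w; x++) {
      int sum = 0;
      for (int k = 0; k < taps; k++) sum += src[x + off + k] * f[off + k];
      dst[x] = clip_u8((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
    }
  }

  int main(void) {
    const uint8_t src[12] = { 0, 0, 0, 10, 20, 30, 40, 50, 0, 0, 0, 0 };
    const int16_t bilinear[8] = { 0, 0, 0, 96, 32, 0, 0, 0 }; /* 2-tap */
    uint8_t dst[4];
    convolve_row(src, dst, 4, bilinear);
    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); /* 13 23 33 43 */
    return 0;
  }

In SIMD the saving is direct: a 2-tap path issues a quarter of the multiplies of an 8-tap path, and the 6-tap and 12-tap gains noted above additionally come from packing several such multiply-accumulates into single Armv8.6 USMMLA instructions.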