From a668e81971a609cf7a2fbc9399388b013a23fd59 Mon Sep 17 00:00:00 2001
From: CrazyCat
Date: Wed, 19 Mar 2025 19:49:50 +0200
Subject: [PATCH] media: pci/tbscapture2: Optimized for X86 SIMD.

---
 drivers/media/pci/tbscapture2/Makefile        |   40 +-
 drivers/media/pci/tbscapture2/compare_msa.c   |   97 -
 drivers/media/pci/tbscapture2/compare_neon.c  |   96 -
 .../media/pci/tbscapture2/compare_neon64.c    |  223 -
 drivers/media/pci/tbscapture2/compare_win.c   |  241 -
 .../include/libyuv/loongson_intrinsics.h      | 1949 -----
 .../tbscapture2/include/libyuv/macros_msa.h   |  244 -
 drivers/media/pci/tbscapture2/other.c         |   30 +-
 drivers/media/pci/tbscapture2/rotate_lsx.c    |  233 -
 drivers/media/pci/tbscapture2/rotate_msa.c    |  240 -
 drivers/media/pci/tbscapture2/rotate_neon.c   |  219 -
 drivers/media/pci/tbscapture2/rotate_neon64.c |  273 -
 drivers/media/pci/tbscapture2/rotate_sme.c    |  174 -
 drivers/media/pci/tbscapture2/rotate_win.c    |  253 -
 drivers/media/pci/tbscapture2/row_lasx.c      | 2304 ------
 drivers/media/pci/tbscapture2/row_lsx.c       | 2987 --------
 drivers/media/pci/tbscapture2/row_msa.c       | 3597 ---------
 drivers/media/pci/tbscapture2/row_neon.c      | 3981 ----------
 drivers/media/pci/tbscapture2/row_neon64.c    | 5364 --------------
 drivers/media/pci/tbscapture2/row_rvv.c       | 2599 -------
 drivers/media/pci/tbscapture2/row_sve.c       | 1409 ----
 drivers/media/pci/tbscapture2/row_win.c       | 6440 -----------------
 drivers/media/pci/tbscapture2/scale_lsx.c     |  739 --
 drivers/media/pci/tbscapture2/scale_msa.c     |  949 ---
 drivers/media/pci/tbscapture2/scale_neon.c    | 1449 ----
 drivers/media/pci/tbscapture2/scale_neon64.c  | 1576 ----
 drivers/media/pci/tbscapture2/scale_rvv.c     | 1921 -----
 drivers/media/pci/tbscapture2/scale_win.c     | 1392 ----
 28 files changed, 24 insertions(+), 40995 deletions(-)
 delete mode 100644 drivers/media/pci/tbscapture2/compare_msa.c
 delete mode 100644 drivers/media/pci/tbscapture2/compare_neon.c
 delete mode 100644 drivers/media/pci/tbscapture2/compare_neon64.c
 delete mode 100644 drivers/media/pci/tbscapture2/compare_win.c
 delete mode 100644 drivers/media/pci/tbscapture2/include/libyuv/loongson_intrinsics.h
 delete mode 100644 drivers/media/pci/tbscapture2/include/libyuv/macros_msa.h
 delete mode 100644 drivers/media/pci/tbscapture2/rotate_lsx.c
 delete mode 100644 drivers/media/pci/tbscapture2/rotate_msa.c
 delete mode 100644 drivers/media/pci/tbscapture2/rotate_neon.c
 delete mode 100644 drivers/media/pci/tbscapture2/rotate_neon64.c
 delete mode 100644 drivers/media/pci/tbscapture2/rotate_sme.c
 delete mode 100644 drivers/media/pci/tbscapture2/rotate_win.c
 delete mode 100644 drivers/media/pci/tbscapture2/row_lasx.c
 delete mode 100644 drivers/media/pci/tbscapture2/row_lsx.c
 delete mode 100644 drivers/media/pci/tbscapture2/row_msa.c
 delete mode 100644 drivers/media/pci/tbscapture2/row_neon.c
 delete mode 100644 drivers/media/pci/tbscapture2/row_neon64.c
 delete mode 100644 drivers/media/pci/tbscapture2/row_rvv.c
 delete mode 100644 drivers/media/pci/tbscapture2/row_sve.c
 delete mode 100644 drivers/media/pci/tbscapture2/row_win.c
 delete mode 100644 drivers/media/pci/tbscapture2/scale_lsx.c
 delete mode 100644 drivers/media/pci/tbscapture2/scale_msa.c
 delete mode 100644 drivers/media/pci/tbscapture2/scale_neon.c
 delete mode 100644 drivers/media/pci/tbscapture2/scale_neon64.c
 delete mode 100644 drivers/media/pci/tbscapture2/scale_rvv.c
 delete mode 100644 drivers/media/pci/tbscapture2/scale_win.c

diff --git a/drivers/media/pci/tbscapture2/Makefile b/drivers/media/pci/tbscapture2/Makefile
index a05ef416411c..0f9d1b63a0f5 100644
--- a/drivers/media/pci/tbscapture2/Makefile
+++ b/drivers/media/pci/tbscapture2/Makefile
@@ -1,45 +1,35 @@
-#EXTRA_CFLAGS += -I drivers/media/dvb-core
-#EXTRA_CFLAGS += -I drivers/media/pci/tbscapture/include
-#EXTRA_CFLAGS += -I /lib/modules/`uname -r`/build/include
-#EXTRA_CFLAGS += -I /lib/modules/`uname -r`/build/include/linux
-
 ccflags-y += -I$(srctree)/drivers/media/pci/tbscapture2/include
 ccflags-y += -I$(srctree)/drivers/media/pci/tbscapture2/include/libyuv
 ccflags-y += -I$(srctree)/include/linux
-EXTRA_CFLAGS += -mhard-float -msse -msse2
+CFLAGS_X86 = -mhard-float -msse -msse2
+
+CFLAGS_compare.o += $(CFLAGS_X86)
+CFLAGS_compare_gcc.o += $(CFLAGS_X86)
+CFLAGS_rotate_gcc.o += $(CFLAGS_X86)
+CFLAGS_row_common.o += $(CFLAGS_X86)
+CFLAGS_row_gcc.o += $(CFLAGS_X86)
+CFLAGS_scale_gcc.o += $(CFLAGS_X86)
+CFLAGS_planar_functions.o += $(CFLAGS_X86)
 
-#CC_FLAGS_FPU := -mhard-float -msse -msse2
 
 obj-$(CONFIG_TBS_PCIE2_CAP) += tbs_pcie2-cap.o
 tbs_pcie2-cap-objs += tbs_pcie2.o \
 other.o \
 compare.o \
 compare_common.o \
 compare_gcc.o \
-compare_msa.o \
-compare_neon.o \
-compare_neon64.o \
 convert_from.o \
 convert_from_argb.o \
 convert_jpeg.o \
 convert_to_argb.o \
-compare_win.o \
 convert.o \
 convert_argb.o \
 rotate_argb.o \
 rotate_common.o \
 rotate_gcc.o \
-rotate_lsx.o \
-rotate_msa.o \
-rotate_neon.o \
-rotate_neon64.o \
-rotate_sme.o \
-rotate_win.o \
 row_any.o \
 row_common.o \
 row_gcc.o \
-row_lasx.o \
-row_lsx.o \
 mjpeg_validate.o \
 planar_functions.o \
 rotate.o \
@@ -47,23 +37,11 @@ rotate_any.o \
 convert_to_i420.o \
 cpu_id.o \
 mjpeg_decoder.o \
-row_msa.o \
-row_neon.o \
-row_neon64.o \
-row_rvv.o \
-row_sve.o \
-row_win.o \
 scale.o \
 scale_any.o \
 scale_argb.o \
 scale_common.o \
 scale_gcc.o \
-scale_lsx.o \
-scale_msa.o \
-scale_neon.o \
-scale_neon64.o \
 scale_rgb.o \
-scale_rvv.o \
 scale_uv.o \
-scale_win.o \
 video_common.o
diff --git a/drivers/media/pci/tbscapture2/compare_msa.c b/drivers/media/pci/tbscapture2/compare_msa.c
deleted file mode 100644
index 1ff0253ca322..000000000000
--- a/drivers/media/pci/tbscapture2/compare_msa.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright 2017 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */ - -#include "basic_types.h" - -#include "compare_row.h" -#include "row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -uint32_t HammingDistance_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - int i; - v16u8 src0, src1, src2, src3; - v2i64 vec0 = {0}, vec1 = {0}; - - for (i = 0; i < count; i += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); - src0 ^= src2; - src1 ^= src3; - vec0 += __msa_pcnt_d((v2i64)src0); - vec1 += __msa_pcnt_d((v2i64)src1); - src_a += 32; - src_b += 32; - } - - vec0 += vec1; - diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0); - diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2); - return diff; -} - -uint32_t SumSquareError_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - int i; - v16u8 src0, src1, src2, src3; - v8i16 vec0, vec1, vec2, vec3; - v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; - v2i64 tmp0; - - for (i = 0; i < count; i += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); - vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); - reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); - reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); - reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); - reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); - src_a += 32; - src_b += 32; - } - - reg0 += reg1; - reg2 += reg3; - reg0 += reg2; - tmp0 = __msa_hadd_s_d(reg0, reg0); - sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0); - sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2); - return sse; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/drivers/media/pci/tbscapture2/compare_neon.c b/drivers/media/pci/tbscapture2/compare_neon.c deleted file mode 100644 index 66988a4349b6..000000000000 --- a/drivers/media/pci/tbscapture2/compare_neon.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "basic_types.h" - -#include "compare_row.h" -#include "row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -// 256 bits at a time -// uses short accumulator which restricts count to 131 KB -uint32_t HammingDistance_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff; - - asm volatile ( - "vmov.u16 q4, #0 \n" // accumulator - - "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" - "vld1.8 {q2, q3}, [%1]! \n" - "veor.32 q0, q0, q2 \n" - "veor.32 q1, q1, q3 \n" - "vcnt.i8 q0, q0 \n" - "vcnt.i8 q1, q1 \n" - "subs %2, %2, #32 \n" - "vadd.u8 q0, q0, q1 \n" // 16 byte counts - "vpadal.u8 q4, q0 \n" // 8 shorts - "bgt 1b \n" - - "vpaddl.u16 q0, q4 \n" // 4 ints - "vpadd.u32 d0, d0, d1 \n" - "vpadd.u32 d0, d0, d0 \n" - "vmov.32 %3, d0[0] \n" - - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) - : - : "cc", "q0", "q1", "q2", "q3", "q4"); - return diff; -} - -uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile ( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" - - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" - - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); - return sse; -} - -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/compare_neon64.c b/drivers/media/pci/tbscapture2/compare_neon64.c deleted file mode 100644 index a76881b5158c..000000000000 --- a/drivers/media/pci/tbscapture2/compare_neon64.c +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "basic_types.h" - -#include "compare_row.h" -#include "row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -// 256 bits at a time -// uses short accumulator which restricts count to 131 KB -uint32_t HammingDistance_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff; - asm volatile ( - "movi v4.8h, #0 \n" - - "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" - "eor v0.16b, v0.16b, v2.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "eor v1.16b, v1.16b, v3.16b \n" - "cnt v0.16b, v0.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "cnt v1.16b, v1.16b \n" - "subs %w2, %w2, #32 \n" - "add v0.16b, v0.16b, v1.16b \n" - "uadalp v4.8h, v0.16b \n" - "b.gt 1b \n" - - "uaddlv s4, v4.8h \n" - "fmov %w3, s4 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4"); - return diff; -} - -uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile ( - "movi v16.16b, #0 \n" - "movi v17.16b, #0 \n" - "movi v18.16b, #0 \n" - "movi v19.16b, #0 \n" - - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "prfm pldl1keep, [%1, 448] \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" - - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); - return sse; -} - -static const uvec32 kDjb2Multiplicands[] = { - {0x0c3525e1, // 33^15 - 0xa3476dc1, // 33^14 - 0x3b4039a1, // 33^13 - 0x4f5f0981}, // 33^12 - {0x30f35d61, // 33^11 - 0x855cb541, // 33^10 - 0x040a9121, // 33^9 - 0x747c7101}, // 33^8 - {0xec41d4e1, // 33^7 - 0x4cfa3cc1, // 33^6 - 0x025528a1, // 33^5 - 0x00121881}, // 33^4 - {0x00008c61, // 33^3 - 0x00000441, // 33^2 - 0x00000021, // 33^1 - 0x00000001}, // 33^0 -}; - -static const uvec32 kDjb2WidenIndices[] = { - {0xffffff00U, 0xffffff01U, 0xffffff02U, 0xffffff03U}, - {0xffffff04U, 0xffffff05U, 0xffffff06U, 0xffffff07U}, - {0xffffff08U, 0xffffff09U, 0xffffff0aU, 0xffffff0bU}, - {0xffffff0cU, 0xffffff0dU, 0xffffff0eU, 0xffffff0fU}, -}; - -uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) { - uint32_t hash = seed; - const uint32_t c16 = 0x92d9e201; // 33^16 - uint32_t tmp, tmp2; - asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" - "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n" - - // count is always a multiple of 16. - // maintain two accumulators, reduce and then final sum in scalar since - // this has better performance on little cores. 
- "1: \n" - "ldr q0, [%[src]], #16 \n" - "subs %w[count], %w[count], #16 \n" - "tbl v3.16b, {v0.16b}, v19.16b \n" - "tbl v2.16b, {v0.16b}, v18.16b \n" - "tbl v1.16b, {v0.16b}, v17.16b \n" - "tbl v0.16b, {v0.16b}, v16.16b \n" - "mul v3.4s, v3.4s, v7.4s \n" - "mul v2.4s, v2.4s, v6.4s \n" - "mla v3.4s, v1.4s, v5.4s \n" - "mla v2.4s, v0.4s, v4.4s \n" - "addv s1, v3.4s \n" - "addv s0, v2.4s \n" - "fmov %w[tmp2], s1 \n" - "fmov %w[tmp], s0 \n" - "add %w[tmp], %w[tmp], %w[tmp2] \n" - "madd %w[hash], %w[hash], %w[c16], %w[tmp] \n" - "b.gt 1b \n" - : [hash] "+r"(hash), // %[hash] - [count] "+r"(count), // %[count] - [tmp] "=&r"(tmp), // %[tmp] - [tmp2] "=&r"(tmp2) // %[tmp2] - : [src] "r"(src), // %[src] - [kMuls] "r"(kDjb2Multiplicands), // %[kMuls] - [kIdx] "r"(kDjb2WidenIndices), // %[kIdx] - [c16] "r"(c16) // %[c16] - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19"); - return hash; -} - -uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff; - asm volatile ( - "movi v4.4s, #0 \n" - "movi v5.4s, #0 \n" - "movi v6.16b, #1 \n" - - "1: \n" - "ldp q0, q1, [%0], #32 \n" - "ldp q2, q3, [%1], #32 \n" - "eor v0.16b, v0.16b, v2.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "eor v1.16b, v1.16b, v3.16b \n" - "cnt v0.16b, v0.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "cnt v1.16b, v1.16b \n" - "subs %w2, %w2, #32 \n" - "udot v4.4s, v0.16b, v6.16b \n" - "udot v5.4s, v1.16b, v6.16b \n" - "b.gt 1b \n" - - "add v0.4s, v4.4s, v5.4s \n" - "addv s0, v0.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); - return diff; -} - -uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - // count is guaranteed to be a multiple of 32. - uint32_t sse; - asm volatile ( - "movi v4.4s, #0 \n" - "movi v5.4s, #0 \n" - - "1: \n" - "ldp q0, q2, [%0], #32 \n" - "ldp q1, q3, [%1], #32 \n" - "subs %w2, %w2, #32 \n" - "uabd v0.16b, v0.16b, v1.16b \n" - "uabd v1.16b, v2.16b, v3.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "udot v4.4s, v0.16b, v0.16b \n" - "udot v5.4s, v1.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "b.gt 1b \n" - - "add v0.4s, v4.4s, v5.4s \n" - "addv s0, v0.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5"); - return sse; -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/compare_win.c b/drivers/media/pci/tbscapture2/compare_win.c deleted file mode 100644 index 98ef1e6adf2b..000000000000 --- a/drivers/media/pci/tbscapture2/compare_win.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "basic_types.h" - -#include "compare_row.h" -#include "row.h" - -#if defined(_MSC_VER) -#include // For __popcnt -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - int i; - for (i = 0; i < count - 3; i += 4) { - uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT - src_a += 4; - src_b += 4; - diff += __popcnt(x); - } - return diff; -} - -__declspec(naked) uint32_t - SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - pxor xmm0, xmm0 - pxor xmm5, xmm5 - - wloop: - movdqu xmm1, [eax] - lea eax, [eax + 16] - movdqu xmm2, [edx] - lea edx, [edx + 16] - movdqa xmm3, xmm1 // abs trick - psubusb xmm1, xmm2 - psubusb xmm2, xmm3 - por xmm1, xmm2 - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm0, xmm1 - paddd xmm0, xmm2 - sub ecx, 16 - jg wloop - - pshufd xmm1, xmm0, 0xee - paddd xmm0, xmm1 - pshufd xmm1, xmm0, 0x01 - paddd xmm0, xmm1 - movd eax, xmm0 - ret - } -} - -#ifdef HAS_SUMSQUAREERROR_AVX2 -// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. -#pragma warning(disable : 4752) -__declspec(naked) uint32_t - SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - vpxor ymm0, ymm0, ymm0 // sum - vpxor ymm5, ymm5, ymm5 // constant 0 for unpck - sub edx, eax - - wloop: - vmovdqu ymm1, [eax] - vmovdqu ymm2, [eax + edx] - lea eax, [eax + 32] - vpsubusb ymm3, ymm1, ymm2 // abs difference trick - vpsubusb ymm2, ymm2, ymm1 - vpor ymm1, ymm2, ymm3 - vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. - vpunpckhbw ymm1, ymm1, ymm5 - vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. - vpmaddwd ymm1, ymm1, ymm1 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm0, ymm0, ymm2 - sub ecx, 32 - jg wloop - - vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. - vpaddd ymm0, ymm0, ymm1 - vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. - vpaddd ymm0, ymm0, ymm1 - vpermq ymm1, ymm0, 0x02 // high + low lane. 
- vpaddd ymm0, ymm0, ymm1 - vmovd eax, xmm0 - vzeroupper - ret - } -} -#endif // HAS_SUMSQUAREERROR_AVX2 - -uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 -uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 -}; -uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 -}; -uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 -}; -uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 -}; - -__declspec(naked) uint32_t - HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - - pxor xmm7, xmm7 // constant 0 for unpck - movdqa xmm6, xmmword ptr kHash16x33 - - wloop: - movdqu xmm1, [eax] // src[0-15] - lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 - movdqa xmm5, xmmword ptr kHashMul0 - movdqa xmm2, xmm1 - punpcklbw xmm2, xmm7 // src[0-7] - movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] - pmulld xmm3, xmm5 - movdqa xmm5, xmmword ptr kHashMul1 - movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] - pmulld xmm4, xmm5 - movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] - movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] - pmulld xmm2, xmm5 - movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] - pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results - paddd xmm1, xmm2 - paddd xmm1, xmm3 - - pshufd xmm2, xmm1, 0x0e // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0x01 - paddd xmm1, xmm2 - paddd xmm0, xmm1 - sub ecx, 16 - jg wloop - - movd eax, xmm0 // return hash - ret - } -} - -// Visual C 2012 required for AVX2. -#ifdef HAS_HASHDJB2_AVX2 -__declspec(naked) uint32_t - HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - vmovd xmm0, [esp + 12] // seed - - wloop: - vpmovzxbd xmm3, [eax] // src[0-3] - vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 - vpmovzxbd xmm4, [eax + 4] // src[4-7] - vpmulld xmm3, xmm3, xmmword ptr kHashMul0 - vpmovzxbd xmm2, [eax + 8] // src[8-11] - vpmulld xmm4, xmm4, xmmword ptr kHashMul1 - vpmovzxbd xmm1, [eax + 12] // src[12-15] - vpmulld xmm2, xmm2, xmmword ptr kHashMul2 - lea eax, [eax + 16] - vpmulld xmm1, xmm1, xmmword ptr kHashMul3 - vpaddd xmm3, xmm3, xmm4 // add 16 results - vpaddd xmm1, xmm1, xmm2 - vpaddd xmm1, xmm1, xmm3 - vpshufd xmm2, xmm1, 0x0e // upper 2 dwords - vpaddd xmm1, xmm1,xmm2 - vpshufd xmm2, xmm1, 0x01 - vpaddd xmm1, xmm1, xmm2 - vpaddd xmm0, xmm0, xmm1 - sub ecx, 16 - jg wloop - - vmovd eax, xmm0 // return hash - vzeroupper - ret - } -} -#endif // HAS_HASHDJB2_AVX2 - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/include/libyuv/loongson_intrinsics.h b/drivers/media/pci/tbscapture2/include/libyuv/loongson_intrinsics.h deleted file mode 100644 index 1d613defb1d0..000000000000 --- a/drivers/media/pci/tbscapture2/include/libyuv/loongson_intrinsics.h +++ /dev/null @@ -1,1949 +0,0 @@ -/* - * Copyright 2022 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H -#define INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H - -/* - * Copyright (c) 2022 Loongson Technology Corporation Limited - * All rights reserved. - * Contributed by Shiyou Yin - * Xiwei Gu - * Lu Wang - * - * This file is a header file for loongarch builtin extension. - * - */ - -#ifndef LOONGSON_INTRINSICS_H -#define LOONGSON_INTRINSICS_H - -/** - * MAJOR version: Macro usage changes. - * MINOR version: Add new functions, or bug fixes. - * MICRO version: Comment changes or implementation changes. - */ -#define LSOM_VERSION_MAJOR 1 -#define LSOM_VERSION_MINOR 1 -#define LSOM_VERSION_MICRO 0 - -#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \ - { \ - _OUT0 = _INS(_IN0); \ - _OUT1 = _INS(_IN1); \ - } - -#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \ - { \ - _OUT0 = _INS(_IN0, _IN1); \ - _OUT1 = _INS(_IN2, _IN3); \ - } - -#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \ - { \ - _OUT0 = _INS(_IN0, _IN1, _IN2); \ - _OUT1 = _INS(_IN3, _IN4, _IN5); \ - } - -#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \ - { \ - DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \ - DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \ - } - -#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \ - _OUT1, _OUT2, _OUT3) \ - { \ - DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \ - DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \ - } - -#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \ - _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \ - { \ - DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \ - DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \ - } - -#ifdef __loongarch_sx -#include -/* - * ============================================================================= - * Description : Dot product & addition of byte vector elements - * Arguments : Inputs - in_c, in_h, in_l - * Outputs - out - * Return Type - halfword - * Details : Signed byte elements from in_h are multiplied by - * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * Then the results plus to signed half-word elements from in_c. - * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) - * in_c : 1,2,3,4, 1,2,3,4 - * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 - * out : 23,40,41,26, 23,40,41,26 - * ============================================================================= - */ -static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, - __m128i in_h, - __m128i in_l) { - __m128i out; - - out = __lsx_vmaddwev_h_b(in_c, in_h, in_l); - out = __lsx_vmaddwod_h_b(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product & addition of byte vector elements - * Arguments : Inputs - in_c, in_h, in_l - * Outputs - out - * Return Type - halfword - * Details : Unsigned byte elements from in_h are multiplied by - * unsigned byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * The results plus to signed half-word elements from in_c. 
- * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l) - * in_c : 1,2,3,4, 1,2,3,4 - * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 - * out : 23,40,41,26, 23,40,41,26 - * ============================================================================= - */ -static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, - __m128i in_h, - __m128i in_l) { - __m128i out; - - out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l); - out = __lsx_vmaddwod_h_bu(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product & addition of byte vector elements - * Arguments : Inputs - in_c, in_h, in_l - * Outputs - out - * Return Type - halfword - * Details : Unsigned byte elements from in_h are multiplied by - * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * The results plus to signed half-word elements from in_c. - * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l) - * in_c : 1,1,1,1, 1,1,1,1 - * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8 - * out : -4,-24,-60,-112, 6,26,62,114 - * ============================================================================= - */ -static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, - __m128i in_h, - __m128i in_l) { - __m128i out; - - out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l); - out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product & addition of half-word vector elements - * Arguments : Inputs - in_c, in_h, in_l - * Outputs - out - * Return Type - __m128i - * Details : Signed half-word elements from in_h are multiplied by - * signed half-word elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * Then the results plus to signed word elements from in_c. - * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) - * in_c : 1,2,3,4 - * in_h : 1,2,3,4, 5,6,7,8 - * in_l : 8,7,6,5, 4,3,2,1 - * out : 23,40,41,26 - * ============================================================================= - */ -static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, - __m128i in_h, - __m128i in_l) { - __m128i out; - - out = __lsx_vmaddwev_w_h(in_c, in_h, in_l); - out = __lsx_vmaddwod_w_h(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of byte vector elements - * Arguments : Inputs - in_h, in_l - * Outputs - out - * Return Type - halfword - * Details : Signed byte elements from in_h are multiplied by - * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. 
- * Example : out = __lsx_vdp2_h_b(in_h, in_l) - * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 - * out : 22,38,38,22, 22,38,38,22 - * ============================================================================= - */ -static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) { - __m128i out; - - out = __lsx_vmulwev_h_b(in_h, in_l); - out = __lsx_vmaddwod_h_b(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of byte vector elements - * Arguments : Inputs - in_h, in_l - * Outputs - out - * Return Type - halfword - * Details : Unsigned byte elements from in_h are multiplied by - * unsigned byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * Example : out = __lsx_vdp2_h_bu(in_h, in_l) - * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 - * out : 22,38,38,22, 22,38,38,22 - * ============================================================================= - */ -static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { - __m128i out; - - out = __lsx_vmulwev_h_bu(in_h, in_l); - out = __lsx_vmaddwod_h_bu(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of byte vector elements - * Arguments : Inputs - in_h, in_l - * Outputs - out - * Return Type - halfword - * Details : Unsigned byte elements from in_h are multiplied by - * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l) - * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1 - * out : 22,38,38,22, 22,38,38,6 - * ============================================================================= - */ -static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { - __m128i out; - - out = __lsx_vmulwev_h_bu_b(in_h, in_l); - out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of byte vector elements - * Arguments : Inputs - in_h, in_l - * Outputs - out - * Return Type - halfword - * Details : Signed byte elements from in_h are multiplied by - * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * Example : out = __lsx_vdp2_w_h(in_h, in_l) - * in_h : 1,2,3,4, 5,6,7,8 - * in_l : 8,7,6,5, 4,3,2,1 - * out : 22,38,38,22 - * ============================================================================= - */ -static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { - __m128i out; - - out = __lsx_vmulwev_w_h(in_h, in_l); - out = __lsx_vmaddwod_w_h(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Clip all halfword elements of input vector between min & max - * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? 
(max) : - * (_in)) - * Arguments : Inputs - _in (input vector) - * - min (min threshold) - * - max (max threshold) - * Outputs - out (output vector with clipped elements) - * Return Type - signed halfword - * Example : out = __lsx_vclip_h(_in) - * _in : -8,2,280,249, -8,255,280,249 - * min : 1,1,1,1, 1,1,1,1 - * max : 9,9,9,9, 9,9,9,9 - * out : 1,2,9,9, 1,9,9,9 - * ============================================================================= - */ -static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) { - __m128i out; - - out = __lsx_vmax_h(min, _in); - out = __lsx_vmin_h(max, out); - return out; -} - -/* - * ============================================================================= - * Description : Set each element of vector between 0 and 255 - * Arguments : Inputs - _in - * Outputs - out - * Return Type - halfword - * Details : Signed byte elements from _in are clamped between 0 and 255. - * Example : out = __lsx_vclip255_h(_in) - * _in : -8,255,280,249, -8,255,280,249 - * out : 0,255,255,249, 0,255,255,249 - * ============================================================================= - */ -static inline __m128i __lsx_vclip255_h(__m128i _in) { - __m128i out; - - out = __lsx_vmaxi_h(_in, 0); - out = __lsx_vsat_hu(out, 7); - return out; -} - -/* - * ============================================================================= - * Description : Set each element of vector between 0 and 255 - * Arguments : Inputs - _in - * Outputs - out - * Return Type - word - * Details : Signed byte elements from _in are clamped between 0 and 255. - * Example : out = __lsx_vclip255_w(_in) - * _in : -8,255,280,249 - * out : 0,255,255,249 - * ============================================================================= - */ -static inline __m128i __lsx_vclip255_w(__m128i _in) { - __m128i out; - - out = __lsx_vmaxi_w(_in, 0); - out = __lsx_vsat_wu(out, 7); - return out; -} - -/* - * ============================================================================= - * Description : Swap two variables - * Arguments : Inputs - _in0, _in1 - * Outputs - _in0, _in1 (in-place) - * Details : Swapping of two input variables using xor - * Example : LSX_SWAP(_in0, _in1) - * _in0 : 1,2,3,4 - * _in1 : 5,6,7,8 - * _in0(out) : 5,6,7,8 - * _in1(out) : 1,2,3,4 - * ============================================================================= - */ -#define LSX_SWAP(_in0, _in1) \ - { \ - _in0 = __lsx_vxor_v(_in0, _in1); \ - _in1 = __lsx_vxor_v(_in0, _in1); \ - _in0 = __lsx_vxor_v(_in0, _in1); \ - } - -/* - * ============================================================================= - * Description : Transpose 4x4 block with word elements in vectors - * Arguments : Inputs - in0, in1, in2, in3 - * Outputs - out0, out1, out2, out3 - * Details : - * Example : - * 1, 2, 3, 4 1, 5, 9,13 - * 5, 6, 7, 8 to 2, 6,10,14 - * 9,10,11,12 =====> 3, 7,11,15 - * 13,14,15,16 4, 8,12,16 - * ============================================================================= - */ -#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ - __m128i _t0, _t1, _t2, _t3; \ - \ - _t0 = __lsx_vilvl_w(_in1, _in0); \ - _t1 = __lsx_vilvh_w(_in1, _in0); \ - _t2 = __lsx_vilvl_w(_in3, _in2); \ - _t3 = __lsx_vilvh_w(_in3, _in2); \ - _out0 = __lsx_vilvl_d(_t2, _t0); \ - _out1 = __lsx_vilvh_d(_t2, _t0); \ - _out2 = __lsx_vilvl_d(_t3, _t1); \ - _out3 = __lsx_vilvh_d(_t3, _t1); \ - } - -/* - * ============================================================================= - * Description : Transpose 8x8 block with byte 
elements in vectors - * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 - * Details : The rows of the matrix become columns, and the columns - * become rows. - * Example : LSX_TRANSPOSE8x8_B - * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00 - * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00 - * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00 - * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00 - * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00 - * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00 - * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00 - * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00 - * - * _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 - * _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 - * _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 - * _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 - * _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00 - * _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00 - * _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00 - * _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00 - * ============================================================================= - */ -#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - __m128i zero = {0}; \ - __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \ - __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ - \ - _t0 = __lsx_vilvl_b(_in2, _in0); \ - _t1 = __lsx_vilvl_b(_in3, _in1); \ - _t2 = __lsx_vilvl_b(_in6, _in4); \ - _t3 = __lsx_vilvl_b(_in7, _in5); \ - _t4 = __lsx_vilvl_b(_t1, _t0); \ - _t5 = __lsx_vilvh_b(_t1, _t0); \ - _t6 = __lsx_vilvl_b(_t3, _t2); \ - _t7 = __lsx_vilvh_b(_t3, _t2); \ - _out0 = __lsx_vilvl_w(_t6, _t4); \ - _out2 = __lsx_vilvh_w(_t6, _t4); \ - _out4 = __lsx_vilvl_w(_t7, _t5); \ - _out6 = __lsx_vilvh_w(_t7, _t5); \ - _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \ - _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \ - _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \ - _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \ - } - -/* - * ============================================================================= - * Description : Transpose 8x8 block with half-word elements in vectors - * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - * Details : - * Example : - * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70 - * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71 - * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72 - * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73 - * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74 - * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75 - * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76 - * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77 - * ============================================================================= - */ -#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ - \ - _s0 = __lsx_vilvl_h(_in6, _in4); \ - _s1 = __lsx_vilvl_h(_in7, _in5); \ - _t0 = __lsx_vilvl_h(_s1, _s0); \ - _t1 = __lsx_vilvh_h(_s1, _s0); \ - _s0 = __lsx_vilvh_h(_in6, _in4); \ - _s1 = __lsx_vilvh_h(_in7, _in5); \ - _t2 = 
__lsx_vilvl_h(_s1, _s0); \ - _t3 = __lsx_vilvh_h(_s1, _s0); \ - _s0 = __lsx_vilvl_h(_in2, _in0); \ - _s1 = __lsx_vilvl_h(_in3, _in1); \ - _t4 = __lsx_vilvl_h(_s1, _s0); \ - _t5 = __lsx_vilvh_h(_s1, _s0); \ - _s0 = __lsx_vilvh_h(_in2, _in0); \ - _s1 = __lsx_vilvh_h(_in3, _in1); \ - _t6 = __lsx_vilvl_h(_s1, _s0); \ - _t7 = __lsx_vilvh_h(_s1, _s0); \ - \ - _out0 = __lsx_vpickev_d(_t0, _t4); \ - _out2 = __lsx_vpickev_d(_t1, _t5); \ - _out4 = __lsx_vpickev_d(_t2, _t6); \ - _out6 = __lsx_vpickev_d(_t3, _t7); \ - _out1 = __lsx_vpickod_d(_t0, _t4); \ - _out3 = __lsx_vpickod_d(_t1, _t5); \ - _out5 = __lsx_vpickod_d(_t2, _t6); \ - _out7 = __lsx_vpickod_d(_t3, _t7); \ - } - -/* - * ============================================================================= - * Description : Transpose input 8x4 byte block into 4x8 - * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block) - * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block) - * Return Type - as per RTYPE - * Details : The rows of the matrix become columns, and the columns become - * rows. - * Example : LSX_TRANSPOSE8x4_B - * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00 - * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00 - * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00 - * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00 - * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00 - * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00 - * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00 - * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00 - * - * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 - * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 - * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 - * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 - * ============================================================================= - */ -#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3) \ - { \ - __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ - \ - _tmp0_m = __lsx_vpackev_w(_in4, _in0); \ - _tmp1_m = __lsx_vpackev_w(_in5, _in1); \ - _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ - _tmp0_m = __lsx_vpackev_w(_in6, _in2); \ - _tmp1_m = __lsx_vpackev_w(_in7, _in3); \ - \ - _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ - _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \ - _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \ - \ - _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \ - _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \ - _out1 = __lsx_vilvh_d(_out2, _out0); \ - _out3 = __lsx_vilvh_d(_out0, _out2); \ - } - -/* - * ============================================================================= - * Description : Transpose 16x8 block with byte elements in vectors - * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8 - * in9, in10, in11, in12, in13, in14, in15 - * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - * Details : - * Example : - * 000,001,002,003,004,005,006,007 - * 008,009,010,011,012,013,014,015 - * 016,017,018,019,020,021,022,023 - * 024,025,026,027,028,029,030,031 - * 032,033,034,035,036,037,038,039 - * 040,041,042,043,044,045,046,047 000,008,...,112,120 - * 048,049,050,051,052,053,054,055 001,009,...,113,121 - * 056,057,058,059,060,061,062,063 to 002,010,...,114,122 - * 064,068,066,067,068,069,070,071 =====> 003,011,...,115,123 - * 072,073,074,075,076,077,078,079 004,012,...,116,124 - * 080,081,082,083,084,085,086,087 005,013,...,117,125 - * 088,089,090,091,092,093,094,095 
006,014,...,118,126 - * 096,097,098,099,100,101,102,103 007,015,...,119,127 - * 104,105,106,107,108,109,110,111 - * 112,113,114,115,116,117,118,119 - * 120,121,122,123,124,125,126,127 - * ============================================================================= - */ -#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ - _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ - _out6, _out7) \ - { \ - __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ - __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ - DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \ - _tmp0, _tmp1, _tmp2, _tmp3); \ - DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \ - _in13, _tmp4, _tmp5, _tmp6, _tmp7); \ - DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \ - DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \ - DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \ - DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \ - DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \ - DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \ - DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \ - DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \ - DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \ - DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \ - DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \ - DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \ - } - -/* - * ============================================================================= - * Description : Butterfly of 4 input vectors - * Arguments : Inputs - in0, in1, in2, in3 - * Outputs - out0, out1, out2, out3 - * Details : Butterfly operation - * Example : - * out0 = in0 + in3; - * out1 = in1 + in2; - * out2 = in1 - in2; - * out3 = in0 - in3; - * ============================================================================= - */ -#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ - _out0 = __lsx_vadd_b(_in0, _in3); \ - _out1 = __lsx_vadd_b(_in1, _in2); \ - _out2 = __lsx_vsub_b(_in1, _in2); \ - _out3 = __lsx_vsub_b(_in0, _in3); \ - } -#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ - _out0 = __lsx_vadd_h(_in0, _in3); \ - _out1 = __lsx_vadd_h(_in1, _in2); \ - _out2 = __lsx_vsub_h(_in1, _in2); \ - _out3 = __lsx_vsub_h(_in0, _in3); \ - } -#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ - _out0 = __lsx_vadd_w(_in0, _in3); \ - _out1 = __lsx_vadd_w(_in1, _in2); \ - _out2 = __lsx_vsub_w(_in1, _in2); \ - _out3 = __lsx_vsub_w(_in0, _in3); \ - } -#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ - _out0 = __lsx_vadd_d(_in0, _in3); \ - _out1 = __lsx_vadd_d(_in1, _in2); \ - _out2 = __lsx_vsub_d(_in1, _in2); \ - _out3 = __lsx_vsub_d(_in0, _in3); \ - } - -/* - * ============================================================================= - * Description : Butterfly of 8 input vectors - * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ - * Outputs - _out0, _out1, _out2, _out3, ~ - * Details : Butterfly operation - * Example : - * _out0 = _in0 + _in7; - * _out1 = _in1 + _in6; - * _out2 = _in2 + _in5; - * _out3 = _in3 + _in4; - * _out4 = _in3 - _in4; - * _out5 = _in2 - _in5; - * _out6 = _in1 - _in6; - * _out7 = _in0 - _in7; - * 
============================================================================= - */ -#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - _out0 = __lsx_vadd_b(_in0, _in7); \ - _out1 = __lsx_vadd_b(_in1, _in6); \ - _out2 = __lsx_vadd_b(_in2, _in5); \ - _out3 = __lsx_vadd_b(_in3, _in4); \ - _out4 = __lsx_vsub_b(_in3, _in4); \ - _out5 = __lsx_vsub_b(_in2, _in5); \ - _out6 = __lsx_vsub_b(_in1, _in6); \ - _out7 = __lsx_vsub_b(_in0, _in7); \ - } - -#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - _out0 = __lsx_vadd_h(_in0, _in7); \ - _out1 = __lsx_vadd_h(_in1, _in6); \ - _out2 = __lsx_vadd_h(_in2, _in5); \ - _out3 = __lsx_vadd_h(_in3, _in4); \ - _out4 = __lsx_vsub_h(_in3, _in4); \ - _out5 = __lsx_vsub_h(_in2, _in5); \ - _out6 = __lsx_vsub_h(_in1, _in6); \ - _out7 = __lsx_vsub_h(_in0, _in7); \ - } - -#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - _out0 = __lsx_vadd_w(_in0, _in7); \ - _out1 = __lsx_vadd_w(_in1, _in6); \ - _out2 = __lsx_vadd_w(_in2, _in5); \ - _out3 = __lsx_vadd_w(_in3, _in4); \ - _out4 = __lsx_vsub_w(_in3, _in4); \ - _out5 = __lsx_vsub_w(_in2, _in5); \ - _out6 = __lsx_vsub_w(_in1, _in6); \ - _out7 = __lsx_vsub_w(_in0, _in7); \ - } - -#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - _out0 = __lsx_vadd_d(_in0, _in7); \ - _out1 = __lsx_vadd_d(_in1, _in6); \ - _out2 = __lsx_vadd_d(_in2, _in5); \ - _out3 = __lsx_vadd_d(_in3, _in4); \ - _out4 = __lsx_vsub_d(_in3, _in4); \ - _out5 = __lsx_vsub_d(_in2, _in5); \ - _out6 = __lsx_vsub_d(_in1, _in6); \ - _out7 = __lsx_vsub_d(_in0, _in7); \ - } - -#endif // LSX - -#ifdef __loongarch_asx -#include -/* - * ============================================================================= - * Description : Dot product of byte vector elements - * Arguments : Inputs - in_h, in_l - * Output - out - * Return Type - signed halfword - * Details : Unsigned byte elements from in_h are multiplied with - * unsigned byte elements from in_l producing a result - * twice the size of input i.e. signed halfword. - * Then this multiplied results of adjacent odd-even elements - * are added to the out vector - * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvmulwev_h_bu(in_h, in_l); - out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of byte vector elements - * Arguments : Inputs - in_h, in_l - * Output - out - * Return Type - signed halfword - * Details : Signed byte elements from in_h are multiplied with - * signed byte elements from in_l producing a result - * twice the size of input i.e. signed halfword. 
- * Then this multiplication results of adjacent odd-even elements - * are added to the out vector - * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvmulwev_h_b(in_h, in_l); - out = __lasx_xvmaddwod_h_b(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of halfword vector elements - * Arguments : Inputs - in_h, in_l - * Output - out - * Return Type - signed word - * Details : Signed halfword elements from in_h are multiplied with - * signed halfword elements from in_l producing a result - * twice the size of input i.e. signed word. - * Then this multiplied results of adjacent odd-even elements - * are added to the out vector. - * Example : out = __lasx_xvdp2_w_h(in_h, in_l) - * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 - * out : 22,38,38,22, 22,38,38,22 - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvmulwev_w_h(in_h, in_l); - out = __lasx_xvmaddwod_w_h(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of word vector elements - * Arguments : Inputs - in_h, in_l - * Output - out - * Return Type - signed double - * Details : Signed word elements from in_h are multiplied with - * signed word elements from in_l producing a result - * twice the size of input i.e. signed double-word. - * Then this multiplied results of adjacent odd-even elements - * are added to the out vector. - * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvmulwev_d_w(in_h, in_l); - out = __lasx_xvmaddwod_d_w(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of halfword vector elements - * Arguments : Inputs - in_h, in_l - * Output - out - * Return Type - signed word - * Details : Unsigned halfword elements from in_h are multiplied with - * signed halfword elements from in_l producing a result - * twice the size of input i.e. unsigned word. - * Multiplication result of adjacent odd-even elements - * are added to the out vector - * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvmulwev_w_hu_h(in_h, in_l); - out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product & addition of byte vector elements - * Arguments : Inputs - in_h, in_l - * Output - out - * Return Type - halfword - * Details : Signed byte elements from in_h are multiplied with - * signed byte elements from in_l producing a result - * twice the size of input i.e. signed halfword. - * Then this multiplied results of adjacent odd-even elements - * are added to the in_c vector. 
- * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i out; - - out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l); - out = __lasx_xvmaddwod_h_b(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product & addition of byte vector elements - * Arguments : Inputs - in_h, in_l - * Output - out - * Return Type - halfword - * Details : Unsigned byte elements from in_h are multiplied with - * unsigned byte elements from in_l producing a result - * twice the size of input i.e. signed halfword. - * Then this multiplied results of adjacent odd-even elements - * are added to the in_c vector. - * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i out; - - out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l); - out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product & addition of byte vector elements - * Arguments : Inputs - in_h, in_l - * Output - out - * Return Type - halfword - * Details : Unsigned byte elements from in_h are multiplied with - * signed byte elements from in_l producing a result - * twice the size of input i.e. signed halfword. - * Then this multiplied results of adjacent odd-even elements - * are added to the in_c vector. - * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i out; - - out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l); - out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of halfword vector elements - * Arguments : Inputs - in_c, in_h, in_l - * Output - out - * Return Type - per RTYPE - * Details : Signed halfword elements from in_h are multiplied with - * signed halfword elements from in_l producing a result - * twice the size of input i.e. signed word. - * Multiplication result of adjacent odd-even elements - * are added to the in_c vector. - * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) - * in_c : 1,2,3,4, 1,2,3,4 - * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8, - * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1, - * out : 23,40,41,26, 23,40,41,26 - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i out; - - out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l); - out = __lasx_xvmaddwod_w_h(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of halfword vector elements - * Arguments : Inputs - in_c, in_h, in_l - * Output - out - * Return Type - signed word - * Details : Unsigned halfword elements from in_h are multiplied with - * unsigned halfword elements from in_l producing a result - * twice the size of input i.e. signed word. 
- * Multiplication result of adjacent odd-even elements - * are added to the in_c vector. - * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i out; - - out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l); - out = __lasx_xvmaddwod_w_hu(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of halfword vector elements - * Arguments : Inputs - in_c, in_h, in_l - * Output - out - * Return Type - signed word - * Details : Unsigned halfword elements from in_h are multiplied with - * signed halfword elements from in_l producing a result - * twice the size of input i.e. signed word. - * Multiplication result of adjacent odd-even elements - * are added to the in_c vector - * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i out; - - out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l); - out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); - return out; -} - -/* - * ============================================================================= - * Description : Vector Unsigned Dot Product and Subtract - * Arguments : Inputs - in_c, in_h, in_l - * Output - out - * Return Type - signed halfword - * Details : Unsigned byte elements from in_h are multiplied with - * unsigned byte elements from in_l producing a result - * twice the size of input i.e. signed halfword. - * Multiplication result of adjacent odd-even elements - * are added together and subtracted from double width elements - * in_c vector. - * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i out; - - out = __lasx_xvmulwev_h_bu(in_h, in_l); - out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); - out = __lasx_xvsub_h(in_c, out); - return out; -} - -/* - * ============================================================================= - * Description : Vector Signed Dot Product and Subtract - * Arguments : Inputs - in_c, in_h, in_l - * Output - out - * Return Type - signed word - * Details : Signed halfword elements from in_h are multiplied with - * Signed halfword elements from in_l producing a result - * twice the size of input i.e. signed word. - * Multiplication result of adjacent odd-even elements - * are added together and subtracted from double width elements - * in_c vector. 
- * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) - * in_c : 0,0,0,0, 0,0,0,0 - * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1 - * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1 - * out : -7,-3,0,0, 0,-1,0,-1 - * ============================================================================= - */ -static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i out; - - out = __lasx_xvmulwev_w_h(in_h, in_l); - out = __lasx_xvmaddwod_w_h(out, in_h, in_l); - out = __lasx_xvsub_w(in_c, out); - return out; -} - -/* - * ============================================================================= - * Description : Dot product of halfword vector elements - * Arguments : Inputs - in_h, in_l - * Output - out - * Return Type - signed word - * Details : Signed halfword elements from in_h are multiplied with - * signed halfword elements from in_l producing a result - * four times the size of input i.e. signed doubleword. - * Then this multiplication results of four adjacent elements - * are added together and stored to the out vector. - * Example : out = __lasx_xvdp4_d_h(in_h, in_l) - * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 - * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1 - * out : -2,0,1,1 - * ============================================================================= - */ -static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvmulwev_w_h(in_h, in_l); - out = __lasx_xvmaddwod_w_h(out, in_h, in_l); - out = __lasx_xvhaddw_d_w(out, out); - return out; -} - -/* - * ============================================================================= - * Description : The high half of the vector elements are expanded and - * added after being doubled. - * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The in_h vector and the in_l vector are added after the - * higher half of the two-fold sign extension (signed byte - * to signed halfword) and stored to the out vector. - * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvilvh_b(in_h, in_l); - out = __lasx_xvhaddw_h_b(out, out); - return out; -} - -/* - * ============================================================================= - * Description : The high half of the vector elements are expanded and - * added after being doubled. - * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The in_h vector and the in_l vector are added after the - * higher half of the two-fold sign extension (signed halfword - * to signed word) and stored to the out vector. - * Example : out = __lasx_xvaddwh_w_h(in_h, in_l) - * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 - * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 - * out : 1,0,0,-1, 1,0,0, 2 - * ============================================================================= - */ -static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvilvh_h(in_h, in_l); - out = __lasx_xvhaddw_w_h(out, out); - return out; -} - -/* - * ============================================================================= - * Description : The low half of the vector elements are expanded and - * added after being doubled. 
- * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The in_h vector and the in_l vector are added after the - * lower half of the two-fold sign extension (signed byte - * to signed halfword) and stored to the out vector. - * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvilvl_b(in_h, in_l); - out = __lasx_xvhaddw_h_b(out, out); - return out; -} - -/* - * ============================================================================= - * Description : The low half of the vector elements are expanded and - * added after being doubled. - * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The in_h vector and the in_l vector are added after the - * lower half of the two-fold sign extension (signed halfword - * to signed word) and stored to the out vector. - * Example : out = __lasx_xvaddwl_w_h(in_h, in_l) - * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 - * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 - * out : 5,-1,4,2, 1,0,2,-1 - * ============================================================================= - */ -static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvilvl_h(in_h, in_l); - out = __lasx_xvhaddw_w_h(out, out); - return out; -} - -/* - * ============================================================================= - * Description : The low half of the vector elements are expanded and - * added after being doubled. - * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The out vector and the out vector are added after the - * lower half of the two-fold zero extension (unsigned byte - * to unsigned halfword) and stored to the out vector. - * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvilvl_b(in_h, in_l); - out = __lasx_xvhaddw_hu_bu(out, out); - return out; -} - -/* - * ============================================================================= - * Description : The low half of the vector elements are expanded and - * added after being doubled. - * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The in_l vector after double zero extension (unsigned byte to - * signed halfword),added to the in_h vector. - * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvsllwil_hu_bu(in_l, 0); - out = __lasx_xvadd_h(in_h, out); - return out; -} - -/* - * ============================================================================= - * Description : The low half of the vector elements are expanded and - * added after being doubled. - * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The in_l vector after double sign extension (signed halfword to - * signed word), added to the in_h vector. 
- * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l) - * in_h : 0, 1,0,0, -1,0,0,1, - * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1, - * out : 2, 0,1,2, -1,0,1,1, - * ============================================================================= - */ -static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) { - __m256i out; - - out = __lasx_xvsllwil_w_h(in_l, 0); - out = __lasx_xvadd_w(in_h, out); - return out; -} - -/* - * ============================================================================= - * Description : Multiplication and addition calculation after expansion - * of the lower half of the vector. - * Arguments : Inputs - in_c, in_h, in_l - * Output - out - * Details : The in_h vector and the in_l vector are multiplied after - * the lower half of the two-fold sign extension (signed halfword - * to signed word), and the result is added to the vector in_c, - * then stored to the out vector. - * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) - * in_c : 1,2,3,4, 5,6,7,8 - * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8 - * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000, - * -200,-300,-400,-500, -2000,-3000,-4000,-5000 - * out : 201, 602,1203,2004, -995, -1794,-2793,-3992 - * ============================================================================= - */ -static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i tmp0, tmp1, out; - - tmp0 = __lasx_xvsllwil_w_h(in_h, 0); - tmp1 = __lasx_xvsllwil_w_h(in_l, 0); - tmp0 = __lasx_xvmul_w(tmp0, tmp1); - out = __lasx_xvadd_w(tmp0, in_c); - return out; -} - -/* - * ============================================================================= - * Description : Multiplication and addition calculation after expansion - * of the higher half of the vector. - * Arguments : Inputs - in_c, in_h, in_l - * Output - out - * Details : The in_h vector and the in_l vector are multiplied after - * the higher half of the two-fold sign extension (signed - * halfword to signed word), and the result is added to - * the vector in_c, then stored to the out vector. - * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) - * ============================================================================= - */ -static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, - __m256i in_h, - __m256i in_l) { - __m256i tmp0, tmp1, out; - - tmp0 = __lasx_xvilvh_h(in_h, in_h); - tmp1 = __lasx_xvilvh_h(in_l, in_l); - tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1); - out = __lasx_xvadd_w(tmp0, in_c); - return out; -} - -/* - * ============================================================================= - * Description : Multiplication calculation after expansion of the lower - * half of the vector. - * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The in_h vector and the in_l vector are multiplied after - * the lower half of the two-fold sign extension (signed - * halfword to signed word), then stored to the out vector. 
- * Example : out = __lasx_xvmulwl_w_h(in_h, in_l) - * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 - * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 - * out : 6,1,3,0, 0,0,1,0 - * ============================================================================= - */ -static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) { - __m256i tmp0, tmp1, out; - - tmp0 = __lasx_xvsllwil_w_h(in_h, 0); - tmp1 = __lasx_xvsllwil_w_h(in_l, 0); - out = __lasx_xvmul_w(tmp0, tmp1); - return out; -} - -/* - * ============================================================================= - * Description : Multiplication calculation after expansion of the lower - * half of the vector. - * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The in_h vector and the in_l vector are multiplied after - * the lower half of the two-fold sign extension (signed - * halfword to signed word), then stored to the out vector. - * Example : out = __lasx_xvmulwh_w_h(in_h, in_l) - * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 - * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 - * out : 0,0,0,0, 0,0,0,1 - * ============================================================================= - */ -static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) { - __m256i tmp0, tmp1, out; - - tmp0 = __lasx_xvilvh_h(in_h, in_h); - tmp1 = __lasx_xvilvh_h(in_l, in_l); - out = __lasx_xvmulwev_w_h(tmp0, tmp1); - return out; -} - -/* - * ============================================================================= - * Description : The low half of the vector elements are added to the high half - * after being doubled, then saturated. - * Arguments : Inputs - in_h, in_l - * Output - out - * Details : The in_h vector adds the in_l vector after the lower half of - * the two-fold zero extension (unsigned byte to unsigned - * halfword) and then saturated. The results are stored to the out - * vector. - * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l) - * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1 - * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, - * 0,0,0,1 - * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2, - * ============================================================================= - */ -static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) { - __m256i tmp1, out; - __m256i zero = {0}; - - tmp1 = __lasx_xvilvl_b(zero, in_l); - out = __lasx_xvsadd_hu(in_h, tmp1); - return out; -} - -/* - * ============================================================================= - * Description : Clip all halfword elements of input vector between min & max - * out = ((in) < (min)) ? (min) : (((in) > (max)) ? 
(max) : (in)) - * Arguments : Inputs - in (input vector) - * - min (min threshold) - * - max (max threshold) - * Outputs - in (output vector with clipped elements) - * Return Type - signed halfword - * Example : out = __lasx_xvclip_h(in, min, max) - * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5 - * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 - * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9 - * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5 - * ============================================================================= - */ -static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) { - __m256i out; - - out = __lasx_xvmax_h(min, in); - out = __lasx_xvmin_h(max, out); - return out; -} - -/* - * ============================================================================= - * Description : Clip all signed halfword elements of input vector - * between 0 & 255 - * Arguments : Inputs - in (input vector) - * Outputs - out (output vector with clipped elements) - * Return Type - signed halfword - * Example : See out = __lasx_xvclip255_w(in) - * ============================================================================= - */ -static inline __m256i __lasx_xvclip255_h(__m256i in) { - __m256i out; - - out = __lasx_xvmaxi_h(in, 0); - out = __lasx_xvsat_hu(out, 7); - return out; -} - -/* - * ============================================================================= - * Description : Clip all signed word elements of input vector - * between 0 & 255 - * Arguments : Inputs - in (input vector) - * Output - out (output vector with clipped elements) - * Return Type - signed word - * Example : out = __lasx_xvclip255_w(in) - * in : -8,255,280,249, -8,255,280,249 - * out : 0,255,255,249, 0,255,255,249 - * ============================================================================= - */ -static inline __m256i __lasx_xvclip255_w(__m256i in) { - __m256i out; - - out = __lasx_xvmaxi_w(in, 0); - out = __lasx_xvsat_wu(out, 7); - return out; -} - -/* - * ============================================================================= - * Description : Indexed halfword element values are replicated to all - * elements in output vector. If 'idx < 8' use xvsplati_l_*, - * if 'idx >= 8' use xvsplati_h_*. - * Arguments : Inputs - in, idx - * Output - out - * Details : Idx element value from in vector is replicated to all - * elements in out vector. - * Valid index range for halfword operation is 0-7 - * Example : out = __lasx_xvsplati_l_h(in, idx) - * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0 - * idx : 0x02 - * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11 - * ============================================================================= - */ -static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) { - __m256i out; - - out = __lasx_xvpermi_q(in, in, 0x02); - out = __lasx_xvreplve_h(out, idx); - return out; -} - -/* - * ============================================================================= - * Description : Indexed halfword element values are replicated to all - * elements in output vector. If 'idx < 8' use xvsplati_l_*, - * if 'idx >= 8' use xvsplati_h_*. - * Arguments : Inputs - in, idx - * Output - out - * Details : Idx element value from in vector is replicated to all - * elements in out vector. 
- * Valid index range for halfword operation is 0-7 - * Example : out = __lasx_xvsplati_h_h(in, idx) - * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0 - * idx : 0x09 - * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 - * ============================================================================= - */ -static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { - __m256i out; - - out = __lasx_xvpermi_q(in, in, 0x13); - out = __lasx_xvreplve_h(out, idx); - return out; -} - -/* - * ============================================================================= - * Description : Transpose 4x4 block with double-word elements in vectors - * Arguments : Inputs - _in0, _in1, _in2, _in3 - * Outputs - _out0, _out1, _out2, _out3 - * Example : LASX_TRANSPOSE4x4_D - * _in0 : 1,2,3,4 - * _in1 : 1,2,3,4 - * _in2 : 1,2,3,4 - * _in3 : 1,2,3,4 - * - * _out0 : 1,1,1,1 - * _out1 : 2,2,2,2 - * _out2 : 3,3,3,3 - * _out3 : 4,4,4,4 - * ============================================================================= - */ -#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ - _out3) \ - { \ - __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ - _tmp0 = __lasx_xvilvl_d(_in1, _in0); \ - _tmp1 = __lasx_xvilvh_d(_in1, _in0); \ - _tmp2 = __lasx_xvilvl_d(_in3, _in2); \ - _tmp3 = __lasx_xvilvh_d(_in3, _in2); \ - _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \ - _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \ - _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \ - _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \ - } - -/* - * ============================================================================= - * Description : Transpose 8x8 block with word elements in vectors - * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 - * Example : LASX_TRANSPOSE8x8_W - * _in0 : 1,2,3,4,5,6,7,8 - * _in1 : 2,2,3,4,5,6,7,8 - * _in2 : 3,2,3,4,5,6,7,8 - * _in3 : 4,2,3,4,5,6,7,8 - * _in4 : 5,2,3,4,5,6,7,8 - * _in5 : 6,2,3,4,5,6,7,8 - * _in6 : 7,2,3,4,5,6,7,8 - * _in7 : 8,2,3,4,5,6,7,8 - * - * _out0 : 1,2,3,4,5,6,7,8 - * _out1 : 2,2,2,2,2,2,2,2 - * _out2 : 3,3,3,3,3,3,3,3 - * _out3 : 4,4,4,4,4,4,4,4 - * _out4 : 5,5,5,5,5,5,5,5 - * _out5 : 6,6,6,6,6,6,6,6 - * _out6 : 7,7,7,7,7,7,7,7 - * _out7 : 8,8,8,8,8,8,8,8 - * ============================================================================= - */ -#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - __m256i _s0_m, _s1_m; \ - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ - \ - _s0_m = __lasx_xvilvl_w(_in2, _in0); \ - _s1_m = __lasx_xvilvl_w(_in3, _in1); \ - _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ - _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ - _s0_m = __lasx_xvilvh_w(_in2, _in0); \ - _s1_m = __lasx_xvilvh_w(_in3, _in1); \ - _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ - _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ - _s0_m = __lasx_xvilvl_w(_in6, _in4); \ - _s1_m = __lasx_xvilvl_w(_in7, _in5); \ - _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ - _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ - _s0_m = __lasx_xvilvh_w(_in6, _in4); \ - _s1_m = __lasx_xvilvh_w(_in7, _in5); \ - _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ - _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ - _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \ - _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \ - _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \ - _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); 
\ - _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \ - _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \ - _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \ - _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \ - } - -/* - * ============================================================================= - * Description : Transpose input 16x8 byte block - * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, - * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 - * (input 16x8 byte block) - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 (output 8x16 byte block) - * Details : The rows of the matrix become columns, and the columns become - * rows. - * Example : See LASX_TRANSPOSE16x8_H - * ============================================================================= - */ -#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ - _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ - _out6, _out7) \ - { \ - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ - \ - _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ - _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ - _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ - _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ - _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \ - _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \ - _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \ - _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \ - _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ - _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ - _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ - _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ - _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \ - _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \ - _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \ - _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \ - _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \ - _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \ - _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \ - _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \ - _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \ - _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \ - _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \ - _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \ - _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \ - _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \ - _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \ - _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \ - _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \ - _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \ - _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \ - _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \ - } - -/* - * ============================================================================= - * Description : Transpose input 16x8 byte block - * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, - * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 - * (input 16x8 byte block) - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 (output 8x16 byte block) - * Details : The rows of the matrix become columns, and the columns become - * rows. 
- * Example : LASX_TRANSPOSE16x8_H - * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 - * - * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6 - * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 - * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 - * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4 - * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 - * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 - * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 - * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 - * ============================================================================= - */ -#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ - _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ - _out6, _out7) \ - { \ - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ - __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ - \ - _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \ - _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \ - _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \ - _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \ - _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \ - _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \ - _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \ - _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \ - _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ - _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ - _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ - _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ - _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ - _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ - _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ - _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ - _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ - _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ - _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ - _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ - _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ - _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ - _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ - _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ - _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ - _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ - _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ - _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ - \ - _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \ - _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \ - _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \ - _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \ - _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \ - _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \ - _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \ - _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \ - _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ - _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ - _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ - _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ - _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ - _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ - _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ - _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ - _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ - _tmp2_m = 
__lasx_xvilvh_d(_t2, _t0); \ - _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ - _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ - _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ - _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ - _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ - _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ - _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ - _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ - _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ - _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ - } - -/* - * ============================================================================= - * Description : Transpose 4x4 block with halfword elements in vectors - * Arguments : Inputs - _in0, _in1, _in2, _in3 - * Outputs - _out0, _out1, _out2, _out3 - * Return Type - signed halfword - * Details : The rows of the matrix become columns, and the columns become - * rows. - * Example : See LASX_TRANSPOSE8x8_H - * ============================================================================= - */ -#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ - _out3) \ - { \ - __m256i _s0_m, _s1_m; \ - \ - _s0_m = __lasx_xvilvl_h(_in1, _in0); \ - _s1_m = __lasx_xvilvl_h(_in3, _in2); \ - _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \ - _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \ - _out1 = __lasx_xvilvh_d(_out0, _out0); \ - _out3 = __lasx_xvilvh_d(_out2, _out2); \ - } - -/* - * ============================================================================= - * Description : Transpose input 8x8 byte block - * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 - * (input 8x8 byte block) - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 (output 8x8 byte block) - * Example : See LASX_TRANSPOSE8x8_H - * ============================================================================= - */ -#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ - _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ - _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ - _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ - _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ - _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ - _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ - _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ - _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ - _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \ - _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \ - _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \ - _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \ - _out1 = __lasx_xvbsrl_v(_out0, 8); \ - _out3 = __lasx_xvbsrl_v(_out2, 8); \ - _out5 = __lasx_xvbsrl_v(_out4, 8); \ - _out7 = __lasx_xvbsrl_v(_out6, 8); \ - } - -/* - * ============================================================================= - * Description : Transpose 8x8 block with halfword elements in vectors. - * Arguments : Inputs - _in0, _in1, ~ - * Outputs - _out0, _out1, ~ - * Details : The rows of the matrix become columns, and the columns become - * rows. 
- * Example : LASX_TRANSPOSE8x8_H - * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 - * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 - * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 - * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 - * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 - * - * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9 - * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 - * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3 - * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4 - * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5 - * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6 - * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7 - * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8 - * ============================================================================= - */ -#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - __m256i _s0_m, _s1_m; \ - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ - \ - _s0_m = __lasx_xvilvl_h(_in6, _in4); \ - _s1_m = __lasx_xvilvl_h(_in7, _in5); \ - _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ - _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ - _s0_m = __lasx_xvilvh_h(_in6, _in4); \ - _s1_m = __lasx_xvilvh_h(_in7, _in5); \ - _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ - _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ - \ - _s0_m = __lasx_xvilvl_h(_in2, _in0); \ - _s1_m = __lasx_xvilvl_h(_in3, _in1); \ - _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ - _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ - _s0_m = __lasx_xvilvh_h(_in2, _in0); \ - _s1_m = __lasx_xvilvh_h(_in3, _in1); \ - _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ - _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ - \ - _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \ - _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \ - _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \ - _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \ - _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \ - _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \ - _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \ - _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \ - } - -/* - * ============================================================================= - * Description : Butterfly of 4 input vectors - * Arguments : Inputs - _in0, _in1, _in2, _in3 - * Outputs - _out0, _out1, _out2, _out3 - * Details : Butterfly operation - * Example : LASX_BUTTERFLY_4 - * _out0 = _in0 + _in3; - * _out1 = _in1 + _in2; - * _out2 = _in1 - _in2; - * _out3 = _in0 - _in3; - * ============================================================================= - */ -#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ - _out0 = __lasx_xvadd_b(_in0, _in3); \ - _out1 = __lasx_xvadd_b(_in1, _in2); \ - _out2 = __lasx_xvsub_b(_in1, _in2); \ - _out3 = __lasx_xvsub_b(_in0, _in3); \ - } -#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ - _out0 = __lasx_xvadd_h(_in0, _in3); \ - _out1 = __lasx_xvadd_h(_in1, _in2); \ - _out2 = __lasx_xvsub_h(_in1, _in2); \ - _out3 = __lasx_xvsub_h(_in0, _in3); \ - } -#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ - _out0 = __lasx_xvadd_w(_in0, _in3); \ - _out1 = __lasx_xvadd_w(_in1, _in2); \ - _out2 = __lasx_xvsub_w(_in1, _in2); \ - _out3 = __lasx_xvsub_w(_in0, _in3); \ - } -#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { 
\ - _out0 = __lasx_xvadd_d(_in0, _in3); \ - _out1 = __lasx_xvadd_d(_in1, _in2); \ - _out2 = __lasx_xvsub_d(_in1, _in2); \ - _out3 = __lasx_xvsub_d(_in0, _in3); \ - } - -/* - * ============================================================================= - * Description : Butterfly of 8 input vectors - * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ - * Outputs - _out0, _out1, _out2, _out3, ~ - * Details : Butterfly operation - * Example : LASX_BUTTERFLY_8 - * _out0 = _in0 + _in7; - * _out1 = _in1 + _in6; - * _out2 = _in2 + _in5; - * _out3 = _in3 + _in4; - * _out4 = _in3 - _in4; - * _out5 = _in2 - _in5; - * _out6 = _in1 - _in6; - * _out7 = _in0 - _in7; - * ============================================================================= - */ -#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - _out0 = __lasx_xvadd_b(_in0, _in7); \ - _out1 = __lasx_xvadd_b(_in1, _in6); \ - _out2 = __lasx_xvadd_b(_in2, _in5); \ - _out3 = __lasx_xvadd_b(_in3, _in4); \ - _out4 = __lasx_xvsub_b(_in3, _in4); \ - _out5 = __lasx_xvsub_b(_in2, _in5); \ - _out6 = __lasx_xvsub_b(_in1, _in6); \ - _out7 = __lasx_xvsub_b(_in0, _in7); \ - } - -#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - _out0 = __lasx_xvadd_h(_in0, _in7); \ - _out1 = __lasx_xvadd_h(_in1, _in6); \ - _out2 = __lasx_xvadd_h(_in2, _in5); \ - _out3 = __lasx_xvadd_h(_in3, _in4); \ - _out4 = __lasx_xvsub_h(_in3, _in4); \ - _out5 = __lasx_xvsub_h(_in2, _in5); \ - _out6 = __lasx_xvsub_h(_in1, _in6); \ - _out7 = __lasx_xvsub_h(_in0, _in7); \ - } - -#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - _out0 = __lasx_xvadd_w(_in0, _in7); \ - _out1 = __lasx_xvadd_w(_in1, _in6); \ - _out2 = __lasx_xvadd_w(_in2, _in5); \ - _out3 = __lasx_xvadd_w(_in3, _in4); \ - _out4 = __lasx_xvsub_w(_in3, _in4); \ - _out5 = __lasx_xvsub_w(_in2, _in5); \ - _out6 = __lasx_xvsub_w(_in1, _in6); \ - _out7 = __lasx_xvsub_w(_in0, _in7); \ - } - -#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ - _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ - _out7) \ - { \ - _out0 = __lasx_xvadd_d(_in0, _in7); \ - _out1 = __lasx_xvadd_d(_in1, _in6); \ - _out2 = __lasx_xvadd_d(_in2, _in5); \ - _out3 = __lasx_xvadd_d(_in3, _in4); \ - _out4 = __lasx_xvsub_d(_in3, _in4); \ - _out5 = __lasx_xvsub_d(_in2, _in5); \ - _out6 = __lasx_xvsub_d(_in1, _in6); \ - _out7 = __lasx_xvsub_d(_in0, _in7); \ - } - -#endif // LASX - -/* - * ============================================================================= - * Description : Print out elements in vector. - * Arguments : Inputs - RTYPE, _element_num, _in0, _enter - * Outputs - - * Details : Print out '_element_num' elements in 'RTYPE' vector '_in0', if - * '_enter' is TRUE, prefix "\nVP:" will be added first. 
- * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4 - * VP:1,2,3,4, - * ============================================================================= - */ -#define VECT_PRINT(RTYPE, element_num, in0, enter) \ - { \ - RTYPE _tmp0 = (RTYPE)in0; \ - int _i = 0; \ - if (enter) \ - printf("\nVP:"); \ - for (_i = 0; _i < element_num; _i++) \ - printf("%d,", _tmp0[_i]); \ - } - -#endif /* LOONGSON_INTRINSICS_H */ -#endif /* INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H */ diff --git a/drivers/media/pci/tbscapture2/include/libyuv/macros_msa.h b/drivers/media/pci/tbscapture2/include/libyuv/macros_msa.h deleted file mode 100644 index 6434a4da0537..000000000000 --- a/drivers/media/pci/tbscapture2/include/libyuv/macros_msa.h +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_ -#define INCLUDE_LIBYUV_MACROS_MSA_H_ - -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include -#include - -#if (__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ - uint32_t val_m; \ - asm("lw %[val_m], %[psrc_lw_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_lw_m] "m"(*psrc_lw_m)); \ - val_m; \ - }) - -#if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ - uint64_t val_m = 0; \ - asm("ld %[val_m], %[psrc_ld_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_ld_m] "m"(*psrc_ld_m)); \ - val_m; \ - }) -#else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - val0_m = LW(psrc_ld_m); \ - val1_m = LW(psrc_ld_m + 4); \ - val_m = (uint64_t)(val1_m); /* NOLINT */ \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ - val_m; \ - }) -#endif // (__mips == 64) - -#define SW(val, pdst) \ - ({ \ - uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ - uint32_t val_m = (val); \ - asm("sw %[val_m], %[pdst_sw_m] \n" \ - : [pdst_sw_m] "=m"(*pdst_sw_m) \ - : [val_m] "r"(val_m)); \ - }) - -#if (__mips == 64) -#define SD(val, pdst) \ - ({ \ - uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ - uint64_t val_m = (val); \ - asm("sd %[val_m], %[pdst_sd_m] \n" \ - : [pdst_sd_m] "=m"(*pdst_sd_m) \ - : [val_m] "r"(val_m)); \ - }) -#else // !(__mips == 64) -#define SD(val, pdst) \ - ({ \ - uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ - uint32_t val0_m, val1_m; \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - SW(val0_m, pdst_sd_m); \ - SW(val1_m, pdst_sd_m + 4); \ - }) -#endif // !(__mips == 64) -#else // !(__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - uint8_t* psrc_lw_m = (uint8_t*)(psrc); \ - uint32_t val_lw_m; \ - \ - asm("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ - "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ - \ - : [val_lw_m] "=&r"(val_lw_m) \ - : [psrc_lw_m] "r"(psrc_lw_m)); \ - \ - val_lw_m; \ - }) - -#if (__mips == 64) -#define LD(psrc) \ - ({ \ - uint8_t* psrc_ld_m = (uint8_t*)(psrc); \ - uint64_t val_ld_m = 0; \ - \ - asm("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" 
\ - "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ - \ - : [val_ld_m] "=&r"(val_ld_m) \ - : [psrc_ld_m] "r"(psrc_ld_m)); \ - \ - val_ld_m; \ - }) -#else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - val0_m = LW(psrc_ld_m); \ - val1_m = LW(psrc_ld_m + 4); \ - val_m = (uint64_t)(val1_m); /* NOLINT */ \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ - val_m; \ - }) -#endif // (__mips == 64) - -#define SW(val, pdst) \ - ({ \ - uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ - uint32_t val_m = (val); \ - asm("usw %[val_m], %[pdst_sw_m] \n" \ - : [pdst_sw_m] "=m"(*pdst_sw_m) \ - : [val_m] "r"(val_m)); \ - }) - -#define SD(val, pdst) \ - ({ \ - uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ - uint32_t val0_m, val1_m; \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - SW(val0_m, pdst_sd_m); \ - SW(val1_m, pdst_sd_m + 4); \ - }) -#endif // (__mips_isa_rev >= 6) - -// TODO(fbarchard): Consider removing __VAR_ARGS versions. -#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ -#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__) - -#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ -#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ -#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) - -/* Description : Load two vectors with 16 'byte' sized elements - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Load 16 byte elements in 'out0' from (psrc) - Load 16 byte elements in 'out1' from (psrc + stride) -*/ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ - } -#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__) - -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__) - -/* Description : Store two vectors with stride each having 16 'byte' sized - elements - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 16 byte elements from 'in0' to (pdst) - Store 16 byte elements from 'in1' to (pdst + stride) -*/ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) - -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) - -// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. 
-/* Description : Shuffle byte vector elements as per mask vector - Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'in0' & 'in1' are copied selectively to - 'out0' as per control vector 'mask0' -*/ -#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ - } -#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) - -/* Description : Interleave both left and right half of input vectors - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of byte elements from 'in0' and 'in1' are - interleaved and written to 'out0' -*/ -#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - } -#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) - -#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ - -#endif // INCLUDE_LIBYUV_MACROS_MSA_H_ diff --git a/drivers/media/pci/tbscapture2/other.c b/drivers/media/pci/tbscapture2/other.c index 95af11d67664..de1152d3bfc3 100644 --- a/drivers/media/pci/tbscapture2/other.c +++ b/drivers/media/pci/tbscapture2/other.c @@ -1,15 +1,15 @@ -#include -#include "tbs_pcie-reg.h" -#include "tbs_pcie.h" -void *malloc(size_t __size); -void *malloc(size_t __size) -{ - return kzalloc(__size, GFP_KERNEL); -} -void free(void *__ptr); -void free(void *__ptr) -{ - if(__ptr) - kfree(__ptr); -} - +#include +#include "tbs_pcie-reg.h" +#include "tbs_pcie.h" +void *malloc(size_t __size); +void *malloc(size_t __size) +{ + return kzalloc(__size, GFP_KERNEL); +} +void free(void *__ptr); +void free(void *__ptr) +{ + if(__ptr) + kfree(__ptr); +} + diff --git a/drivers/media/pci/tbscapture2/rotate_lsx.c b/drivers/media/pci/tbscapture2/rotate_lsx.c deleted file mode 100644 index 0edf45300f96..000000000000 --- a/drivers/media/pci/tbscapture2/rotate_lsx.c +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright 2022 The LibYuv Project Authors. All rights reserved. - * - * Copyright (c) 2022 Loongson Technology Corporation Limited - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "rotate_row.h" - -#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) -#include "loongson_intrinsics.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \ - DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \ - } - -#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \ - DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \ - } - -#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \ - DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \ - } - -#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \ - DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \ - } - -#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \ - _stride3, _stride4) \ - { \ - __lsx_vst(_dst0, _dst, 0); \ - __lsx_vstx(_dst1, _dst, _stride); \ - __lsx_vstx(_dst2, _dst, _stride2); \ - __lsx_vstx(_dst3, _dst, _stride3); \ - _dst += _stride4; \ - } - -#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \ - { \ - __lsx_vst(_dst0, _dst, 0); \ - __lsx_vstx(_dst1, _dst, _stride); \ - _dst += _stride2; \ - } - -void TransposeUVWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width); - TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), - dst_stride_a, (dst_b + 8), dst_stride_b, width); -} - -void TransposeWx16_LSX(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - int x; - int len = width / 16; - uint8_t* s; - int src_stride2 = src_stride << 1; - int src_stride3 = src_stride + src_stride2; - int src_stride4 = src_stride2 << 1; - int dst_stride2 = dst_stride << 1; - int dst_stride3 = dst_stride + dst_stride2; - int dst_stride4 = dst_stride2 << 1; - __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; - - for (x = 0; x < len; x++) { - s = (uint8_t*)src; - src0 = __lsx_vld(s, 0); - src1 = __lsx_vldx(s, src_stride); - src2 = __lsx_vldx(s, src_stride2); - src3 = __lsx_vldx(s, src_stride3); - s += src_stride4; - ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); - ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); - src0 = __lsx_vld(s, 0); - src1 = __lsx_vldx(s, src_stride); - src2 = __lsx_vldx(s, src_stride2); - src3 = __lsx_vldx(s, src_stride3); - s += src_stride4; - ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); - ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); - ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); - ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); - src0 = __lsx_vld(s, 0); - src1 = __lsx_vldx(s, src_stride); - src2 = __lsx_vldx(s, src_stride2); - src3 = __lsx_vldx(s, src_stride3); - s += src_stride4; - ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); - ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); - src0 = __lsx_vld(s, 0); - src1 = __lsx_vldx(s, src_stride); - src2 = __lsx_vldx(s, src_stride2); - src3 = __lsx_vldx(s, src_stride3); - s += 
src_stride4; - ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); - ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); - res8 = __lsx_vilvl_w(reg4, reg0); - res9 = __lsx_vilvh_w(reg4, reg0); - ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); - LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, - dst_stride4); - res8 = __lsx_vilvl_w(reg5, reg1); - res9 = __lsx_vilvh_w(reg5, reg1); - ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); - LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, - dst_stride4); - res8 = __lsx_vilvl_w(reg6, reg2); - res9 = __lsx_vilvh_w(reg6, reg2); - ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); - LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, - dst_stride4); - res8 = __lsx_vilvl_w(reg7, reg3); - res9 = __lsx_vilvh_w(reg7, reg3); - ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); - LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, - dst_stride4); - src += 16; - } -} - -void TransposeUVWx16_LSX(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - int x; - int len = width / 8; - uint8_t* s; - int src_stride2 = src_stride << 1; - int src_stride3 = src_stride + src_stride2; - int src_stride4 = src_stride2 << 1; - int dst_stride_a2 = dst_stride_a << 1; - int dst_stride_b2 = dst_stride_b << 1; - __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; - - for (x = 0; x < len; x++) { - s = (uint8_t*)src; - src0 = __lsx_vld(s, 0); - src1 = __lsx_vldx(s, src_stride); - src2 = __lsx_vldx(s, src_stride2); - src3 = __lsx_vldx(s, src_stride3); - s += src_stride4; - ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); - ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); - src0 = __lsx_vld(s, 0); - src1 = __lsx_vldx(s, src_stride); - src2 = __lsx_vldx(s, src_stride2); - src3 = __lsx_vldx(s, src_stride3); - s += src_stride4; - ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); - ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); - ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); - ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); - src0 = __lsx_vld(s, 0); - src1 = __lsx_vldx(s, src_stride); - src2 = __lsx_vldx(s, src_stride2); - src3 = __lsx_vldx(s, src_stride3); - s += src_stride4; - ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); - ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); - src0 = __lsx_vld(s, 0); - src1 = __lsx_vldx(s, src_stride); - src2 = __lsx_vldx(s, src_stride2); - src3 = __lsx_vldx(s, src_stride3); - s += src_stride4; - ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); - ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); - res8 = __lsx_vilvl_w(reg4, reg0); - res9 = __lsx_vilvh_w(reg4, reg0); - ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); - LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); - LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); - res8 = __lsx_vilvl_w(reg5, reg1); - res9 = __lsx_vilvh_w(reg5, reg1); - ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); - LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); - LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); - res8 = __lsx_vilvl_w(reg6, reg2); - res9 = __lsx_vilvh_w(reg6, reg2); - ILVLH_D(res4, res8, res5, res9, 
dst0, dst1, dst2, dst3); - LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); - LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); - res8 = __lsx_vilvl_w(reg7, reg3); - res9 = __lsx_vilvh_w(reg7, reg3); - ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); - LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); - LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); - src += 16; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) diff --git a/drivers/media/pci/tbscapture2/rotate_msa.c b/drivers/media/pci/tbscapture2/rotate_msa.c deleted file mode 100644 index cb8cb8f862a3..000000000000 --- a/drivers/media/pci/tbscapture2/rotate_msa.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate_row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ - out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ - out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ - out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ - } - -#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ - out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ - out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ - out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ - } - -#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ - out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ - out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ - out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ - } - -#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ - out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ - } - -void TransposeUVWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width); - TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), - dst_stride_a, (dst_b + 8), dst_stride_b, width); -} - -void TransposeWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - int x; - const uint8_t* s; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; - - for (x = 0; x < width; x += 16) { - s = src; - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - 
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); - ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); - ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - dst += dst_stride * 4; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); - ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - dst += dst_stride * 4; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); - ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - dst += dst_stride * 4; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); - ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - src += 16; - dst += dst_stride * 4; - } -} - -void TransposeUVWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - int x; - const uint8_t* s; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; - - for (x = 0; x < width; x += 8) { - s = src; - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - 
ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); - ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); - ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); - ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); - ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); - ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - src += 16; - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/drivers/media/pci/tbscapture2/rotate_neon.c b/drivers/media/pci/tbscapture2/rotate_neon.c deleted file mode 100644 index fa873a29c073..000000000000 --- a/drivers/media/pci/tbscapture2/rotate_neon.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate_row.h" -#include "row.h" - -#include "basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -void TransposeWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - const uint8_t* temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. 
starting the counter - // at w-8 allow for this - "sub %[width], #8 \n" - - "1: \n" - "mov %[temp], %[src] \n" - "vld1.8 {d0}, [%[temp]], %[src_stride] \n" - "vld1.8 {d1}, [%[temp]], %[src_stride] \n" - "vld1.8 {d2}, [%[temp]], %[src_stride] \n" - "vld1.8 {d3}, [%[temp]], %[src_stride] \n" - "vld1.8 {d4}, [%[temp]], %[src_stride] \n" - "vld1.8 {d5}, [%[temp]], %[src_stride] \n" - "vld1.8 {d6}, [%[temp]], %[src_stride] \n" - "vld1.8 {d7}, [%[temp]] \n" - "add %[src], #8 \n" - - "vtrn.8 d1, d0 \n" - "vtrn.8 d3, d2 \n" - "vtrn.8 d5, d4 \n" - "vtrn.8 d7, d6 \n" - "subs %[width], #8 \n" - - "vtrn.16 d1, d3 \n" - "vtrn.16 d0, d2 \n" - "vtrn.16 d5, d7 \n" - "vtrn.16 d4, d6 \n" - - "vtrn.32 d1, d5 \n" - "vtrn.32 d0, d4 \n" - "vtrn.32 d3, d7 \n" - "vtrn.32 d2, d6 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - - "mov %[temp], %[dst] \n" - "vst1.8 {d1}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d0}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d3}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d2}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d5}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d4}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d7}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d6}, [%[temp]] \n" - "add %[dst], %[dst], %[dst_stride], lsl #3 \n" - - "bge 1b \n" - : [temp] "=&r"(temp), // %[temp] - [src] "+r"(src), // %[src] - [dst] "+r"(dst), // %[dst] - [width] "+r"(width) // %[width] - : [src_stride] "r"(src_stride), // %[src_stride] - [dst_stride] "r"(dst_stride) // %[dst_stride] - : "memory", "cc", "q0", "q1", "q2", "q3"); -} - -void TransposeUVWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - const uint8_t* temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. 
starting the counter - // at w-8 allow for this - "sub %[width], #8 \n" - - "1: \n" - "mov %[temp], %[src] \n" - "vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n" - "vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n" - "vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n" - "vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n" - "vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n" - "vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n" - "vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n" - "vld2.8 {d22, d23}, [%[temp]] \n" - "add %[src], #8*2 \n" - - "vtrn.8 q1, q0 \n" - "vtrn.8 q3, q2 \n" - "vtrn.8 q9, q8 \n" - "vtrn.8 q11, q10 \n" - "subs %[width], #8 \n" - - "vtrn.16 q1, q3 \n" - "vtrn.16 q0, q2 \n" - "vtrn.16 q9, q11 \n" - "vtrn.16 q8, q10 \n" - - "vtrn.32 q1, q9 \n" - "vtrn.32 q0, q8 \n" - "vtrn.32 q3, q11 \n" - "vtrn.32 q2, q10 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - "vrev16.8 q8, q8 \n" - "vrev16.8 q9, q9 \n" - "vrev16.8 q10, q10 \n" - "vrev16.8 q11, q11 \n" - - "mov %[temp], %[dst_a] \n" - "vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d20}, [%[temp]] \n" - "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" - - "mov %[temp], %[dst_b] \n" - "vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d21}, [%[temp]] \n" - "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" - - "bge 1b \n" - : [temp] "=&r"(temp), // %[temp] - [src] "+r"(src), // %[src] - [dst_a] "+r"(dst_a), // %[dst_a] - [dst_b] "+r"(dst_b), // %[dst_b] - [width] "+r"(width) // %[width] - : [src_stride] "r"(src_stride), // %[src_stride] - [dst_stride_a] "r"(dst_stride_a), // %[dst_stride_a] - [dst_stride_b] "r"(dst_stride_b) // %[dst_stride_b] - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); -} - -// Transpose 32 bit values (ARGB) -void Transpose4x4_32_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - const uint8_t* src1 = src + src_stride; - const uint8_t* src2 = src1 + src_stride; - const uint8_t* src3 = src2 + src_stride; - uint8_t* dst1 = dst + dst_stride; - uint8_t* dst2 = dst1 + dst_stride; - uint8_t* dst3 = dst2 + dst_stride; - asm volatile ( - // Main loop transpose 4x4. Read a column, write a row. - "1: \n" - "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" - "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n" - "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n" - "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n" - "subs %8, %8, #4 \n" // w -= 4 - "vst1.8 {q0}, [%4]! \n" - "vst1.8 {q1}, [%5]! \n" - "vst1.8 {q2}, [%6]! \n" - "vst1.8 {q3}, [%7]! 
\n" - "bgt 1b \n" - - : "+r"(src), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(dst), // %4 - "+r"(dst1), // %5 - "+r"(dst2), // %6 - "+r"(dst3), // %7 - "+r"(width) // %8 - : "r"((ptrdiff_t)(src_stride * 4)) // %9 - : "memory", "cc", "q0", "q1", "q2", "q3"); -} - -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/rotate_neon64.c b/drivers/media/pci/tbscapture2/rotate_neon64.c deleted file mode 100644 index d7dc71de1605..000000000000 --- a/drivers/media/pci/tbscapture2/rotate_neon64.c +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright 2014 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate_row.h" -#include "row.h" - -#include "basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon armv8 64 bit. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -void TransposeWx16_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - const uint8_t* src_temp; - asm volatile ( - "1: \n" - "mov %[src_temp], %[src] \n" - - "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n" - - "add %[src], %[src], #16 \n" - - // Transpose bytes within each 2x2 block. - "trn1 v0.16b, v16.16b, v17.16b \n" - "trn2 v1.16b, v16.16b, v17.16b \n" - "trn1 v2.16b, v18.16b, v19.16b \n" - "trn2 v3.16b, v18.16b, v19.16b \n" - "trn1 v4.16b, v20.16b, v21.16b \n" - "trn2 v5.16b, v20.16b, v21.16b \n" - "trn1 v6.16b, v22.16b, v23.16b \n" - "trn2 v7.16b, v22.16b, v23.16b \n" - "trn1 v8.16b, v24.16b, v25.16b \n" - "trn2 v9.16b, v24.16b, v25.16b \n" - "trn1 v10.16b, v26.16b, v27.16b \n" - "trn2 v11.16b, v26.16b, v27.16b \n" - "trn1 v12.16b, v28.16b, v29.16b \n" - "trn2 v13.16b, v28.16b, v29.16b \n" - "trn1 v14.16b, v30.16b, v31.16b \n" - "trn2 v15.16b, v30.16b, v31.16b \n" - - // Transpose 2x2-byte blocks within each 4x4 block. 
- "trn1 v16.8h, v0.8h, v2.8h \n" - "trn1 v17.8h, v1.8h, v3.8h \n" - "trn2 v18.8h, v0.8h, v2.8h \n" - "trn2 v19.8h, v1.8h, v3.8h \n" - "trn1 v20.8h, v4.8h, v6.8h \n" - "trn1 v21.8h, v5.8h, v7.8h \n" - "trn2 v22.8h, v4.8h, v6.8h \n" - "trn2 v23.8h, v5.8h, v7.8h \n" - "trn1 v24.8h, v8.8h, v10.8h \n" - "trn1 v25.8h, v9.8h, v11.8h \n" - "trn2 v26.8h, v8.8h, v10.8h \n" - "trn2 v27.8h, v9.8h, v11.8h \n" - "trn1 v28.8h, v12.8h, v14.8h \n" - "trn1 v29.8h, v13.8h, v15.8h \n" - "trn2 v30.8h, v12.8h, v14.8h \n" - "trn2 v31.8h, v13.8h, v15.8h \n" - - "subs %w[width], %w[width], #16 \n" - - // Transpose 4x4-byte blocks within each 8x8 block. - "trn1 v0.4s, v16.4s, v20.4s \n" - "trn1 v2.4s, v17.4s, v21.4s \n" - "trn1 v4.4s, v18.4s, v22.4s \n" - "trn1 v6.4s, v19.4s, v23.4s \n" - "trn2 v8.4s, v16.4s, v20.4s \n" - "trn2 v10.4s, v17.4s, v21.4s \n" - "trn2 v12.4s, v18.4s, v22.4s \n" - "trn2 v14.4s, v19.4s, v23.4s \n" - "trn1 v1.4s, v24.4s, v28.4s \n" - "trn1 v3.4s, v25.4s, v29.4s \n" - "trn1 v5.4s, v26.4s, v30.4s \n" - "trn1 v7.4s, v27.4s, v31.4s \n" - "trn2 v9.4s, v24.4s, v28.4s \n" - "trn2 v11.4s, v25.4s, v29.4s \n" - "trn2 v13.4s, v26.4s, v30.4s \n" - "trn2 v15.4s, v27.4s, v31.4s \n" - - // Transpose 8x8-byte blocks and store. - "st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n" - - "b.gt 1b \n" - : [src] "+r"(src), // %[src] - [src_temp] "=&r"(src_temp), // %[src_temp] - [dst] "+r"(dst), // %[dst] - [width] "+r"(width) // %[width] - : [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride] - [dst_stride] "r"((ptrdiff_t)dst_stride) // %[dst_stride] - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", - "v29", "v30", "v31"); -} - -void TransposeUVWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - const uint8_t* temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. 
starting the counter - // at w-8 allow for this - "sub %w[width], %w[width], #8 \n" - - "1: \n" - "mov %[temp], %[src] \n" - "ld1 {v0.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v1.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v2.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v3.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v4.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v5.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v6.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v7.16b}, [%[temp]] \n" - "add %[src], %[src], #16 \n" - - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" - - "subs %w[width], %w[width], #8 \n" - - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" - - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" - - "mov %[temp], %[dst_a] \n" - "st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n" - "st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n" - "st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n" - "st1 {v19.d}[1], [%[temp]] \n" - "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" - - "mov %[temp], %[dst_b] \n" - "st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n" - "st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n" - "st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n" - "st1 {v23.d}[1], [%[temp]] \n" - "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" - - "b.ge 1b \n" - : [temp] "=&r"(temp), // %[temp] - [src] "+r"(src), // %[src] - [dst_a] "+r"(dst_a), // %[dst_a] - [dst_b] "+r"(dst_b), // %[dst_b] - [width] "+r"(width) // %[width] - : [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride] - [dst_stride_a] "r"((ptrdiff_t)dst_stride_a), // %[dst_stride_a] - [dst_stride_b] "r"((ptrdiff_t)dst_stride_b) // %[dst_stride_b] - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); -} - -// Transpose 32 bit values (ARGB) -void Transpose4x4_32_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - const uint8_t* src1 = src + src_stride; - const uint8_t* src2 = src1 + src_stride; - const uint8_t* src3 = src2 + src_stride; - uint8_t* dst1 = dst + dst_stride; - uint8_t* dst2 = dst1 + dst_stride; - uint8_t* dst3 = dst2 + dst_stride; - asm volatile ( - // Main loop transpose 4x4. Read a column, write a row. 
- "1: \n" - "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" - "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n" - "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n" - "ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n" - "subs %w8, %w8, #4 \n" // w -= 4 - "st1 {v0.4s}, [%4], 16 \n" - "st1 {v1.4s}, [%5], 16 \n" - "st1 {v2.4s}, [%6], 16 \n" - "st1 {v3.4s}, [%7], 16 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(dst), // %4 - "+r"(dst1), // %5 - "+r"(dst2), // %6 - "+r"(dst3), // %7 - "+r"(width) // %8 - : "r"((ptrdiff_t)(src_stride * 4)) // %9 - : "memory", "cc", "v0", "v1", "v2", "v3"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/rotate_sme.c b/drivers/media/pci/tbscapture2/rotate_sme.c deleted file mode 100644 index d2323146c52c..000000000000 --- a/drivers/media/pci/tbscapture2/rotate_sme.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright 2024 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate_row.h" -#include "row.h" - -#include "basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \ - defined(__aarch64__) - -__arm_locally_streaming __arm_new("za") void TransposeWxH_SME( - const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - int vl; - asm("cntb %x0" : "=r"(vl)); - - do { - const uint8_t* src2 = src; - uint8_t* dst2 = dst; - - // Process up to VL elements per iteration of the inner loop. - int block_height = height > vl ? vl : height; - - int width2 = width; - do { - const uint8_t* src3 = src2; - - // Process up to VL elements per iteration of the inner loop. - int block_width = width2 > vl ? vl : width2; - - asm volatile( - "mov w12, #0 \n" - - // Create a predicate to handle loading partial rows. - "whilelt p0.b, wzr, %w[block_width] \n" - - // Load H <= VL rows into ZA0. - "1: \n" - "ld1b {za0h.b[w12, 0]}, p0/z, [%[src3]] \n" - "add %[src3], %[src3], %[src_stride] \n" - "add w12, w12, #1 \n" - "cmp w12, %w[block_height] \n" - "b.ne 1b \n" - - // Create a predicate to handle storing partial columns. - "whilelt p0.b, wzr, %w[block_height] \n" - "mov w12, #0 \n" - - // Store W <= VL columns from ZA0. 
- "2: \n" - "st1b {za0v.b[w12, 0]}, p0, [%[dst2]] \n" - "add %[dst2], %[dst2], %[dst_stride] \n" - "add w12, w12, #1 \n" - "cmp w12, %w[block_width] \n" - "b.ne 2b \n" - : [src3] "+r"(src3), // %[src3] - [dst2] "+r"(dst2) // %[dst2] - : [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride] - [dst_stride] "r"((ptrdiff_t)dst_stride), // %[dst_stride] - [block_width] "r"(block_width), // %[block_width] - [block_height] "r"(block_height) // %[block_height] - : "cc", "memory", "p0", "w12", "za"); - - src2 += vl; - width2 -= vl; - } while (width2 > 0); - - src += vl * src_stride; - dst += vl; - height -= vl; - } while (height > 0); -} - -__arm_locally_streaming __arm_new("za") void TransposeUVWxH_SME( - const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int vl; - asm("cnth %x0" : "=r"(vl)); - - do { - const uint8_t* src2 = src; - uint8_t* dst2_a = dst_a; - uint8_t* dst2_b = dst_b; - - // Process up to VL bytes per iteration of the inner loop. - int block_height = height > vl * 2 ? vl * 2 : height; - - int width2 = width; - do { - const uint8_t* src3 = src2; - - // Process up to VL 16-bit elements per iteration of the inner loop. - int block_width = width2 > vl ? vl : width2; - - asm volatile( - "mov w12, #0 \n" - - // Create a predicate to handle loading partial rows, - // %[block_width] is always a multiple of two here. - "whilelt p0.b, wzr, %w[block_width] \n" - - // Load H <= VL rows into ZA0, such that U/V components exist in - // alternating columns. - "1: \n" - "ld1b {za0h.b[w12, 0]}, p0/z, [%[src]] \n" - "add %[src], %[src], %[src_stride] \n" - "add w12, w12, #1 \n" - "cmp w12, %w[block_height] \n" - "b.ne 1b \n" - - // Create a predicate to handle storing partial columns. - "whilelt p0.b, wzr, %w[block_height] \n" - "mov w12, #0 \n" - - // Store alternating UV data from pairs of ZA0 columns. - "2: \n" - "st1b {za0v.b[w12, 0]}, p0, [%[dst_a]] \n" - "st1b {za0v.b[w12, 1]}, p0, [%[dst_b]] \n" - "add %[dst_a], %[dst_a], %[dst_stride_a] \n" - "add %[dst_b], %[dst_b], %[dst_stride_b] \n" - "add w12, w12, #2 \n" - "cmp w12, %w[block_width] \n" - "b.ne 2b \n" - : [src] "+r"(src3), // %[src] - [dst_a] "+r"(dst2_a), // %[dst_a] - [dst_b] "+r"(dst2_b) // %[dst_b] - : [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride] - [dst_stride_a] "r"((ptrdiff_t)dst_stride_a), // %[dst_stride_a] - [dst_stride_b] "r"((ptrdiff_t)dst_stride_b), // %[dst_stride_b] - [block_width] "r"(block_width * 2), // %[block_width] - [block_height] "r"(block_height) // %[block_height] - : "cc", "memory", "p0", "w12", "za"); - - src2 += 2 * vl; - width2 -= vl; - } while (width2 > 0); - - src += 2 * vl * src_stride; - dst_a += 2 * vl; - dst_b += 2 * vl; - height -= 2 * vl; - } while (height > 0); -} - -#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && - // defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/rotate_win.c b/drivers/media/pci/tbscapture2/rotate_win.c deleted file mode 100644 index 667eba922e88..000000000000 --- a/drivers/media/pci/tbscapture2/rotate_win.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate_row.h" -#include "row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - __asm { - push edi - push esi - push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride - mov edx, [esp + 12 + 12] // dst - mov esi, [esp + 12 + 16] // dst_stride - mov ecx, [esp + 12 + 20] // width - - // Read in the data from the source pointer. - // First round of bit swap. - align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea ebp, [eax + 8] - movq xmm1, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm0, xmm1 - movq xmm2, qword ptr [eax] - movdqa xmm1, xmm0 - palignr xmm1, xmm1, 8 - movq xmm3, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm2, xmm3 - movdqa xmm3, xmm2 - movq xmm4, qword ptr [eax] - palignr xmm3, xmm3, 8 - movq xmm5, qword ptr [eax + edi] - punpcklbw xmm4, xmm5 - lea eax, [eax + 2 * edi] - movdqa xmm5, xmm4 - movq xmm6, qword ptr [eax] - palignr xmm5, xmm5, 8 - movq xmm7, qword ptr [eax + edi] - punpcklbw xmm6, xmm7 - mov eax, ebp - movdqa xmm7, xmm6 - palignr xmm7, xmm7, 8 - // Second round of bit swap. - punpcklwd xmm0, xmm2 - punpcklwd xmm1, xmm3 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - palignr xmm2, xmm2, 8 - palignr xmm3, xmm3, 8 - punpcklwd xmm4, xmm6 - punpcklwd xmm5, xmm7 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - palignr xmm6, xmm6, 8 - palignr xmm7, xmm7, 8 - // Third round of bit swap. - // Write to the destination pointer. - punpckldq xmm0, xmm4 - movq qword ptr [edx], xmm0 - movdqa xmm4, xmm0 - palignr xmm4, xmm4, 8 - movq qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - punpckldq xmm2, xmm6 - movdqa xmm6, xmm2 - palignr xmm6, xmm6, 8 - movq qword ptr [edx], xmm2 - punpckldq xmm1, xmm5 - movq qword ptr [edx + esi], xmm6 - lea edx, [edx + 2 * esi] - movdqa xmm5, xmm1 - movq qword ptr [edx], xmm1 - palignr xmm5, xmm5, 8 - punpckldq xmm3, xmm7 - movq qword ptr [edx + esi], xmm5 - lea edx, [edx + 2 * esi] - movq qword ptr [edx], xmm3 - movdqa xmm7, xmm3 - palignr xmm7, xmm7, 8 - sub ecx, 8 - movq qword ptr [edx + esi], xmm7 - lea edx, [edx + 2 * esi] - jg convertloop - - pop ebp - pop esi - pop edi - ret - } -} - -__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int w) { - __asm { - push ebx - push esi - push edi - push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride - mov edx, [esp + 16 + 12] // dst_a - mov esi, [esp + 16 + 16] // dst_stride_a - mov ebx, [esp + 16 + 20] // dst_b - mov ebp, [esp + 16 + 24] // dst_stride_b - mov ecx, esp - sub esp, 4 + 16 - and esp, ~15 - mov [esp + 16], ecx - mov ecx, [ecx + 16 + 28] // w - - align 4 - // Read in the data from the source pointer. - // First round of bit swap. - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm0 // use xmm7 as temp register. 
- punpcklbw xmm0, xmm1 - punpckhbw xmm7, xmm1 - movdqa xmm1, xmm7 - movdqu xmm2, [eax] - movdqu xmm3, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm2 - punpcklbw xmm2, xmm3 - punpckhbw xmm7, xmm3 - movdqa xmm3, xmm7 - movdqu xmm4, [eax] - movdqu xmm5, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm4 - punpcklbw xmm4, xmm5 - punpckhbw xmm7, xmm5 - movdqa xmm5, xmm7 - movdqu xmm6, [eax] - movdqu xmm7, [eax + edi] - lea eax, [eax + 2 * edi] - movdqu [esp], xmm5 // backup xmm5 - neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. - punpcklbw xmm6, xmm7 - punpckhbw xmm5, xmm7 - movdqa xmm7, xmm5 - lea eax, [eax + 8 * edi + 16] - neg edi - // Second round of bit swap. - movdqa xmm5, xmm0 - punpcklwd xmm0, xmm2 - punpckhwd xmm5, xmm2 - movdqa xmm2, xmm5 - movdqa xmm5, xmm1 - punpcklwd xmm1, xmm3 - punpckhwd xmm5, xmm3 - movdqa xmm3, xmm5 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm6 - punpckhwd xmm5, xmm6 - movdqa xmm6, xmm5 - movdqu xmm5, [esp] // restore xmm5 - movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. - punpcklwd xmm5, xmm7 - punpckhwd xmm6, xmm7 - movdqa xmm7, xmm6 - - // Third round of bit swap. - // Write to the destination pointer. - movdqa xmm6, xmm0 - punpckldq xmm0, xmm4 - punpckhdq xmm6, xmm4 - movdqa xmm4, xmm6 - movdqu xmm6, [esp] // restore xmm6 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [ebx], xmm0 - movlpd qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm4 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. - punpckldq xmm2, xmm6 - movlpd qword ptr [edx], xmm2 - movhpd qword ptr [ebx], xmm2 - punpckhdq xmm0, xmm6 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. - punpckldq xmm1, xmm5 - movlpd qword ptr [edx], xmm1 - movhpd qword ptr [ebx], xmm1 - punpckhdq xmm0, xmm5 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. - punpckldq xmm3, xmm7 - movlpd qword ptr [edx], xmm3 - movhpd qword ptr [ebx], xmm3 - punpckhdq xmm0, xmm7 - sub ecx, 8 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - jg convertloop - - mov esp, [esp + 16] - pop ebp - pop edi - pop esi - pop ebx - ret - } -} - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/row_lasx.c b/drivers/media/pci/tbscapture2/row_lasx.c deleted file mode 100644 index e7e7dac0ba06..000000000000 --- a/drivers/media/pci/tbscapture2/row_lasx.c +++ /dev/null @@ -1,2304 +0,0 @@ -/* - * Copyright 2022 The LibYuv Project Authors. All rights reserved. - * - * Copyright (c) 2022 Loongson Technology Corporation Limited - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "row.h" - -#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) -#include "loongson_intrinsics.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define ALPHA_VAL (-1) - -// Fill YUV -> RGB conversion constants into vectors -#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \ - { \ - ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \ - vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \ - ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \ - vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \ - yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \ - yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ - } - -// Load 32 YUV422 pixel data -#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \ - { \ - __m256i temp0, temp1; \ - \ - DUP2_ARG2(__lasx_xvld, psrc_y, 0, psrc_u, 0, out_y, temp0); \ - temp1 = __lasx_xvld(psrc_v, 0); \ - temp0 = __lasx_xvsub_b(temp0, const_0x80); \ - temp1 = __lasx_xvsub_b(temp1, const_0x80); \ - temp0 = __lasx_vext2xv_h_b(temp0); \ - temp1 = __lasx_vext2xv_h_b(temp1); \ - uv_l = __lasx_xvilvl_h(temp0, temp1); \ - uv_h = __lasx_xvilvh_h(temp0, temp1); \ - } - -// Load 16 YUV422 pixel data -#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \ - { \ - __m256i temp0, temp1; \ - \ - out_y = __lasx_xvld(psrc_y, 0); \ - temp0 = __lasx_xvldrepl_d(psrc_u, 0); \ - temp1 = __lasx_xvldrepl_d(psrc_v, 0); \ - uv = __lasx_xvilvl_b(temp0, temp1); \ - uv = __lasx_xvsub_b(uv, const_0x80); \ - uv = __lasx_vext2xv_h_b(uv); \ - } - -// Convert 16 pixels of YUV420 to RGB. -#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \ - g_h, r_l, r_h) \ - { \ - __m256i u_l, u_h, v_l, v_h; \ - __m256i yl_ev, yl_od, yh_ev, yh_od; \ - __m256i temp0, temp1, temp2, temp3; \ - \ - temp0 = __lasx_xvilvl_b(in_y, in_y); \ - temp1 = __lasx_xvilvh_b(in_y, in_y); \ - yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \ - yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \ - yh_ev = __lasx_xvmulwev_w_hu_h(temp1, yg); \ - yh_od = __lasx_xvmulwod_w_hu_h(temp1, yg); \ - DUP4_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \ - yl_ev, yl_od, yh_ev, yh_od); \ - yl_ev = __lasx_xvadd_w(yl_ev, yb); \ - yl_od = __lasx_xvadd_w(yl_od, yb); \ - yh_ev = __lasx_xvadd_w(yh_ev, yb); \ - yh_od = __lasx_xvadd_w(yh_od, yb); \ - v_l = __lasx_xvmulwev_w_h(in_uvl, ubvr); \ - u_l = __lasx_xvmulwod_w_h(in_uvl, ubvr); \ - v_h = __lasx_xvmulwev_w_h(in_uvh, ubvr); \ - u_h = __lasx_xvmulwod_w_h(in_uvh, ubvr); \ - temp0 = __lasx_xvadd_w(yl_ev, u_l); \ - temp1 = __lasx_xvadd_w(yl_od, u_l); \ - temp2 = __lasx_xvadd_w(yh_ev, u_h); \ - temp3 = __lasx_xvadd_w(yh_od, u_h); \ - DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ - temp1, temp2, temp3); \ - DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ - temp2, temp3); \ - b_l = __lasx_xvpackev_h(temp1, temp0); \ - b_h = __lasx_xvpackev_h(temp3, temp2); \ - temp0 = __lasx_xvadd_w(yl_ev, v_l); \ - temp1 = __lasx_xvadd_w(yl_od, v_l); \ - temp2 = __lasx_xvadd_w(yh_ev, v_h); \ - temp3 = __lasx_xvadd_w(yh_od, v_h); \ - DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ - temp1, temp2, temp3); \ - DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ - temp2, temp3); \ - r_l = __lasx_xvpackev_h(temp1, temp0); \ - r_h = __lasx_xvpackev_h(temp3, temp2); \ - DUP2_ARG2(__lasx_xvdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \ - temp0 = __lasx_xvsub_w(yl_ev, u_l); \ - temp1 = __lasx_xvsub_w(yl_od, u_l); \ - temp2 = 
__lasx_xvsub_w(yh_ev, u_h); \ - temp3 = __lasx_xvsub_w(yh_od, u_h); \ - DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ - temp1, temp2, temp3); \ - DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ - temp2, temp3); \ - g_l = __lasx_xvpackev_h(temp1, temp0); \ - g_h = __lasx_xvpackev_h(temp3, temp2); \ - } - -// Convert 8 pixels of YUV420 to RGB. -#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \ - { \ - __m256i u_l, v_l, yl_ev, yl_od; \ - __m256i temp0, temp1; \ - \ - in_y = __lasx_xvpermi_d(in_y, 0xD8); \ - temp0 = __lasx_xvilvl_b(in_y, in_y); \ - yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \ - yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \ - DUP2_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yl_ev, yl_od); \ - yl_ev = __lasx_xvadd_w(yl_ev, yb); \ - yl_od = __lasx_xvadd_w(yl_od, yb); \ - v_l = __lasx_xvmulwev_w_h(in_uv, ubvr); \ - u_l = __lasx_xvmulwod_w_h(in_uv, ubvr); \ - temp0 = __lasx_xvadd_w(yl_ev, u_l); \ - temp1 = __lasx_xvadd_w(yl_od, u_l); \ - DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \ - DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \ - out_b = __lasx_xvpackev_h(temp1, temp0); \ - temp0 = __lasx_xvadd_w(yl_ev, v_l); \ - temp1 = __lasx_xvadd_w(yl_od, v_l); \ - DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \ - DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \ - out_r = __lasx_xvpackev_h(temp1, temp0); \ - u_l = __lasx_xvdp2_w_h(in_uv, ugvg); \ - temp0 = __lasx_xvsub_w(yl_ev, u_l); \ - temp1 = __lasx_xvsub_w(yl_od, u_l); \ - DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \ - DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \ - out_g = __lasx_xvpackev_h(temp1, temp0); \ - } - -// Pack and Store 16 ARGB values. -#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \ - { \ - __m256i temp0, temp1, temp2, temp3; \ - \ - temp0 = __lasx_xvpackev_b(g_l, b_l); \ - temp1 = __lasx_xvpackev_b(a_l, r_l); \ - temp2 = __lasx_xvpackev_b(g_h, b_h); \ - temp3 = __lasx_xvpackev_b(a_h, r_h); \ - r_l = __lasx_xvilvl_h(temp1, temp0); \ - r_h = __lasx_xvilvh_h(temp1, temp0); \ - g_l = __lasx_xvilvl_h(temp3, temp2); \ - g_h = __lasx_xvilvh_h(temp3, temp2); \ - temp0 = __lasx_xvpermi_q(r_h, r_l, 0x20); \ - temp1 = __lasx_xvpermi_q(g_h, g_l, 0x20); \ - temp2 = __lasx_xvpermi_q(r_h, r_l, 0x31); \ - temp3 = __lasx_xvpermi_q(g_h, g_l, 0x31); \ - __lasx_xvst(temp0, pdst_argb, 0); \ - __lasx_xvst(temp1, pdst_argb, 32); \ - __lasx_xvst(temp2, pdst_argb, 64); \ - __lasx_xvst(temp3, pdst_argb, 96); \ - pdst_argb += 128; \ - } - -// Pack and Store 8 ARGB values. 
-#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ - { \ - __m256i temp0, temp1, temp2, temp3; \ - \ - temp0 = __lasx_xvpackev_b(in_g, in_b); \ - temp1 = __lasx_xvpackev_b(in_a, in_r); \ - temp2 = __lasx_xvilvl_h(temp1, temp0); \ - temp3 = __lasx_xvilvh_h(temp1, temp0); \ - temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20); \ - temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31); \ - __lasx_xvst(temp0, pdst_argb, 0); \ - __lasx_xvst(temp1, pdst_argb, 32); \ - pdst_argb += 64; \ - } - -#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \ - { \ - __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ - _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \ - _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \ - _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \ - _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \ - _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \ - _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \ - _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \ - _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \ - _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \ - _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \ - _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \ - _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \ - _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \ - _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \ - _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \ - } - -void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) { - int x; - int len = width / 64; - __m256i src0, src1; - __m256i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607, - 0x08090A0B0C0D0E0F, 0x0001020304050607}; - src += width - 64; - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1); - DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, - src1); - src0 = __lasx_xvpermi_q(src0, src0, 0x01); - src1 = __lasx_xvpermi_q(src1, src1, 0x01); - __lasx_xvst(src1, dst, 0); - __lasx_xvst(src0, dst, 32); - dst += 64; - src -= 64; - } -} - -void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - int x; - int len = width / 16; - __m256i src, dst; - __m256i shuffler = {0x0004000500060007, 0x0000000100020003, - 0x0004000500060007, 0x0000000100020003}; - - src_uv += (width - 16) << 1; - for (x = 0; x < len; x++) { - src = __lasx_xvld(src_uv, 0); - dst = __lasx_xvshuf_h(shuffler, src, src); - dst = __lasx_xvpermi_q(dst, dst, 0x01); - __lasx_xvst(dst, dst_uv, 0); - src_uv -= 32; - dst_uv += 32; - } -} - -void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) { - int x; - int len = width / 16; - __m256i src0, src1; - __m256i dst0, dst1; - __m256i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504, - 0x0B0A09080F0E0D0C, 0x0302010007060504}; - src += (width * 4) - 64; - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1); - DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, - src1); - dst1 = __lasx_xvpermi_q(src0, src0, 0x01); - dst0 = __lasx_xvpermi_q(src1, src1, 0x01); - __lasx_xvst(dst0, dst, 0); - __lasx_xvst(dst1, dst, 32); - dst += 64; - src -= 64; - } -} - -void I422ToYUY2Row_LASX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - int x; - int len = width / 32; - __m256i src_u0, src_v0, src_y0, vec_uv0; - __m256i vec_yuy2_0, vec_yuy2_1; - __m256i dst_yuy2_0, dst_yuy2_1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0); - src_y0 = __lasx_xvld(src_y, 0); - src_u0 = __lasx_xvpermi_d(src_u0, 0xD8); - src_v0 = 
__lasx_xvpermi_d(src_v0, 0xD8); - vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0); - vec_yuy2_0 = __lasx_xvilvl_b(vec_uv0, src_y0); - vec_yuy2_1 = __lasx_xvilvh_b(vec_uv0, src_y0); - dst_yuy2_0 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x20); - dst_yuy2_1 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x31); - __lasx_xvst(dst_yuy2_0, dst_yuy2, 0); - __lasx_xvst(dst_yuy2_1, dst_yuy2, 32); - src_u += 16; - src_v += 16; - src_y += 32; - dst_yuy2 += 64; - } -} - -void I422ToUYVYRow_LASX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - int x; - int len = width / 32; - __m256i src_u0, src_v0, src_y0, vec_uv0; - __m256i vec_uyvy0, vec_uyvy1; - __m256i dst_uyvy0, dst_uyvy1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0); - src_y0 = __lasx_xvld(src_y, 0); - src_u0 = __lasx_xvpermi_d(src_u0, 0xD8); - src_v0 = __lasx_xvpermi_d(src_v0, 0xD8); - vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0); - vec_uyvy0 = __lasx_xvilvl_b(src_y0, vec_uv0); - vec_uyvy1 = __lasx_xvilvh_b(src_y0, vec_uv0); - dst_uyvy0 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x20); - dst_uyvy1 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x31); - __lasx_xvst(dst_uyvy0, dst_uyvy, 0); - __lasx_xvst(dst_uyvy1, dst_uyvy, 32); - src_u += 16; - src_v += 16; - src_y += 32; - dst_uyvy += 64; - } -} - -void I422ToARGBRow_LASX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 32; - __m256i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; - __m256i vec_ubvr, vec_ugvg; - __m256i alpha = __lasx_xvldi(0xFF); - __m256i const_0x80 = __lasx_xvldi(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); - vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); - src_y += 32; - src_u += 16; - src_v += 16; - } -} - -void I422ToRGBARow_LASX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 32; - __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m256i vec_ubvr, vec_ugvg; - __m256i alpha = __lasx_xvldi(0xFF); - __m256i const_0x80 = __lasx_xvldi(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); - vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb); - src_y += 32; - src_u += 16; - src_v += 16; - } -} - -void I422AlphaToARGBRow_LASX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 32; - int res = width & 31; - __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m256i vec_ubvr, vec_ugvg; - __m256i zero = __lasx_xvldi(0); 
- __m256i const_0x80 = __lasx_xvldi(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); - vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; - - y = __lasx_xvld(src_a, 0); - a_l = __lasx_xvilvl_b(zero, y); - a_h = __lasx_xvilvh_b(zero, y); - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); - src_y += 32; - src_u += 16; - src_v += 16; - src_a += 32; - } - if (res) { - __m256i y, uv, r, g, b, a; - a = __lasx_xvld(src_a, 0); - a = __lasx_vext2xv_hu_bu(a); - READYUV422(src_y, src_u, src_v, y, uv); - YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r); - STOREARGB(a, r, g, b, dst_argb); - } -} - -void I422ToRGB24Row_LASX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int32_t width) { - int x; - int len = width / 32; - __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m256i vec_ubvr, vec_ugvg; - __m256i const_0x80 = __lasx_xvldi(0x80); - __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614, - 0x0504120302100100, 0x0A18090816070614}; - __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B, - 0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); - vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - __m256i temp0, temp1, temp2, temp3; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - temp0 = __lasx_xvpackev_b(g_l, b_l); - temp1 = __lasx_xvpackev_b(g_h, b_h); - DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, - r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, - temp1); - - b_l = __lasx_xvilvl_d(temp1, temp2); - b_h = __lasx_xvilvh_d(temp3, temp1); - temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20); - temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30); - temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31); - __lasx_xvst(temp1, dst_argb, 0); - __lasx_xvst(temp2, dst_argb, 32); - __lasx_xvst(temp3, dst_argb, 64); - dst_argb += 96; - src_y += 32; - src_u += 16; - src_v += 16; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
-void I422ToRGB565Row_LASX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 32; - __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m256i vec_ubvr, vec_ugvg; - __m256i const_0x80 = __lasx_xvldi(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); - vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - __m256i dst_l, dst_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - b_l = __lasx_xvsrli_h(b_l, 3); - b_h = __lasx_xvsrli_h(b_h, 3); - g_l = __lasx_xvsrli_h(g_l, 2); - g_h = __lasx_xvsrli_h(g_h, 2); - r_l = __lasx_xvsrli_h(r_l, 3); - r_h = __lasx_xvsrli_h(r_h, 3); - r_l = __lasx_xvslli_h(r_l, 11); - r_h = __lasx_xvslli_h(r_h, 11); - g_l = __lasx_xvslli_h(g_l, 5); - g_h = __lasx_xvslli_h(g_h, 5); - r_l = __lasx_xvor_v(r_l, g_l); - r_l = __lasx_xvor_v(r_l, b_l); - r_h = __lasx_xvor_v(r_h, g_h); - r_h = __lasx_xvor_v(r_h, b_h); - dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); - dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); - __lasx_xvst(dst_l, dst_rgb565, 0); - __lasx_xvst(dst_h, dst_rgb565, 32); - dst_rgb565 += 64; - src_y += 32; - src_u += 16; - src_v += 16; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. -void I422ToARGB4444Row_LASX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 32; - __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m256i vec_ubvr, vec_ugvg; - __m256i const_0x80 = __lasx_xvldi(0x80); - __m256i alpha = (__m256i)v4u64{0xF000F000F000F000, 0xF000F000F000F000, - 0xF000F000F000F000, 0xF000F000F000F000}; - __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, - 0x00F000F000F000F0}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); - vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - __m256i dst_l, dst_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - b_l = __lasx_xvsrli_h(b_l, 4); - b_h = __lasx_xvsrli_h(b_h, 4); - r_l = __lasx_xvsrli_h(r_l, 4); - r_h = __lasx_xvsrli_h(r_h, 4); - g_l = __lasx_xvand_v(g_l, mask); - g_h = __lasx_xvand_v(g_h, mask); - r_l = __lasx_xvslli_h(r_l, 8); - r_h = __lasx_xvslli_h(r_h, 8); - r_l = __lasx_xvor_v(r_l, alpha); - r_h = __lasx_xvor_v(r_h, alpha); - r_l = __lasx_xvor_v(r_l, g_l); - r_h = __lasx_xvor_v(r_h, g_h); - r_l = __lasx_xvor_v(r_l, b_l); - r_h = __lasx_xvor_v(r_h, b_h); - dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); - dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); - __lasx_xvst(dst_l, dst_argb4444, 0); - __lasx_xvst(dst_h, dst_argb4444, 32); - dst_argb4444 += 64; - src_y += 32; - src_u += 16; - src_v += 16; - } -} - -void I422ToARGB1555Row_LASX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 32; - __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m256i 
vec_ubvr, vec_ugvg; - __m256i const_0x80 = __lasx_xvldi(0x80); - __m256i alpha = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000, - 0x8000800080008000, 0x8000800080008000}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); - vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - __m256i dst_l, dst_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - b_l = __lasx_xvsrli_h(b_l, 3); - b_h = __lasx_xvsrli_h(b_h, 3); - g_l = __lasx_xvsrli_h(g_l, 3); - g_h = __lasx_xvsrli_h(g_h, 3); - g_l = __lasx_xvslli_h(g_l, 5); - g_h = __lasx_xvslli_h(g_h, 5); - r_l = __lasx_xvsrli_h(r_l, 3); - r_h = __lasx_xvsrli_h(r_h, 3); - r_l = __lasx_xvslli_h(r_l, 10); - r_h = __lasx_xvslli_h(r_h, 10); - r_l = __lasx_xvor_v(r_l, alpha); - r_h = __lasx_xvor_v(r_h, alpha); - r_l = __lasx_xvor_v(r_l, g_l); - r_h = __lasx_xvor_v(r_h, g_h); - r_l = __lasx_xvor_v(r_l, b_l); - r_h = __lasx_xvor_v(r_h, b_h); - dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); - dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); - __lasx_xvst(dst_l, dst_argb1555, 0); - __lasx_xvst(dst_h, dst_argb1555, 32); - dst_argb1555 += 64; - src_y += 32; - src_u += 16; - src_v += 16; - } -} - -void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1); - dst0 = __lasx_xvpickev_b(src1, src0); - dst0 = __lasx_xvpermi_d(dst0, 0xD8); - __lasx_xvst(dst0, dst_y, 0); - src_yuy2 += 64; - dst_y += 32; - } -} - -void YUY2ToUVRow_LASX(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; - int x; - int len = width / 32; - __m256i src0, src1, src2, src3; - __m256i tmp0, dst0, dst1; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src_yuy2_next, 0, - src_yuy2_next, 32, src0, src1, src2, src3); - src0 = __lasx_xvpickod_b(src1, src0); - src1 = __lasx_xvpickod_b(src3, src2); - tmp0 = __lasx_xvavgr_bu(src1, src0); - tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); - dst0 = __lasx_xvpickev_b(tmp0, tmp0); - dst1 = __lasx_xvpickod_b(tmp0, tmp0); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_u, 8, 2); - __lasx_xvstelm_d(dst1, dst_v, 0, 0); - __lasx_xvstelm_d(dst1, dst_v, 8, 2); - src_yuy2 += 64; - src_yuy2_next += 64; - dst_u += 16; - dst_v += 16; - } -} - -void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 32; - __m256i src0, src1, tmp0, dst0, dst1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1); - tmp0 = __lasx_xvpickod_b(src1, src0); - tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); - dst0 = __lasx_xvpickev_b(tmp0, tmp0); - dst1 = __lasx_xvpickod_b(tmp0, tmp0); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_u, 8, 2); - __lasx_xvstelm_d(dst1, dst_v, 0, 0); - __lasx_xvstelm_d(dst1, dst_v, 8, 2); - src_yuy2 += 64; - dst_u += 16; - dst_v += 16; - } -} - -void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, 
src1); - dst0 = __lasx_xvpickod_b(src1, src0); - dst0 = __lasx_xvpermi_d(dst0, 0xD8); - __lasx_xvst(dst0, dst_y, 0); - src_uyvy += 64; - dst_y += 32; - } -} - -void UYVYToUVRow_LASX(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; - int x; - int len = width / 32; - __m256i src0, src1, src2, src3, tmp0, dst0, dst1; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src_uyvy_next, 0, - src_uyvy_next, 32, src0, src1, src2, src3); - src0 = __lasx_xvpickev_b(src1, src0); - src1 = __lasx_xvpickev_b(src3, src2); - tmp0 = __lasx_xvavgr_bu(src1, src0); - tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); - dst0 = __lasx_xvpickev_b(tmp0, tmp0); - dst1 = __lasx_xvpickod_b(tmp0, tmp0); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_u, 8, 2); - __lasx_xvstelm_d(dst1, dst_v, 0, 0); - __lasx_xvstelm_d(dst1, dst_v, 8, 2); - src_uyvy += 64; - src_uyvy_next += 64; - dst_u += 16; - dst_v += 16; - } -} - -void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 32; - __m256i src0, src1, tmp0, dst0, dst1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); - dst0 = __lasx_xvpickev_b(tmp0, tmp0); - dst1 = __lasx_xvpickod_b(tmp0, tmp0); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_u, 8, 2); - __lasx_xvstelm_d(dst1, dst_v, 0, 0); - __lasx_xvstelm_d(dst1, dst_v, 8, 2); - src_uyvy += 64; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBToUVRow_LASX(const uint8_t* src_argb0, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 32; - const uint8_t* src_argb1 = src_argb0 + src_stride_argb; - - __m256i src0, src1, src2, src3, src4, src5, src6, src7; - __m256i vec0, vec1, vec2, vec3; - __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1; - __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038, - 0x0038003800380038, 0x0038003800380038}; - __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025, - 0x0025002500250025, 0x0025002500250025}; - __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013, - 0x0013001300130013, 0x0013001300130013}; - __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f, - 0x002f002f002f002f, 0x002f002f002f002f}; - __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009, - 0x0009000900090009, 0x0009000900090009}; - __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; - __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64, - src_argb0, 96, src0, src1, src2, src3); - DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64, - src_argb1, 96, src4, src5, src6, src7); - vec0 = __lasx_xvaddwev_h_bu(src0, src4); - vec1 = __lasx_xvaddwev_h_bu(src1, src5); - vec2 = __lasx_xvaddwev_h_bu(src2, src6); - vec3 = __lasx_xvaddwev_h_bu(src3, src7); - tmp0 = __lasx_xvpickev_h(vec1, vec0); - tmp1 = __lasx_xvpickev_h(vec3, vec2); - tmp2 = __lasx_xvpickod_h(vec1, vec0); - tmp3 = __lasx_xvpickod_h(vec3, vec2); - vec0 = __lasx_xvaddwod_h_bu(src0, src4); - vec1 = __lasx_xvaddwod_h_bu(src1, src5); - vec2 = __lasx_xvaddwod_h_bu(src2, src6); - vec3 = 
__lasx_xvaddwod_h_bu(src3, src7); - tmp4 = __lasx_xvpickev_h(vec1, vec0); - tmp5 = __lasx_xvpickev_h(vec3, vec2); - vec0 = __lasx_xvpickev_h(tmp1, tmp0); - vec1 = __lasx_xvpickod_h(tmp1, tmp0); - src0 = __lasx_xvavgr_h(vec0, vec1); - vec0 = __lasx_xvpickev_h(tmp3, tmp2); - vec1 = __lasx_xvpickod_h(tmp3, tmp2); - src1 = __lasx_xvavgr_h(vec0, vec1); - vec0 = __lasx_xvpickev_h(tmp5, tmp4); - vec1 = __lasx_xvpickod_h(tmp5, tmp4); - src2 = __lasx_xvavgr_h(vec0, vec1); - dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70); - dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A); - dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26); - dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70); - dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E); - dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12); - dst0 = __lasx_xvperm_w(dst0, control); - dst1 = __lasx_xvperm_w(dst1, control); - dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8); - dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_u, 8, 2); - __lasx_xvstelm_d(dst1, dst_v, 0, 0); - __lasx_xvstelm_d(dst1, dst_v, 8, 2); - src_argb0 += 128; - src_argb1 += 128; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - int len = (width / 32) - 1; - __m256i src0, src1, src2, src3; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A, 0x0908060504020100, - 0x000000000E0D0C0A}; - __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005, - 0x0000000700000003}; - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, - 96, src0, src1, src2, src3); - tmp0 = __lasx_xvshuf_b(src0, src0, shuf); - tmp1 = __lasx_xvshuf_b(src1, src1, shuf); - tmp2 = __lasx_xvshuf_b(src2, src2, shuf); - tmp3 = __lasx_xvshuf_b(src3, src3, shuf); - tmp0 = __lasx_xvperm_w(tmp0, control); - tmp1 = __lasx_xvperm_w(tmp1, control); - tmp2 = __lasx_xvperm_w(tmp2, control); - tmp3 = __lasx_xvperm_w(tmp3, control); - __lasx_xvst(tmp0, dst_rgb, 0); - __lasx_xvst(tmp1, dst_rgb, 24); - __lasx_xvst(tmp2, dst_rgb, 48); - __lasx_xvst(tmp3, dst_rgb, 72); - dst_rgb += 96; - src_argb += 128; - } - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96, - src0, src1, src2, src3); - tmp0 = __lasx_xvshuf_b(src0, src0, shuf); - tmp1 = __lasx_xvshuf_b(src1, src1, shuf); - tmp2 = __lasx_xvshuf_b(src2, src2, shuf); - tmp3 = __lasx_xvshuf_b(src3, src3, shuf); - tmp0 = __lasx_xvperm_w(tmp0, control); - tmp1 = __lasx_xvperm_w(tmp1, control); - tmp2 = __lasx_xvperm_w(tmp2, control); - tmp3 = __lasx_xvperm_w(tmp3, control); - __lasx_xvst(tmp0, dst_rgb, 0); - __lasx_xvst(tmp1, dst_rgb, 24); - __lasx_xvst(tmp2, dst_rgb, 48); - dst_rgb += 72; - __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0); - __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1); - __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2); -} - -void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - int len = (width / 32) - 1; - __m256i src0, src1, src2, src3; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08, 0x090A040506000102, - 0x000000000C0D0E08}; - __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005, - 0x0000000700000003}; - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, - 96, src0, src1, src2, src3); - tmp0 = __lasx_xvshuf_b(src0, src0, shuf); - tmp1 = __lasx_xvshuf_b(src1, src1, shuf); - tmp2 
= __lasx_xvshuf_b(src2, src2, shuf); - tmp3 = __lasx_xvshuf_b(src3, src3, shuf); - tmp0 = __lasx_xvperm_w(tmp0, control); - tmp1 = __lasx_xvperm_w(tmp1, control); - tmp2 = __lasx_xvperm_w(tmp2, control); - tmp3 = __lasx_xvperm_w(tmp3, control); - __lasx_xvst(tmp0, dst_rgb, 0); - __lasx_xvst(tmp1, dst_rgb, 24); - __lasx_xvst(tmp2, dst_rgb, 48); - __lasx_xvst(tmp3, dst_rgb, 72); - dst_rgb += 96; - src_argb += 128; - } - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96, - src0, src1, src2, src3); - tmp0 = __lasx_xvshuf_b(src0, src0, shuf); - tmp1 = __lasx_xvshuf_b(src1, src1, shuf); - tmp2 = __lasx_xvshuf_b(src2, src2, shuf); - tmp3 = __lasx_xvshuf_b(src3, src3, shuf); - tmp0 = __lasx_xvperm_w(tmp0, control); - tmp1 = __lasx_xvperm_w(tmp1, control); - tmp2 = __lasx_xvperm_w(tmp2, control); - tmp3 = __lasx_xvperm_w(tmp3, control); - __lasx_xvst(tmp0, dst_rgb, 0); - __lasx_xvst(tmp1, dst_rgb, 24); - __lasx_xvst(tmp2, dst_rgb, 48); - dst_rgb += 72; - __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0); - __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1); - __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2); -} - -void ARGBToRGB565Row_LASX(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - int len = width / 16; - __m256i zero = __lasx_xvldi(0); - __m256i src0, src1, tmp0, tmp1, dst0; - __m256i shift = {0x0300030003000300, 0x0300030003000300, 0x0300030003000300, - 0x0300030003000300}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmp0 = __lasx_xvsrli_b(tmp0, 3); - tmp1 = __lasx_xvpackev_b(zero, tmp1); - tmp1 = __lasx_xvsrli_h(tmp1, 2); - tmp0 = __lasx_xvsll_b(tmp0, shift); - tmp1 = __lasx_xvslli_h(tmp1, 5); - dst0 = __lasx_xvor_v(tmp0, tmp1); - dst0 = __lasx_xvpermi_d(dst0, 0xD8); - __lasx_xvst(dst0, dst_rgb, 0); - dst_rgb += 32; - src_argb += 64; - } -} - -void ARGBToARGB1555Row_LASX(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - int len = width / 16; - __m256i zero = __lasx_xvldi(0); - __m256i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0; - __m256i shift1 = {0x0703070307030703, 0x0703070307030703, 0x0703070307030703, - 0x0703070307030703}; - __m256i shift2 = {0x0200020002000200, 0x0200020002000200, 0x0200020002000200, - 0x0200020002000200}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmp0 = __lasx_xvsrli_b(tmp0, 3); - tmp1 = __lasx_xvsrl_b(tmp1, shift1); - tmp0 = __lasx_xvsll_b(tmp0, shift2); - tmp2 = __lasx_xvpackev_b(zero, tmp1); - tmp3 = __lasx_xvpackod_b(zero, tmp1); - tmp2 = __lasx_xvslli_h(tmp2, 5); - tmp3 = __lasx_xvslli_h(tmp3, 15); - dst0 = __lasx_xvor_v(tmp0, tmp2); - dst0 = __lasx_xvor_v(dst0, tmp3); - dst0 = __lasx_xvpermi_d(dst0, 0xD8); - __lasx_xvst(dst0, dst_rgb, 0); - dst_rgb += 32; - src_argb += 64; - } -} - -void ARGBToARGB4444Row_LASX(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - int len = width / 16; - __m256i src0, src1, tmp0, tmp1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmp1 = __lasx_xvandi_b(tmp1, 0xF0); - tmp0 = __lasx_xvsrli_b(tmp0, 4); - dst0 = __lasx_xvor_v(tmp1, tmp0); - dst0 = __lasx_xvpermi_d(dst0, 0xD8); - __lasx_xvst(dst0, dst_rgb, 0); - dst_rgb += 32; - src_argb += 64; - } -} - -void 
ARGBToUV444Row_LASX(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int32_t width) { - int x; - int len = width / 32; - __m256i src0, src1, src2, src3; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, reg3, dst0, dst1; - __m256i const_112 = __lasx_xvldi(112); - __m256i const_74 = __lasx_xvldi(74); - __m256i const_38 = __lasx_xvldi(38); - __m256i const_94 = __lasx_xvldi(94); - __m256i const_18 = __lasx_xvldi(18); - __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; - __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, - 96, src0, src1, src2, src3); - tmp0 = __lasx_xvpickev_h(src1, src0); - tmp1 = __lasx_xvpickod_h(src1, src0); - tmp2 = __lasx_xvpickev_h(src3, src2); - tmp3 = __lasx_xvpickod_h(src3, src2); - reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112); - reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112); - reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74); - reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74); - reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38); - reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38); - reg0 = __lasx_xvsub_h(reg0, reg2); - reg1 = __lasx_xvsub_h(reg1, reg3); - dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8); - dst0 = __lasx_xvperm_w(dst0, control); - reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112); - reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112); - reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18); - reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18); - reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94); - reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94); - reg0 = __lasx_xvsub_h(reg0, reg2); - reg1 = __lasx_xvsub_h(reg1, reg3); - dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8); - dst1 = __lasx_xvperm_w(dst1, control); - __lasx_xvst(dst0, dst_u, 0); - __lasx_xvst(dst1, dst_v, 0); - dst_u += 32; - dst_v += 32; - src_argb += 128; - } -} - -void ARGBMultiplyRow_LASX(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 8; - __m256i zero = __lasx_xvldi(0); - __m256i src0, src1, dst0, dst1; - __m256i tmp0, tmp1, tmp2, tmp3; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1); - tmp0 = __lasx_xvilvl_b(src0, src0); - tmp1 = __lasx_xvilvh_b(src0, src0); - tmp2 = __lasx_xvilvl_b(zero, src1); - tmp3 = __lasx_xvilvh_b(zero, src1); - dst0 = __lasx_xvmuh_hu(tmp0, tmp2); - dst1 = __lasx_xvmuh_hu(tmp1, tmp3); - dst0 = __lasx_xvpickev_b(dst1, dst0); - __lasx_xvst(dst0, dst_argb, 0); - src_argb0 += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBAddRow_LASX(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 8; - __m256i src0, src1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1); - dst0 = __lasx_xvsadd_bu(src0, src1); - __lasx_xvst(dst0, dst_argb, 0); - src_argb0 += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBSubtractRow_LASX(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 8; - __m256i src0, src1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1); - dst0 = __lasx_xvssub_bu(src0, src1); - __lasx_xvst(dst0, dst_argb, 0); - src_argb0 += 32; 
- src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBAttenuateRow_LASX(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 16; - __m256i src0, src1, tmp0, tmp1; - __m256i reg0, reg1, reg2, reg3, reg4, reg5; - __m256i b, g, r, a, dst0, dst1; - __m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000, - 0x0007000300060002}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - b = __lasx_xvpackev_b(tmp0, tmp0); - r = __lasx_xvpackod_b(tmp0, tmp0); - g = __lasx_xvpackev_b(tmp1, tmp1); - a = __lasx_xvpackod_b(tmp1, tmp1); - reg0 = __lasx_xvmulwev_w_hu(b, a); - reg1 = __lasx_xvmulwod_w_hu(b, a); - reg2 = __lasx_xvmulwev_w_hu(r, a); - reg3 = __lasx_xvmulwod_w_hu(r, a); - reg4 = __lasx_xvmulwev_w_hu(g, a); - reg5 = __lasx_xvmulwod_w_hu(g, a); - reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24); - reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24); - reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24); - reg0 = __lasx_xvshuf_h(control, reg0, reg0); - reg2 = __lasx_xvshuf_h(control, reg2, reg2); - reg4 = __lasx_xvshuf_h(control, reg4, reg4); - tmp0 = __lasx_xvpackev_b(reg4, reg0); - tmp1 = __lasx_xvpackev_b(a, reg2); - dst0 = __lasx_xvilvl_h(tmp1, tmp0); - dst1 = __lasx_xvilvh_h(tmp1, tmp0); - __lasx_xvst(dst0, dst_argb, 0); - __lasx_xvst(dst1, dst_argb, 32); - dst_argb += 64; - src_argb += 64; - } -} - -void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - int x; - int len = width / 16; - __m256i src0, src1, tmp0, tmp1, dst0; - __m256i b, g, r; - __m256i zero = __lasx_xvldi(0); - __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0); - - vec_dither = __lasx_xvilvl_b(zero, vec_dither); - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - b = __lasx_xvpackev_b(zero, tmp0); - r = __lasx_xvpackod_b(zero, tmp0); - g = __lasx_xvpackev_b(zero, tmp1); - b = __lasx_xvadd_h(b, vec_dither); - g = __lasx_xvadd_h(g, vec_dither); - r = __lasx_xvadd_h(r, vec_dither); - DUP2_ARG1(__lasx_xvclip255_h, b, g, b, g); - r = __lasx_xvclip255_h(r); - b = __lasx_xvsrai_h(b, 3); - g = __lasx_xvsrai_h(g, 2); - r = __lasx_xvsrai_h(r, 3); - g = __lasx_xvslli_h(g, 5); - r = __lasx_xvslli_h(r, 11); - dst0 = __lasx_xvor_v(b, g); - dst0 = __lasx_xvor_v(dst0, r); - dst0 = __lasx_xvpermi_d(dst0, 0xD8); - __lasx_xvst(dst0, dst_rgb, 0); - src_argb += 64; - dst_rgb += 32; - } -} - -void ARGBShuffleRow_LASX(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - int x; - int len = width / 16; - __m256i src0, src1, dst0, dst1; - __m256i shuf = {0x0404040400000000, 0x0C0C0C0C08080808, 0x0404040400000000, - 0x0C0C0C0C08080808}; - __m256i temp = __lasx_xvldrepl_w(shuffler, 0); - - shuf = __lasx_xvadd_b(shuf, temp); - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); - dst0 = __lasx_xvshuf_b(src0, src0, shuf); - dst1 = __lasx_xvshuf_b(src1, src1, shuf); - __lasx_xvst(dst0, dst_argb, 0); - __lasx_xvst(dst1, dst_argb, 32); - src_argb += 64; - dst_argb += 64; - } -} - -void ARGBShadeRow_LASX(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - int x; - int len = width / 8; - __m256i src0, dst0, tmp0, tmp1; - __m256i vec_value = __lasx_xvreplgr2vr_w(value); - - vec_value = __lasx_xvilvl_b(vec_value, 
vec_value); - for (x = 0; x < len; x++) { - src0 = __lasx_xvld(src_argb, 0); - tmp0 = __lasx_xvilvl_b(src0, src0); - tmp1 = __lasx_xvilvh_b(src0, src0); - tmp0 = __lasx_xvmuh_hu(tmp0, vec_value); - tmp1 = __lasx_xvmuh_hu(tmp1, vec_value); - dst0 = __lasx_xvpickod_b(tmp1, tmp0); - __lasx_xvst(dst0, dst_argb, 0); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - int x; - int len = width / 16; - __m256i src0, src1, tmp0, tmp1; - __m256i reg0, reg1, reg2, dst0, dst1; - __m256i const_128 = __lasx_xvldi(0x480); - __m256i const_150 = __lasx_xvldi(0x96); - __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, - 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - reg0 = __lasx_xvdp2_h_bu(tmp0, const_br); - reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150); - reg2 = __lasx_xvadd_h(reg0, reg1); - tmp0 = __lasx_xvpackod_b(reg2, reg2); - tmp1 = __lasx_xvpackod_b(tmp1, reg2); - dst0 = __lasx_xvilvl_h(tmp1, tmp0); - dst1 = __lasx_xvilvh_h(tmp1, tmp0); - __lasx_xvst(dst0, dst_argb, 0); - __lasx_xvst(dst1, dst_argb, 32); - src_argb += 64; - dst_argb += 64; - } -} - -void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) { - int x; - int len = width / 16; - __m256i src0, src1, tmp0, tmp1; - __m256i reg0, reg1, spb, spg, spr; - __m256i dst0, dst1; - __m256i spb_g = __lasx_xvldi(68); - __m256i spg_g = __lasx_xvldi(88); - __m256i spr_g = __lasx_xvldi(98); - __m256i spb_br = {0x2311231123112311, 0x2311231123112311, 0x2311231123112311, - 0x2311231123112311}; - __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16, 0x2D162D162D162D16, - 0x2D162D162D162D16}; - __m256i spr_br = {0x3218321832183218, 0x3218321832183218, 0x3218321832183218, - 0x3218321832183218}; - __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908, 0x1706150413021100, - 0x1F0E1D0C1B0A1908}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg); - spr = __lasx_xvdp2_h_bu(tmp0, spr_br); - spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g); - spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g); - spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g); - spb = __lasx_xvsrli_h(spb, 7); - spg = __lasx_xvsrli_h(spg, 7); - spr = __lasx_xvsrli_h(spr, 7); - spg = __lasx_xvsat_hu(spg, 7); - spr = __lasx_xvsat_hu(spr, 7); - reg0 = __lasx_xvpackev_b(spg, spb); - reg1 = __lasx_xvshuf_b(tmp1, spr, shuff); - dst0 = __lasx_xvilvl_h(reg1, reg0); - dst1 = __lasx_xvilvh_h(reg1, reg0); - __lasx_xvst(dst0, dst_argb, 0); - __lasx_xvst(dst1, dst_argb, 32); - dst_argb += 64; - } -} - -void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 32; - __m256i src0, src1; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, reg3; - __m256i dst0, dst1, dst2, dst3; - - for (x = 0; x < len; x++) { - src0 = __lasx_xvld(src_argb4444, 0); - src1 = __lasx_xvld(src_argb4444, 32); - DUP4_ARG2(__lasx_xvandi_b, src0, 0x0F, src0, 0xF0, src1, 0x0F, src1, 0xF0, - tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lasx_xvslli_b, tmp0, 4, tmp2, 4, reg0, reg2); - DUP2_ARG2(__lasx_xvsrli_b, tmp1, 4, tmp3, 4, reg1, reg3); - DUP4_ARG2(__lasx_xvor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, - tmp0, tmp1, 
tmp2, tmp3); - DUP2_ARG2(__lasx_xvilvl_b, tmp1, tmp0, tmp3, tmp2, reg0, reg2); - DUP2_ARG2(__lasx_xvilvh_b, tmp1, tmp0, tmp3, tmp2, reg1, reg3); - DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg1, reg0, 0x31, reg3, reg2, - 0x20, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); - __lasx_xvst(dst0, dst_argb, 0); - __lasx_xvst(dst1, dst_argb, 32); - __lasx_xvst(dst2, dst_argb, 64); - __lasx_xvst(dst3, dst_argb, 96); - src_argb4444 += 64; - dst_argb += 128; - } -} - -void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 32; - __m256i src0, src1; - __m256i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa; - __m256i reg0, reg1, reg2, reg3; - __m256i dst0, dst1, dst2, dst3; - - for (x = 0; x < len; x++) { - src0 = __lasx_xvld(src_argb1555, 0); - src1 = __lasx_xvld(src_argb1555, 32); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmpb = __lasx_xvandi_b(tmp0, 0x1F); - tmpg = __lasx_xvsrli_b(tmp0, 5); - reg0 = __lasx_xvandi_b(tmp1, 0x03); - reg0 = __lasx_xvslli_b(reg0, 3); - tmpg = __lasx_xvor_v(tmpg, reg0); - reg1 = __lasx_xvandi_b(tmp1, 0x7C); - tmpr = __lasx_xvsrli_b(reg1, 2); - tmpa = __lasx_xvsrli_b(tmp1, 7); - tmpa = __lasx_xvneg_b(tmpa); - reg0 = __lasx_xvslli_b(tmpb, 3); - reg1 = __lasx_xvslli_b(tmpg, 3); - reg2 = __lasx_xvslli_b(tmpr, 3); - tmpb = __lasx_xvsrli_b(tmpb, 2); - tmpg = __lasx_xvsrli_b(tmpg, 2); - tmpr = __lasx_xvsrli_b(tmpr, 2); - tmpb = __lasx_xvor_v(reg0, tmpb); - tmpg = __lasx_xvor_v(reg1, tmpg); - tmpr = __lasx_xvor_v(reg2, tmpr); - DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); - DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, tmpa, tmpr, reg2, reg3); - dst0 = __lasx_xvilvl_h(reg1, reg0); - dst1 = __lasx_xvilvh_h(reg1, reg0); - dst2 = __lasx_xvilvl_h(reg3, reg2); - dst3 = __lasx_xvilvh_h(reg3, reg2); - DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2, - 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3); - __lasx_xvst(reg0, dst_argb, 0); - __lasx_xvst(reg1, dst_argb, 32); - __lasx_xvst(reg2, dst_argb, 64); - __lasx_xvst(reg3, dst_argb, 96); - src_argb1555 += 64; - dst_argb += 128; - } -} - -void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 32; - __m256i src0, src1; - __m256i tmp0, tmp1, tmpb, tmpg, tmpr; - __m256i reg0, reg1, reg2, reg3, dst0, dst1, dst2, dst3; - __m256i alpha = __lasx_xvldi(0xFF); - - for (x = 0; x < len; x++) { - src0 = __lasx_xvld(src_rgb565, 0); - src1 = __lasx_xvld(src_rgb565, 32); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmpb = __lasx_xvandi_b(tmp0, 0x1F); - tmpr = __lasx_xvandi_b(tmp1, 0xF8); - reg1 = __lasx_xvandi_b(tmp1, 0x07); - reg0 = __lasx_xvsrli_b(tmp0, 5); - reg1 = __lasx_xvslli_b(reg1, 3); - tmpg = __lasx_xvor_v(reg1, reg0); - reg0 = __lasx_xvslli_b(tmpb, 3); - reg1 = __lasx_xvsrli_b(tmpb, 2); - tmpb = __lasx_xvor_v(reg1, reg0); - reg0 = __lasx_xvslli_b(tmpg, 2); - reg1 = __lasx_xvsrli_b(tmpg, 4); - tmpg = __lasx_xvor_v(reg1, reg0); - reg0 = __lasx_xvsrli_b(tmpr, 5); - tmpr = __lasx_xvor_v(tmpr, reg0); - DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); - dst0 = __lasx_xvilvl_h(reg1, reg0); - dst1 = __lasx_xvilvh_h(reg1, reg0); - DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); - dst2 = __lasx_xvilvl_h(reg1, reg0); - dst3 = __lasx_xvilvh_h(reg1, reg0); - DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2, - 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3); - 
__lasx_xvst(reg0, dst_argb, 0); - __lasx_xvst(reg1, dst_argb, 32); - __lasx_xvst(reg2, dst_argb, 64); - __lasx_xvst(reg3, dst_argb, 96); - src_rgb565 += 64; - dst_argb += 128; - } -} - -void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2; - __m256i dst0, dst1, dst2, dst3; - __m256i reg0, reg1, reg2, reg3; - __m256i alpha = __lasx_xvldi(0xFF); - __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C, - 0x1B1A191817161514}; - __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918, - 0x0706050403020100}; - __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504, - 0x131211100F0E0D0C}; - __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100, - 0x100B0A0910080706}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_rgb24, 0); - reg1 = __lasx_xvld(src_rgb24, 32); - reg2 = __lasx_xvld(src_rgb24, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, - tmp1); - tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); - DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, - tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); - DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0, - 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); - __lasx_xvst(dst0, dst_argb, 0); - __lasx_xvst(dst1, dst_argb, 32); - __lasx_xvst(dst2, dst_argb, 64); - __lasx_xvst(dst3, dst_argb, 96); - src_rgb24 += 96; - dst_argb += 128; - } -} - -void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3; - __m256i dst0, dst1, dst2, dst3; - __m256i alpha = __lasx_xvldi(0xFF); - __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C, - 0x1B1A191817161514}; - __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918, - 0x0706050403020100}; - __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504, - 0x131211100F0E0D0C}; - __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102, - 0x10090A0B10060708}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_raw, 0); - reg1 = __lasx_xvld(src_raw, 32); - reg2 = __lasx_xvld(src_raw, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, - tmp1); - tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); - DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, - tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); - DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0, - 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); - __lasx_xvst(dst0, dst_argb, 0); - __lasx_xvst(dst1, dst_argb, 32); - __lasx_xvst(dst2, dst_argb, 64); - __lasx_xvst(dst3, dst_argb, 96); - src_raw += 96; - dst_argb += 128; - } -} - -void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - int x; - int len = width / 32; - __m256i src0, src1; - __m256i tmp0, tmp1, tmpb, tmpg, tmpr; - __m256i reg0, reg1, reg2, dst0; - __m256i const_66 = __lasx_xvldi(66); - __m256i const_129 = __lasx_xvldi(129); - 
__m256i const_25 = __lasx_xvldi(25); - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - src0 = __lasx_xvld(src_argb1555, 0); - src1 = __lasx_xvld(src_argb1555, 32); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmpb = __lasx_xvandi_b(tmp0, 0x1F); - tmpg = __lasx_xvsrli_b(tmp0, 5); - reg0 = __lasx_xvandi_b(tmp1, 0x03); - reg0 = __lasx_xvslli_b(reg0, 3); - tmpg = __lasx_xvor_v(tmpg, reg0); - reg1 = __lasx_xvandi_b(tmp1, 0x7C); - tmpr = __lasx_xvsrli_b(reg1, 2); - reg0 = __lasx_xvslli_b(tmpb, 3); - reg1 = __lasx_xvslli_b(tmpg, 3); - reg2 = __lasx_xvslli_b(tmpr, 3); - tmpb = __lasx_xvsrli_b(tmpb, 2); - tmpg = __lasx_xvsrli_b(tmpg, 2); - tmpr = __lasx_xvsrli_b(tmpr, 2); - tmpb = __lasx_xvor_v(reg0, tmpb); - tmpg = __lasx_xvor_v(reg1, tmpg); - tmpr = __lasx_xvor_v(reg2, tmpr); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25); - reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25); - reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129); - reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129); - reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66); - reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66); - dst0 = __lasx_xvpackod_b(reg1, reg0); - dst0 = __lasx_xvpermi_d(dst0, 0xD8); - __lasx_xvst(dst0, dst_y, 0); - src_argb1555 += 64; - dst_y += 32; - } -} - -void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 32; - const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; - __m256i src0, src1, src2, src3; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m256i reg0, reg1, reg2, reg3, dst0; - __m256i const_112 = __lasx_xvldi(0x438); - __m256i const_74 = __lasx_xvldi(0x425); - __m256i const_38 = __lasx_xvldi(0x413); - __m256i const_94 = __lasx_xvldi(0x42F); - __m256i const_18 = __lasx_xvldi(0x409); - __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0, - next_argb1555, 32, src0, src1, src2, src3); - DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2); - DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3); - tmpb = __lasx_xvandi_b(tmp0, 0x1F); - nexb = __lasx_xvandi_b(tmp2, 0x1F); - tmpg = __lasx_xvsrli_b(tmp0, 5); - nexg = __lasx_xvsrli_b(tmp2, 5); - reg0 = __lasx_xvandi_b(tmp1, 0x03); - reg2 = __lasx_xvandi_b(tmp3, 0x03); - reg0 = __lasx_xvslli_b(reg0, 3); - reg2 = __lasx_xvslli_b(reg2, 3); - tmpg = __lasx_xvor_v(tmpg, reg0); - nexg = __lasx_xvor_v(nexg, reg2); - reg1 = __lasx_xvandi_b(tmp1, 0x7C); - reg3 = __lasx_xvandi_b(tmp3, 0x7C); - tmpr = __lasx_xvsrli_b(reg1, 2); - nexr = __lasx_xvsrli_b(reg3, 2); - reg0 = __lasx_xvslli_b(tmpb, 3); - reg1 = __lasx_xvslli_b(tmpg, 3); - reg2 = __lasx_xvslli_b(tmpr, 3); - tmpb = __lasx_xvsrli_b(tmpb, 2); - tmpg = __lasx_xvsrli_b(tmpg, 2); - tmpr = __lasx_xvsrli_b(tmpr, 2); - tmpb = __lasx_xvor_v(reg0, tmpb); - tmpg = __lasx_xvor_v(reg1, tmpg); - tmpr = __lasx_xvor_v(reg2, tmpr); - reg0 = __lasx_xvslli_b(nexb, 3); - reg1 = __lasx_xvslli_b(nexg, 3); - reg2 = __lasx_xvslli_b(nexr, 3); - nexb = __lasx_xvsrli_b(nexb, 2); - nexg = __lasx_xvsrli_b(nexg, 2); - nexr = __lasx_xvsrli_b(nexr, 2); - nexb = __lasx_xvor_v(reg0, nexb); - nexg = __lasx_xvor_v(reg1, nexg); - nexr = __lasx_xvor_v(reg2, 
nexr); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); - reg0 = __lasx_xvpermi_d(reg0, 0xD8); - reg1 = __lasx_xvpermi_d(reg1, 0xD8); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_v, 0, 1); - __lasx_xvstelm_d(dst0, dst_u, 8, 2); - __lasx_xvstelm_d(dst0, dst_v, 8, 3); - src_argb1555 += 64; - next_argb1555 += 64; - dst_u += 16; - dst_v += 16; - } -} - -void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1; - __m256i tmp0, tmp1, tmpb, tmpg, tmpr; - __m256i reg0, reg1, dst0; - __m256i const_66 = __lasx_xvldi(66); - __m256i const_129 = __lasx_xvldi(129); - __m256i const_25 = __lasx_xvldi(25); - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - src0 = __lasx_xvld(src_rgb565, 0); - src1 = __lasx_xvld(src_rgb565, 32); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmpb = __lasx_xvandi_b(tmp0, 0x1F); - tmpr = __lasx_xvandi_b(tmp1, 0xF8); - reg1 = __lasx_xvandi_b(tmp1, 0x07); - reg0 = __lasx_xvsrli_b(tmp0, 5); - reg1 = __lasx_xvslli_b(reg1, 3); - tmpg = __lasx_xvor_v(reg1, reg0); - reg0 = __lasx_xvslli_b(tmpb, 3); - reg1 = __lasx_xvsrli_b(tmpb, 2); - tmpb = __lasx_xvor_v(reg1, reg0); - reg0 = __lasx_xvslli_b(tmpg, 2); - reg1 = __lasx_xvsrli_b(tmpg, 4); - tmpg = __lasx_xvor_v(reg1, reg0); - reg0 = __lasx_xvsrli_b(tmpr, 5); - tmpr = __lasx_xvor_v(tmpr, reg0); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25); - reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25); - reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129); - reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129); - reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66); - reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66); - dst0 = __lasx_xvpackod_b(reg1, reg0); - dst0 = __lasx_xvpermi_d(dst0, 0xD8); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_rgb565 += 64; - } -} - -void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 32; - const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; - __m256i src0, src1, src2, src3; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m256i reg0, reg1, reg2, reg3, dst0; - __m256i const_112 = __lasx_xvldi(0x438); - __m256i const_74 = __lasx_xvldi(0x425); - __m256i const_38 = __lasx_xvldi(0x413); - __m256i const_94 = __lasx_xvldi(0x42F); - __m256i const_18 = __lasx_xvldi(0x409); - __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0, - next_rgb565, 32, src0, src1, src2, src3); - DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2); - DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3); - tmpb = __lasx_xvandi_b(tmp0, 0x1F); - tmpr = __lasx_xvandi_b(tmp1, 0xF8); - nexb = __lasx_xvandi_b(tmp2, 0x1F); - nexr = __lasx_xvandi_b(tmp3, 0xF8); - reg1 = __lasx_xvandi_b(tmp1, 0x07); - reg3 = __lasx_xvandi_b(tmp3, 0x07); - reg0 = __lasx_xvsrli_b(tmp0, 5); - reg1 = __lasx_xvslli_b(reg1, 3); - reg2 = __lasx_xvsrli_b(tmp2, 5); - reg3 = __lasx_xvslli_b(reg3, 3); - tmpg = __lasx_xvor_v(reg1, reg0); - nexg = __lasx_xvor_v(reg2, reg3); - reg0 = __lasx_xvslli_b(tmpb, 3); - reg1 = __lasx_xvsrli_b(tmpb, 2); - reg2 = 
__lasx_xvslli_b(nexb, 3); - reg3 = __lasx_xvsrli_b(nexb, 2); - tmpb = __lasx_xvor_v(reg1, reg0); - nexb = __lasx_xvor_v(reg2, reg3); - reg0 = __lasx_xvslli_b(tmpg, 2); - reg1 = __lasx_xvsrli_b(tmpg, 4); - reg2 = __lasx_xvslli_b(nexg, 2); - reg3 = __lasx_xvsrli_b(nexg, 4); - tmpg = __lasx_xvor_v(reg1, reg0); - nexg = __lasx_xvor_v(reg2, reg3); - reg0 = __lasx_xvsrli_b(tmpr, 5); - reg2 = __lasx_xvsrli_b(nexr, 5); - tmpr = __lasx_xvor_v(tmpr, reg0); - nexr = __lasx_xvor_v(nexr, reg2); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); - reg0 = __lasx_xvpermi_d(reg0, 0xD8); - reg1 = __lasx_xvpermi_d(reg1, 0xD8); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_v, 0, 1); - __lasx_xvstelm_d(dst0, dst_u, 8, 2); - __lasx_xvstelm_d(dst0, dst_v, 8, 3); - dst_u += 16; - dst_v += 16; - src_rgb565 += 64; - next_rgb565 += 64; - } -} - -void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24; - int len = width / 32; - __m256i src0, src1, src2, reg0, reg1, reg2; - __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2; - __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m256i const_112 = __lasx_xvldi(0x438); - __m256i const_74 = __lasx_xvldi(0x425); - __m256i const_38 = __lasx_xvldi(0x413); - __m256i const_94 = __lasx_xvldi(0x42F); - __m256i const_18 = __lasx_xvldi(0x409); - __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; - __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18, - 0x15120F0C09060300, 0x00000000001E1B18}; - __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908, - 0x0706050403020100, 0x1D1A1714110A0908}; - __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19, - 0x1613100D0A070401, 0x00000000001F1C19}; - __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908, - 0x0706050403020100, 0x1E1B1815120A0908}; - __m256i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A, - 0x1714110E0B080502, 0x0000000000001D1A}; - __m256i shuff1_r = {0x0706050403020100, 0x1F1C191613100908, - 0x0706050403020100, 0x1F1C191613100908}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64, - next_rgb24, 0, reg0, reg1, reg2, tmp0); - DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2); - DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1, - 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); - DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, - nexb); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, - nexg); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, - nexr); - DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, - nexb); - DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, - nexg); - DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, - nexr); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_v, 0, 1); - __lasx_xvstelm_d(dst0, dst_u, 8, 2); - __lasx_xvstelm_d(dst0, dst_v, 8, 3); - src_rgb24 += 96; - next_rgb24 += 96; - dst_u += 16; - dst_v += 16; - } -} - -void RAWToUVRow_LASX(const uint8_t* src_raw, - 
int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* next_raw = src_raw + src_stride_raw; - int len = width / 32; - __m256i src0, src1, src2, reg0, reg1, reg2; - __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2; - __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m256i const_112 = __lasx_xvldi(0x438); - __m256i const_74 = __lasx_xvldi(0x425); - __m256i const_38 = __lasx_xvldi(0x413); - __m256i const_94 = __lasx_xvldi(0x42F); - __m256i const_18 = __lasx_xvldi(0x409); - __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; - __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18, - 0x15120F0C09060300, 0x00000000001E1B18}; - __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908, - 0x0706050403020100, 0x1D1A1714110A0908}; - __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19, - 0x1613100D0A070401, 0x00000000001F1C19}; - __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908, - 0x0706050403020100, 0x1E1B1815120A0908}; - __m256i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A, - 0x1714110E0B080502, 0x0000000000001D1A}; - __m256i shuff1_b = {0x0706050403020100, 0x1F1C191613100908, - 0x0706050403020100, 0x1F1C191613100908}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0, - reg0, reg1, reg2, tmp0); - DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2); - DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1, - 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); - DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, - nexb); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, - nexg); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, - nexr); - DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, - nexb); - DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, - nexg); - DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, - nexr); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_v, 0, 1); - __lasx_xvstelm_d(dst0, dst_u, 8, 2); - __lasx_xvstelm_d(dst0, dst_v, 8, 3); - src_raw += 96; - next_raw += 96; - dst_u += 16; - dst_v += 16; - } -} - -void NV12ToARGBRow_LASX(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; - __m256i vec_vrub, vec_vgug, vec_y, vec_vu; - __m256i out_b, out_g, out_r; - __m256i const_0x80 = __lasx_xvldi(0x80); - __m256i alpha = __lasx_xvldi(0xFF); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub); - vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug); - - for (x = 0; x < len; x++) { - vec_y = __lasx_xvld(src_y, 0); - vec_vu = __lasx_xvld(src_uv, 0); - vec_vu = __lasx_xvsub_b(vec_vu, const_0x80); - vec_vu = __lasx_vext2xv_h_b(vec_vu); - YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g, - out_b); - STOREARGB(alpha, out_r, out_g, out_b, dst_argb); - src_y += 16; - src_uv += 16; - } -} - -void NV12ToRGB565Row_LASX(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* 
dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; - __m256i vec_vrub, vec_vgug, vec_y, vec_vu; - __m256i out_b, out_g, out_r; - __m256i const_0x80 = __lasx_xvldi(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub); - vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug); - - for (x = 0; x < len; x++) { - vec_y = __lasx_xvld(src_y, 0); - vec_vu = __lasx_xvld(src_uv, 0); - vec_vu = __lasx_xvsub_b(vec_vu, const_0x80); - vec_vu = __lasx_vext2xv_h_b(vec_vu); - YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g, - out_b); - out_b = __lasx_xvsrli_h(out_b, 3); - out_g = __lasx_xvsrli_h(out_g, 2); - out_r = __lasx_xvsrli_h(out_r, 3); - out_g = __lasx_xvslli_h(out_g, 5); - out_r = __lasx_xvslli_h(out_r, 11); - out_r = __lasx_xvor_v(out_r, out_g); - out_r = __lasx_xvor_v(out_r, out_b); - __lasx_xvst(out_r, dst_rgb565, 0); - src_y += 16; - src_uv += 16; - dst_rgb565 += 32; - } -} - -void NV21ToARGBRow_LASX(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; - __m256i vec_ubvr, vec_ugvg, vec_y, vec_uv; - __m256i out_b, out_g, out_r; - __m256i const_0x80 = __lasx_xvldi(0x80); - __m256i alpha = __lasx_xvldi(0xFF); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); - vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - vec_y = __lasx_xvld(src_y, 0); - vec_uv = __lasx_xvld(src_uv, 0); - vec_uv = __lasx_xvsub_b(vec_uv, const_0x80); - vec_uv = __lasx_vext2xv_h_b(vec_uv); - YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g, - out_r); - STOREARGB(alpha, out_r, out_g, out_b, dst_argb); - src_y += 16; - src_uv += 16; - } -} - -struct RgbConstants { - uint8_t kRGBToY[4]; - uint16_t kAddY; - uint16_t pad; -}; - -// RGB to JPeg coefficients -// B * 0.1140 coefficient = 29 -// G * 0.5870 coefficient = 150 -// R * 0.2990 coefficient = 77 -// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; - -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; - -// RGB to BT.601 coefficients -// B * 0.1016 coefficient = 25 -// G * 0.5078 coefficient = 129 -// R * 0.2578 coefficient = 66 -// Add 16.5 = 0x1080 - -static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; - -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; - -// ARGB expects first 3 values to contain RGB and 4th value is ignored. 
-static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile ( - "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants - "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants - "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants - "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants - "xvld $xr20, %4, 0 \n\t" // load shuff - "1: \n\t" - "xvld $xr4, %0, 0 \n\t" - "xvld $xr5, %0, 32 \n\t" - "xvld $xr6, %0, 64 \n\t" - "xvld $xr7, %0, 96 \n\t" // load 32 pixels of - // ARGB - "xvor.v $xr12, $xr3, $xr3 \n\t" - "xvor.v $xr13, $xr3, $xr3 \n\t" - "addi.d %2, %2, -32 \n\t" // 32 processed per - // loop. - "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR - "xvpickev.b $xr10, $xr7, $xr6 \n\t" - "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA - "xvpickod.b $xr11, $xr7, $xr6 \n\t" - "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B - "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t" - "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G - "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t" - "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R - "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t" - "addi.d %0, %0, 128 \n\t" - "xvpickod.b $xr10, $xr13, $xr12 \n\t" - "xvperm.w $xr11, $xr10, $xr20 \n\t" - "xvst $xr11, %1, 0 \n\t" - "addi.d %1, %1, 32 \n\t" - "bnez %2, 1b \n\t" - : "+&r"(src_argb), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(rgbconstants), "r"(shuff) - : "memory"); -} - -void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_LASX(src_argb, dst_y, width, &kRgb24I601Constants); -} - -void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_LASX(src_argb, dst_yj, width, &kRgb24JPEGConstants); -} - -void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_LASX(src_abgr, dst_y, width, &kRawI601Constants); -} - -void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_LASX(src_abgr, dst_yj, width, &kRawJPEGConstants); -} - -// RGBA expects first value to be A and ignored, then 3 values to contain RGB. -// Same code as ARGB, except the LD4 -static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile ( - "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants - "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants - "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants - "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants - "xvld $xr20, %4, 0 \n\t" // load shuff - "1: \n\t" - "xvld $xr4, %0, 0 \n\t" - "xvld $xr5, %0, 32 \n\t" - "xvld $xr6, %0, 64 \n\t" - "xvld $xr7, %0, 96 \n\t" // load 32 pixels of - // RGBA - "xvor.v $xr12, $xr3, $xr3 \n\t" - "xvor.v $xr13, $xr3, $xr3 \n\t" - "addi.d %2, %2, -32 \n\t" // 32 processed per - // loop. 
- "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG - "xvpickev.b $xr10, $xr7, $xr6 \n\t" - "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR - "xvpickod.b $xr11, $xr7, $xr6 \n\t" - "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B - "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t" - "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G - "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t" - "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R - "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t" - "addi.d %0, %0, 128 \n\t" - "xvpickod.b $xr10, $xr13, $xr12 \n\t" - "xvperm.w $xr11, $xr10, $xr20 \n\t" - "xvst $xr11, %1, 0 \n\t" - "addi.d %1, %1, 32 \n\t" - "bnez %2, 1b \n\t" - : "+&r"(src_rgba), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(rgbconstants), "r"(shuff) - : "memory"); -} - -void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_LASX(src_rgba, dst_y, width, &kRgb24I601Constants); -} - -void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { - RGBAToYMatrixRow_LASX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); -} - -void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_LASX(src_bgra, dst_y, width, &kRawI601Constants); -} - -static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - int8_t shuff[128] = { - 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, - 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, - 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, - 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, - 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, - 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, - 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, - 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile ( - "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants - "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants - "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants - "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants - "xvld $xr4, %4, 0 \n\t" // load shuff - "xvld $xr5, %4, 32 \n\t" - "xvld $xr6, %4, 64 \n\t" - "xvld $xr7, %4, 96 \n\t" - "1: \n\t" - "xvld $xr8, %0, 0 \n\t" - "xvld $xr9, %0, 32 \n\t" - "xvld $xr10, %0, 64 \n\t" // load 32 pixels of - // RGB - "xvor.v $xr12, $xr3, $xr3 \n\t" - "xvor.v $xr13, $xr3, $xr3 \n\t" - "xvor.v $xr11, $xr9, $xr9 \n\t" - "addi.d %2, %2, -32 \n\t" // 32 processed per - // loop. 
- "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0 - "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1 - "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2 - "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t" - "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t" - "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t" - "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t" - "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G - "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t" - "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B - "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t" - "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R - "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t" - "addi.d %0, %0, 96 \n\t" - "xvpickod.b $xr10, $xr13, $xr12 \n\t" - "xvst $xr10, %1, 0 \n\t" - "addi.d %1, %1, 32 \n\t" - "bnez %2, 1b \n\t" - : "+&r"(src_rgba), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(rgbconstants), // %3 - "r"(shuff) // %4 - : "memory"); -} - -void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); -} - -void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants); -} - -void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants); -} - -void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { - RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants); -} - -void ARGBToUVJRow_LASX(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* next_argb = src_argb + src_stride_argb; - int len = width / 32; - __m256i src0, src1, src2, src3; - __m256i nex0, nex1, nex2, nex3; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, dst0; - __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m256i const_63 = __lasx_xvldi(0x43F); - __m256i const_42 = __lasx_xvldi(0x42A); - __m256i const_21 = __lasx_xvldi(0x415); - __m256i const_53 = __lasx_xvldi(0x435); - __m256i const_10 = __lasx_xvldi(0x40A); - __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080, - 0x8080808080808080, 0x8080808080808080}; - __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301, - 0x1F1D0F0D1B190B09}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, - 96, src0, src1, src2, src3); - DUP4_ARG2(__lasx_xvld, next_argb, 0, next_argb, 32, next_argb, 64, - next_argb, 96, nex0, nex1, nex2, nex3); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmp2 = __lasx_xvpickev_b(src3, src2); - tmp3 = __lasx_xvpickod_b(src3, src2); - tmpr = __lasx_xvpickod_b(tmp2, tmp0); - tmpb = __lasx_xvpickev_b(tmp2, tmp0); - tmpg = __lasx_xvpickev_b(tmp3, tmp1); - tmp0 = __lasx_xvpickev_b(nex1, nex0); - tmp1 = __lasx_xvpickod_b(nex1, nex0); - tmp2 = __lasx_xvpickev_b(nex3, nex2); - tmp3 = __lasx_xvpickod_b(nex3, nex2); - nexr = __lasx_xvpickod_b(tmp2, tmp0); - nexb = __lasx_xvpickev_b(tmp2, tmp0); - nexg = __lasx_xvpickev_b(tmp3, tmp1); - tmp0 = __lasx_xvaddwev_h_bu(tmpb, nexb); - tmp1 = __lasx_xvaddwod_h_bu(tmpb, nexb); - tmp2 = __lasx_xvaddwev_h_bu(tmpg, nexg); - tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg); - reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr); - reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr); - tmpb = __lasx_xvavgr_hu(tmp0, tmp1); - tmpg = __lasx_xvavgr_hu(tmp2, tmp3); - tmpr = __lasx_xvavgr_hu(reg0, reg1); - reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb); - reg1 = 
__lasx_xvmadd_h(const_8080, const_63, tmpr); - reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg); - reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg); - reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr); - reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb); - dst0 = __lasx_xvpackod_b(reg1, reg0); - tmp0 = __lasx_xvpermi_d(dst0, 0x44); - tmp1 = __lasx_xvpermi_d(dst0, 0xEE); - dst0 = __lasx_xvshuf_b(tmp1, tmp0, shuff); - __lasx_xvstelm_d(dst0, dst_u, 0, 0); - __lasx_xvstelm_d(dst0, dst_v, 0, 2); - __lasx_xvstelm_d(dst0, dst_u, 8, 1); - __lasx_xvstelm_d(dst0, dst_v, 8, 3); - dst_u += 16; - dst_v += 16; - src_argb += 128; - next_argb += 128; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) diff --git a/drivers/media/pci/tbscapture2/row_lsx.c b/drivers/media/pci/tbscapture2/row_lsx.c deleted file mode 100644 index 3753b150a4b3..000000000000 --- a/drivers/media/pci/tbscapture2/row_lsx.c +++ /dev/null @@ -1,2987 +0,0 @@ -/* - * Copyright 2022 The LibYuv Project Authors. All rights reserved. - * - * Copyright (c) 2022 Loongson Technology Corporation Limited - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) -#include "loongson_intrinsics.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Fill YUV -> RGB conversion constants into vectors -#define YUVTORGB_SETUP(yuvconst, vr, ub, vg, ug, yg, yb) \ - { \ - ub = __lsx_vreplgr2vr_h(yuvconst->kUVToB[0]); \ - vr = __lsx_vreplgr2vr_h(yuvconst->kUVToR[1]); \ - ug = __lsx_vreplgr2vr_h(yuvconst->kUVToG[0]); \ - vg = __lsx_vreplgr2vr_h(yuvconst->kUVToG[1]); \ - yg = __lsx_vreplgr2vr_h(yuvconst->kYToRgb[0]); \ - yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ - } - -// Load 32 YUV422 pixel data -#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \ - { \ - __m128i temp0, temp1; \ - \ - DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \ - temp1 = __lsx_vld(psrc_v, 0); \ - temp0 = __lsx_vsub_b(temp0, const_80); \ - temp1 = __lsx_vsub_b(temp1, const_80); \ - temp0 = __lsx_vsllwil_h_b(temp0, 0); \ - temp1 = __lsx_vsllwil_h_b(temp1, 0); \ - uv_l = __lsx_vilvl_h(temp0, temp1); \ - uv_h = __lsx_vilvh_h(temp0, temp1); \ - } - -// Load 16 YUV422 pixel data -#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \ - { \ - __m128i temp0, temp1; \ - \ - out_y = __lsx_vld(psrc_y, 0); \ - temp0 = __lsx_vldrepl_d(psrc_u, 0); \ - temp1 = __lsx_vldrepl_d(psrc_v, 0); \ - uv = __lsx_vilvl_b(temp0, temp1); \ - uv = __lsx_vsub_b(uv, const_80); \ - uv = __lsx_vsllwil_h_b(uv, 0); \ - } - -// Convert 16 pixels of YUV420 to RGB. 
-#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \ - g_h, r_l, r_h) \ - { \ - __m128i u_l, u_h, v_l, v_h; \ - __m128i yl_ev, yl_od, yh_ev, yh_od; \ - __m128i temp0, temp1, temp2, temp3; \ - \ - temp0 = __lsx_vilvl_b(in_y, in_y); \ - temp1 = __lsx_vilvh_b(in_y, in_y); \ - yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \ - yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \ - yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \ - yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \ - DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \ - yl_ev, yl_od, yh_ev, yh_od); \ - yl_ev = __lsx_vadd_w(yl_ev, yb); \ - yl_od = __lsx_vadd_w(yl_od, yb); \ - yh_ev = __lsx_vadd_w(yh_ev, yb); \ - yh_od = __lsx_vadd_w(yh_od, yb); \ - v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \ - u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \ - v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \ - u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \ - temp0 = __lsx_vadd_w(yl_ev, u_l); \ - temp1 = __lsx_vadd_w(yl_od, u_l); \ - temp2 = __lsx_vadd_w(yh_ev, u_h); \ - temp3 = __lsx_vadd_w(yh_od, u_h); \ - DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ - temp1, temp2, temp3); \ - DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ - temp2, temp3); \ - b_l = __lsx_vpackev_h(temp1, temp0); \ - b_h = __lsx_vpackev_h(temp3, temp2); \ - temp0 = __lsx_vadd_w(yl_ev, v_l); \ - temp1 = __lsx_vadd_w(yl_od, v_l); \ - temp2 = __lsx_vadd_w(yh_ev, v_h); \ - temp3 = __lsx_vadd_w(yh_od, v_h); \ - DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ - temp1, temp2, temp3); \ - DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ - temp2, temp3); \ - r_l = __lsx_vpackev_h(temp1, temp0); \ - r_h = __lsx_vpackev_h(temp3, temp2); \ - DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \ - temp0 = __lsx_vsub_w(yl_ev, u_l); \ - temp1 = __lsx_vsub_w(yl_od, u_l); \ - temp2 = __lsx_vsub_w(yh_ev, u_h); \ - temp3 = __lsx_vsub_w(yh_od, u_h); \ - DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ - temp1, temp2, temp3); \ - DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ - temp2, temp3); \ - g_l = __lsx_vpackev_h(temp1, temp0); \ - g_h = __lsx_vpackev_h(temp3, temp2); \ - } - -// Convert 8 pixels of YUV420 to RGB. 
-#define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \ - { \ - __m128i y_ev, y_od, u_l, v_l; \ - __m128i tmp0, tmp1, tmp2, tmp3; \ - \ - tmp0 = __lsx_vilvl_b(in_y, in_y); \ - y_ev = __lsx_vmulwev_w_hu_h(tmp0, yg); \ - y_od = __lsx_vmulwod_w_hu_h(tmp0, yg); \ - y_ev = __lsx_vsrai_w(y_ev, 16); \ - y_od = __lsx_vsrai_w(y_od, 16); \ - y_ev = __lsx_vadd_w(y_ev, yb); \ - y_od = __lsx_vadd_w(y_od, yb); \ - in_vu = __lsx_vilvl_b(zero, in_vu); \ - in_vu = __lsx_vsub_h(in_vu, const_80); \ - u_l = __lsx_vmulwev_w_h(in_vu, vrub); \ - v_l = __lsx_vmulwod_w_h(in_vu, vrub); \ - tmp0 = __lsx_vadd_w(y_ev, u_l); \ - tmp1 = __lsx_vadd_w(y_od, u_l); \ - tmp2 = __lsx_vadd_w(y_ev, v_l); \ - tmp3 = __lsx_vadd_w(y_od, v_l); \ - tmp0 = __lsx_vsrai_w(tmp0, 6); \ - tmp1 = __lsx_vsrai_w(tmp1, 6); \ - tmp2 = __lsx_vsrai_w(tmp2, 6); \ - tmp3 = __lsx_vsrai_w(tmp3, 6); \ - tmp0 = __lsx_vclip255_w(tmp0); \ - tmp1 = __lsx_vclip255_w(tmp1); \ - tmp2 = __lsx_vclip255_w(tmp2); \ - tmp3 = __lsx_vclip255_w(tmp3); \ - out_b = __lsx_vpackev_h(tmp1, tmp0); \ - out_r = __lsx_vpackev_h(tmp3, tmp2); \ - tmp0 = __lsx_vdp2_w_h(in_vu, vgug); \ - tmp1 = __lsx_vsub_w(y_ev, tmp0); \ - tmp2 = __lsx_vsub_w(y_od, tmp0); \ - tmp1 = __lsx_vsrai_w(tmp1, 6); \ - tmp2 = __lsx_vsrai_w(tmp2, 6); \ - tmp1 = __lsx_vclip255_w(tmp1); \ - tmp2 = __lsx_vclip255_w(tmp2); \ - out_g = __lsx_vpackev_h(tmp2, tmp1); \ - } - -// Convert I444 pixels of YUV420 to RGB. -#define I444TORGB(in_yy, in_u, in_v, ub, vr, ugvg, yg, yb, out_b, out_g, \ - out_r) \ - { \ - __m128i y_ev, y_od, u_ev, v_ev, u_od, v_od; \ - __m128i tmp0, tmp1, tmp2, tmp3; \ - \ - y_ev = __lsx_vmulwev_w_hu_h(in_yy, yg); \ - y_od = __lsx_vmulwod_w_hu_h(in_yy, yg); \ - y_ev = __lsx_vsrai_w(y_ev, 16); \ - y_od = __lsx_vsrai_w(y_od, 16); \ - y_ev = __lsx_vadd_w(y_ev, yb); \ - y_od = __lsx_vadd_w(y_od, yb); \ - in_u = __lsx_vsub_h(in_u, const_80); \ - in_v = __lsx_vsub_h(in_v, const_80); \ - u_ev = __lsx_vmulwev_w_h(in_u, ub); \ - u_od = __lsx_vmulwod_w_h(in_u, ub); \ - v_ev = __lsx_vmulwev_w_h(in_v, vr); \ - v_od = __lsx_vmulwod_w_h(in_v, vr); \ - tmp0 = __lsx_vadd_w(y_ev, u_ev); \ - tmp1 = __lsx_vadd_w(y_od, u_od); \ - tmp2 = __lsx_vadd_w(y_ev, v_ev); \ - tmp3 = __lsx_vadd_w(y_od, v_od); \ - tmp0 = __lsx_vsrai_w(tmp0, 6); \ - tmp1 = __lsx_vsrai_w(tmp1, 6); \ - tmp2 = __lsx_vsrai_w(tmp2, 6); \ - tmp3 = __lsx_vsrai_w(tmp3, 6); \ - tmp0 = __lsx_vclip255_w(tmp0); \ - tmp1 = __lsx_vclip255_w(tmp1); \ - tmp2 = __lsx_vclip255_w(tmp2); \ - tmp3 = __lsx_vclip255_w(tmp3); \ - out_b = __lsx_vpackev_h(tmp1, tmp0); \ - out_r = __lsx_vpackev_h(tmp3, tmp2); \ - u_ev = __lsx_vpackev_h(in_u, in_v); \ - u_od = __lsx_vpackod_h(in_u, in_v); \ - v_ev = __lsx_vdp2_w_h(u_ev, ugvg); \ - v_od = __lsx_vdp2_w_h(u_od, ugvg); \ - tmp0 = __lsx_vsub_w(y_ev, v_ev); \ - tmp1 = __lsx_vsub_w(y_od, v_od); \ - tmp0 = __lsx_vsrai_w(tmp0, 6); \ - tmp1 = __lsx_vsrai_w(tmp1, 6); \ - tmp0 = __lsx_vclip255_w(tmp0); \ - tmp1 = __lsx_vclip255_w(tmp1); \ - out_g = __lsx_vpackev_h(tmp1, tmp0); \ - } - -// Pack and Store 16 ARGB values. 
-#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \ - { \ - __m128i temp0, temp1, temp2, temp3; \ - temp0 = __lsx_vpackev_b(g_l, b_l); \ - temp1 = __lsx_vpackev_b(a_l, r_l); \ - temp2 = __lsx_vpackev_b(g_h, b_h); \ - temp3 = __lsx_vpackev_b(a_h, r_h); \ - r_l = __lsx_vilvl_h(temp1, temp0); \ - r_h = __lsx_vilvh_h(temp1, temp0); \ - g_l = __lsx_vilvl_h(temp3, temp2); \ - g_h = __lsx_vilvh_h(temp3, temp2); \ - __lsx_vst(r_l, pdst_argb, 0); \ - __lsx_vst(r_h, pdst_argb, 16); \ - __lsx_vst(g_l, pdst_argb, 32); \ - __lsx_vst(g_h, pdst_argb, 48); \ - pdst_argb += 64; \ - } - -// Pack and Store 8 ARGB values. -#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ - { \ - __m128i temp0, temp1; \ - __m128i dst0, dst1; \ - \ - temp0 = __lsx_vpackev_b(in_g, in_b); \ - temp1 = __lsx_vpackev_b(in_a, in_r); \ - dst0 = __lsx_vilvl_h(temp1, temp0); \ - dst1 = __lsx_vilvh_h(temp1, temp0); \ - __lsx_vst(dst0, pdst_argb, 0); \ - __lsx_vst(dst1, pdst_argb, 16); \ - pdst_argb += 32; \ - } - -#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ - { \ - __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ - __m128i _reg0, _reg1; \ - _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \ - _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \ - _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \ - _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \ - _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \ - _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \ - _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \ - _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \ - _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \ - _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \ - _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \ - _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \ - _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \ - _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \ - _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \ - _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ - } - -void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { - int x; - int len = width / 32; - __m128i src0, src1; - __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607}; - src += width - 32; - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); - DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, - src1); - __lsx_vst(src1, dst, 0); - __lsx_vst(src0, dst, 16); - dst += 32; - src -= 32; - } -} - -void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - int x; - int len = width / 8; - __m128i src, dst; - __m128i shuffler = {0x0004000500060007, 0x0000000100020003}; - - src_uv += (width - 8) << 1; - for (x = 0; x < len; x++) { - src = __lsx_vld(src_uv, 0); - dst = __lsx_vshuf_h(shuffler, src, src); - __lsx_vst(dst, dst_uv, 0); - src_uv -= 16; - dst_uv += 16; - } -} - -void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { - int x; - int len = width / 8; - __m128i src0, src1; - __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504}; - - src += (width * 4) - 32; - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); - DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, - src1); - __lsx_vst(src1, dst, 0); - __lsx_vst(src0, dst, 16); - dst += 32; - src -= 32; - } -} - -void I422ToYUY2Row_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - int x; - int len = width / 16; - __m128i src_u0, src_v0, src_y0, vec_uv0; - __m128i vec_yuy2_0, vec_yuy2_1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_u, 
0, src_v, 0, src_u0, src_v0); - src_y0 = __lsx_vld(src_y, 0); - vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); - vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0); - vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0); - __lsx_vst(vec_yuy2_0, dst_yuy2, 0); - __lsx_vst(vec_yuy2_1, dst_yuy2, 16); - src_u += 8; - src_v += 8; - src_y += 16; - dst_yuy2 += 32; - } -} - -void I422ToUYVYRow_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - int x; - int len = width / 16; - __m128i src_u0, src_v0, src_y0, vec_uv0; - __m128i vec_uyvy0, vec_uyvy1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); - src_y0 = __lsx_vld(src_y, 0); - vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); - vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0); - vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0); - __lsx_vst(vec_uyvy0, dst_uyvy, 0); - __lsx_vst(vec_uyvy1, dst_uyvy, 16); - src_u += 8; - src_v += 8; - src_y += 16; - dst_uyvy += 32; - } -} - -void I422ToARGBRow_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; - __m128i vec_ubvr, vec_ugvg; - __m128i alpha = __lsx_vldi(0xFF); - __m128i const_80 = __lsx_vldi(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); - vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); - src_y += 16; - src_u += 8; - src_v += 8; - } -} - -void I422ToRGBARow_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m128i vec_ubvr, vec_ugvg; - __m128i alpha = __lsx_vldi(0xFF); - __m128i const_80 = __lsx_vldi(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); - vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb); - src_y += 16; - src_u += 8; - src_v += 8; - } -} - -void I422AlphaToARGBRow_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - int res = width & 15; - __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m128i vec_ubvr, vec_ugvg; - __m128i zero = __lsx_vldi(0); - __m128i const_80 = __lsx_vldi(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); - vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; - - y = __lsx_vld(src_a, 0); - a_l = __lsx_vilvl_b(zero, y); - a_h = 
__lsx_vilvh_b(zero, y); - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); - src_y += 16; - src_u += 8; - src_v += 8; - src_a += 16; - } - if (res) { - __m128i y, uv, r, g, b, a; - a = __lsx_vld(src_a, 0); - a = __lsx_vsllwil_hu_bu(a, 0); - READYUV422(src_y, src_u, src_v, y, uv); - YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r); - STOREARGB(a, r, g, b, dst_argb); - } -} - -void I422ToRGB24Row_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int32_t width) { - int x; - int len = width / 16; - __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m128i vec_ubvr, vec_ugvg; - __m128i const_80 = __lsx_vldi(0x80); - __m128i shuffler0 = {0x0504120302100100, 0x0A18090816070614}; - __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); - vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - __m128i temp0, temp1, temp2, temp3; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - temp0 = __lsx_vpackev_b(g_l, b_l); - temp1 = __lsx_vpackev_b(g_h, b_h); - DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, r_l, - temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, - temp1); - - b_l = __lsx_vilvl_d(temp1, temp2); - b_h = __lsx_vilvh_d(temp3, temp1); - __lsx_vst(temp0, dst_argb, 0); - __lsx_vst(b_l, dst_argb, 16); - __lsx_vst(b_h, dst_argb, 32); - dst_argb += 48; - src_y += 16; - src_u += 8; - src_v += 8; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. -void I422ToRGB565Row_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m128i vec_ubvr, vec_ugvg; - __m128i const_80 = __lsx_vldi(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); - vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - b_l = __lsx_vsrli_h(b_l, 3); - b_h = __lsx_vsrli_h(b_h, 3); - g_l = __lsx_vsrli_h(g_l, 2); - g_h = __lsx_vsrli_h(g_h, 2); - r_l = __lsx_vsrli_h(r_l, 3); - r_h = __lsx_vsrli_h(r_h, 3); - r_l = __lsx_vslli_h(r_l, 11); - r_h = __lsx_vslli_h(r_h, 11); - g_l = __lsx_vslli_h(g_l, 5); - g_h = __lsx_vslli_h(g_h, 5); - r_l = __lsx_vor_v(r_l, g_l); - r_l = __lsx_vor_v(r_l, b_l); - r_h = __lsx_vor_v(r_h, g_h); - r_h = __lsx_vor_v(r_h, b_h); - __lsx_vst(r_l, dst_rgb565, 0); - __lsx_vst(r_h, dst_rgb565, 16); - dst_rgb565 += 32; - src_y += 16; - src_u += 8; - src_v += 8; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
-void I422ToARGB4444Row_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m128i vec_ubvr, vec_ugvg; - __m128i const_80 = __lsx_vldi(0x80); - __m128i alpha = (__m128i)v2u64{0xF000F000F000F000, 0xF000F000F000F000}; - __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0}; - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); - vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - b_l = __lsx_vsrli_h(b_l, 4); - b_h = __lsx_vsrli_h(b_h, 4); - r_l = __lsx_vsrli_h(r_l, 4); - r_h = __lsx_vsrli_h(r_h, 4); - g_l = __lsx_vand_v(g_l, mask); - g_h = __lsx_vand_v(g_h, mask); - r_l = __lsx_vslli_h(r_l, 8); - r_h = __lsx_vslli_h(r_h, 8); - r_l = __lsx_vor_v(r_l, alpha); - r_h = __lsx_vor_v(r_h, alpha); - r_l = __lsx_vor_v(r_l, g_l); - r_h = __lsx_vor_v(r_h, g_h); - r_l = __lsx_vor_v(r_l, b_l); - r_h = __lsx_vor_v(r_h, b_h); - __lsx_vst(r_l, dst_argb4444, 0); - __lsx_vst(r_h, dst_argb4444, 16); - dst_argb4444 += 32; - src_y += 16; - src_u += 8; - src_v += 8; - } -} - -void I422ToARGB1555Row_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; - __m128i vec_ubvr, vec_ugvg; - __m128i const_80 = __lsx_vldi(0x80); - __m128i alpha = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000}; - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); - vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; - - READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); - YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, - g_h, r_l, r_h); - b_l = __lsx_vsrli_h(b_l, 3); - b_h = __lsx_vsrli_h(b_h, 3); - g_l = __lsx_vsrli_h(g_l, 3); - - g_h = __lsx_vsrli_h(g_h, 3); - g_l = __lsx_vslli_h(g_l, 5); - g_h = __lsx_vslli_h(g_h, 5); - r_l = __lsx_vsrli_h(r_l, 3); - r_h = __lsx_vsrli_h(r_h, 3); - r_l = __lsx_vslli_h(r_l, 10); - r_h = __lsx_vslli_h(r_h, 10); - r_l = __lsx_vor_v(r_l, alpha); - r_h = __lsx_vor_v(r_h, alpha); - r_l = __lsx_vor_v(r_l, g_l); - r_h = __lsx_vor_v(r_h, g_h); - r_l = __lsx_vor_v(r_l, b_l); - r_h = __lsx_vor_v(r_h, b_h); - __lsx_vst(r_l, dst_argb1555, 0); - __lsx_vst(r_h, dst_argb1555, 16); - dst_argb1555 += 32; - src_y += 16; - src_u += 8; - src_v += 8; - } -} - -void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); - dst0 = __lsx_vpickev_b(src1, src0); - __lsx_vst(dst0, dst_y, 0); - src_yuy2 += 32; - dst_y += 16; - } -} - -void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; - int x; - int len = width / 16; - __m128i src0, src1, src2, src3; - __m128i tmp0, dst0, 
dst1; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0, - src_yuy2_next, 16, src0, src1, src2, src3); - src0 = __lsx_vpickod_b(src1, src0); - src1 = __lsx_vpickod_b(src3, src2); - tmp0 = __lsx_vavgr_bu(src1, src0); - dst0 = __lsx_vpickev_b(tmp0, tmp0); - dst1 = __lsx_vpickod_b(tmp0, tmp0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst1, dst_v, 0, 0); - src_yuy2 += 32; - src_yuy2_next += 32; - dst_u += 8; - dst_v += 8; - } -} - -void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 16; - __m128i src0, src1, tmp0, dst0, dst1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); - tmp0 = __lsx_vpickod_b(src1, src0); - dst0 = __lsx_vpickev_b(tmp0, tmp0); - dst1 = __lsx_vpickod_b(tmp0, tmp0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst1, dst_v, 0, 0); - src_yuy2 += 32; - dst_u += 8; - dst_v += 8; - } -} - -void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); - dst0 = __lsx_vpickod_b(src1, src0); - __lsx_vst(dst0, dst_y, 0); - src_uyvy += 32; - dst_y += 16; - } -} - -void UYVYToUVRow_LSX(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, tmp0, dst0, dst1; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0, - src_uyvy_next, 16, src0, src1, src2, src3); - src0 = __lsx_vpickev_b(src1, src0); - src1 = __lsx_vpickev_b(src3, src2); - tmp0 = __lsx_vavgr_bu(src1, src0); - dst0 = __lsx_vpickev_b(tmp0, tmp0); - dst1 = __lsx_vpickod_b(tmp0, tmp0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst1, dst_v, 0, 0); - src_uyvy += 32; - src_uyvy_next += 32; - dst_u += 8; - dst_v += 8; - } -} - -void UYVYToUV422Row_LSX(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 16; - __m128i src0, src1, tmp0, dst0, dst1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); - tmp0 = __lsx_vpickev_b(src1, src0); - dst0 = __lsx_vpickev_b(tmp0, tmp0); - dst1 = __lsx_vpickod_b(tmp0, tmp0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst1, dst_v, 0, 0); - src_uyvy += 32; - dst_u += 8; - dst_v += 8; - } -} - -void ARGBToUVRow_LSX(const uint8_t* src_argb0, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 16; - const uint8_t* src_argb1 = src_argb0 + src_stride_argb; - - __m128i src0, src1, src2, src3, src4, src5, src6, src7; - __m128i vec0, vec1, vec2, vec3; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1; - __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038}; - __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025}; - __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013}; - __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f}; - __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009}; - __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0, - 48, src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, src_argb1, 
0, src_argb1, 16, src_argb1, 32, src_argb1, - 48, src4, src5, src6, src7); - vec0 = __lsx_vaddwev_h_bu(src0, src4); - vec1 = __lsx_vaddwev_h_bu(src1, src5); - vec2 = __lsx_vaddwev_h_bu(src2, src6); - vec3 = __lsx_vaddwev_h_bu(src3, src7); - tmp0 = __lsx_vpickev_h(vec1, vec0); - tmp1 = __lsx_vpickev_h(vec3, vec2); - tmp2 = __lsx_vpickod_h(vec1, vec0); - tmp3 = __lsx_vpickod_h(vec3, vec2); - vec0 = __lsx_vaddwod_h_bu(src0, src4); - vec1 = __lsx_vaddwod_h_bu(src1, src5); - vec2 = __lsx_vaddwod_h_bu(src2, src6); - vec3 = __lsx_vaddwod_h_bu(src3, src7); - tmp4 = __lsx_vpickev_h(vec1, vec0); - tmp5 = __lsx_vpickev_h(vec3, vec2); - vec0 = __lsx_vpickev_h(tmp1, tmp0); - vec1 = __lsx_vpickod_h(tmp1, tmp0); - src0 = __lsx_vavgr_h(vec0, vec1); - vec0 = __lsx_vpickev_h(tmp3, tmp2); - vec1 = __lsx_vpickod_h(tmp3, tmp2); - src1 = __lsx_vavgr_h(vec0, vec1); - vec0 = __lsx_vpickev_h(tmp5, tmp4); - vec1 = __lsx_vpickod_h(tmp5, tmp4); - src2 = __lsx_vavgr_h(vec0, vec1); - dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70); - dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A); - dst0 = __lsx_vmsub_h(dst0, src1, const_0x26); - dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70); - dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E); - dst1 = __lsx_vmsub_h(dst1, src0, const_0x12); - dst0 = __lsx_vsrai_h(dst0, 8); - dst1 = __lsx_vsrai_h(dst1, 8); - dst0 = __lsx_vpickev_b(dst1, dst0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst0, dst_v, 0, 1); - src_argb0 += 64; - src_argb1 += 64; - dst_u += 8; - dst_v += 8; - } -} - -void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - int len = (width / 16) - 1; - __m128i src0, src1, src2, src3; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A}; - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vshuf_b(src0, src0, shuf); - tmp1 = __lsx_vshuf_b(src1, src1, shuf); - tmp2 = __lsx_vshuf_b(src2, src2, shuf); - tmp3 = __lsx_vshuf_b(src3, src3, shuf); - __lsx_vst(tmp0, dst_rgb, 0); - __lsx_vst(tmp1, dst_rgb, 12); - __lsx_vst(tmp2, dst_rgb, 24); - __lsx_vst(tmp3, dst_rgb, 36); - dst_rgb += 48; - src_argb += 64; - } - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vshuf_b(src0, src0, shuf); - tmp1 = __lsx_vshuf_b(src1, src1, shuf); - tmp2 = __lsx_vshuf_b(src2, src2, shuf); - tmp3 = __lsx_vshuf_b(src3, src3, shuf); - __lsx_vst(tmp0, dst_rgb, 0); - __lsx_vst(tmp1, dst_rgb, 12); - __lsx_vst(tmp2, dst_rgb, 24); - dst_rgb += 36; - __lsx_vst(tmp3, dst_rgb, 0); -} - -void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - int len = (width / 16) - 1; - __m128i src0, src1, src2, src3; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i shuf = {0x090A040506000102, 0x000000000C0D0E08}; - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vshuf_b(src0, src0, shuf); - tmp1 = __lsx_vshuf_b(src1, src1, shuf); - tmp2 = __lsx_vshuf_b(src2, src2, shuf); - tmp3 = __lsx_vshuf_b(src3, src3, shuf); - __lsx_vst(tmp0, dst_rgb, 0); - __lsx_vst(tmp1, dst_rgb, 12); - __lsx_vst(tmp2, dst_rgb, 24); - __lsx_vst(tmp3, dst_rgb, 36); - dst_rgb += 48; - src_argb += 64; - } - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vshuf_b(src0, src0, shuf); - tmp1 = __lsx_vshuf_b(src1, 
src1, shuf); - tmp2 = __lsx_vshuf_b(src2, src2, shuf); - tmp3 = __lsx_vshuf_b(src3, src3, shuf); - __lsx_vst(tmp0, dst_rgb, 0); - __lsx_vst(tmp1, dst_rgb, 12); - __lsx_vst(tmp2, dst_rgb, 24); - dst_rgb += 36; - __lsx_vst(tmp3, dst_rgb, 0); -} - -void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - int len = width / 8; - __m128i zero = __lsx_vldi(0); - __m128i src0, src1, tmp0, tmp1, dst0; - __m128i shift = {0x0300030003000300, 0x0300030003000300}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp0 = __lsx_vsrli_b(tmp0, 3); - tmp1 = __lsx_vpackev_b(zero, tmp1); - tmp1 = __lsx_vsrli_h(tmp1, 2); - tmp0 = __lsx_vsll_b(tmp0, shift); - tmp1 = __lsx_vslli_h(tmp1, 5); - dst0 = __lsx_vor_v(tmp0, tmp1); - __lsx_vst(dst0, dst_rgb, 0); - dst_rgb += 16; - src_argb += 32; - } -} - -void ARGBToARGB1555Row_LSX(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - int len = width / 8; - __m128i zero = __lsx_vldi(0); - __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0; - __m128i shift1 = {0x0703070307030703, 0x0703070307030703}; - __m128i shift2 = {0x0200020002000200, 0x0200020002000200}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp0 = __lsx_vsrli_b(tmp0, 3); - tmp1 = __lsx_vsrl_b(tmp1, shift1); - tmp0 = __lsx_vsll_b(tmp0, shift2); - tmp2 = __lsx_vpackev_b(zero, tmp1); - tmp3 = __lsx_vpackod_b(zero, tmp1); - tmp2 = __lsx_vslli_h(tmp2, 5); - tmp3 = __lsx_vslli_h(tmp3, 15); - dst0 = __lsx_vor_v(tmp0, tmp2); - dst0 = __lsx_vor_v(dst0, tmp3); - __lsx_vst(dst0, dst_rgb, 0); - dst_rgb += 16; - src_argb += 32; - } -} - -void ARGBToARGB4444Row_LSX(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - int len = width / 8; - __m128i src0, src1, tmp0, tmp1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp1 = __lsx_vandi_b(tmp1, 0xF0); - tmp0 = __lsx_vsrli_b(tmp0, 4); - dst0 = __lsx_vor_v(tmp1, tmp0); - __lsx_vst(dst0, dst_rgb, 0); - dst_rgb += 16; - src_argb += 32; - } -} - -void ARGBToUV444Row_LSX(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int32_t width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, reg2, reg3, dst0, dst1; - __m128i const_112 = __lsx_vldi(112); - __m128i const_74 = __lsx_vldi(74); - __m128i const_38 = __lsx_vldi(38); - __m128i const_94 = __lsx_vldi(94); - __m128i const_18 = __lsx_vldi(18); - __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickev_h(src1, src0); - tmp1 = __lsx_vpickod_h(src1, src0); - tmp2 = __lsx_vpickev_h(src3, src2); - tmp3 = __lsx_vpickod_h(src3, src2); - reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112); - reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112); - reg2 = __lsx_vmulwod_h_bu(tmp0, const_74); - reg3 = __lsx_vmulwod_h_bu(tmp2, const_74); - reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38); - reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38); - reg0 = __lsx_vsub_h(reg0, reg2); - reg1 = __lsx_vsub_h(reg1, reg3); - reg0 = 
__lsx_vsrai_h(reg0, 8); - reg1 = __lsx_vsrai_h(reg1, 8); - dst0 = __lsx_vpickev_b(reg1, reg0); - - reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112); - reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112); - reg2 = __lsx_vmulwev_h_bu(tmp0, const_18); - reg3 = __lsx_vmulwev_h_bu(tmp2, const_18); - reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94); - reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94); - reg0 = __lsx_vsub_h(reg0, reg2); - reg1 = __lsx_vsub_h(reg1, reg3); - reg0 = __lsx_vsrai_h(reg0, 8); - reg1 = __lsx_vsrai_h(reg1, 8); - dst1 = __lsx_vpickev_b(reg1, reg0); - - __lsx_vst(dst0, dst_u, 0); - __lsx_vst(dst1, dst_v, 0); - dst_u += 16; - dst_v += 16; - src_argb += 64; - } -} - -void ARGBMultiplyRow_LSX(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 4; - __m128i zero = __lsx_vldi(0); - __m128i src0, src1, dst0, dst1; - __m128i tmp0, tmp1, tmp2, tmp3; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); - tmp0 = __lsx_vilvl_b(src0, src0); - tmp1 = __lsx_vilvh_b(src0, src0); - tmp2 = __lsx_vilvl_b(zero, src1); - tmp3 = __lsx_vilvh_b(zero, src1); - dst0 = __lsx_vmuh_hu(tmp0, tmp2); - dst1 = __lsx_vmuh_hu(tmp1, tmp3); - dst0 = __lsx_vpickev_b(dst1, dst0); - __lsx_vst(dst0, dst_argb, 0); - src_argb0 += 16; - src_argb1 += 16; - dst_argb += 16; - } -} - -void ARGBAddRow_LSX(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 4; - __m128i src0, src1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); - dst0 = __lsx_vsadd_bu(src0, src1); - __lsx_vst(dst0, dst_argb, 0); - src_argb0 += 16; - src_argb1 += 16; - dst_argb += 16; - } -} - -void ARGBSubtractRow_LSX(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 4; - __m128i src0, src1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); - dst0 = __lsx_vssub_bu(src0, src1); - __lsx_vst(dst0, dst_argb, 0); - src_argb0 += 16; - src_argb1 += 16; - dst_argb += 16; - } -} - -void ARGBAttenuateRow_LSX(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 8; - __m128i src0, src1, tmp0, tmp1; - __m128i reg0, reg1, reg2, reg3, reg4, reg5; - __m128i b, g, r, a, dst0, dst1; - __m128i control = {0x0005000100040000, 0x0007000300060002}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - b = __lsx_vpackev_b(tmp0, tmp0); - r = __lsx_vpackod_b(tmp0, tmp0); - g = __lsx_vpackev_b(tmp1, tmp1); - a = __lsx_vpackod_b(tmp1, tmp1); - reg0 = __lsx_vmulwev_w_hu(b, a); - reg1 = __lsx_vmulwod_w_hu(b, a); - reg2 = __lsx_vmulwev_w_hu(r, a); - reg3 = __lsx_vmulwod_w_hu(r, a); - reg4 = __lsx_vmulwev_w_hu(g, a); - reg5 = __lsx_vmulwod_w_hu(g, a); - reg0 = __lsx_vssrani_h_w(reg1, reg0, 24); - reg2 = __lsx_vssrani_h_w(reg3, reg2, 24); - reg4 = __lsx_vssrani_h_w(reg5, reg4, 24); - reg0 = __lsx_vshuf_h(control, reg0, reg0); - reg2 = __lsx_vshuf_h(control, reg2, reg2); - reg4 = __lsx_vshuf_h(control, reg4, reg4); - tmp0 = __lsx_vpackev_b(reg4, reg0); - tmp1 = __lsx_vpackev_b(a, reg2); - dst0 = __lsx_vilvl_h(tmp1, tmp0); - dst1 = __lsx_vilvh_h(tmp1, tmp0); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - dst_argb += 32; - src_argb += 32; - } -} - -void 
ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - int x; - int len = width / 8; - __m128i src0, src1, tmp0, tmp1, dst0; - __m128i b, g, r; - __m128i zero = __lsx_vldi(0); - __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0); - - vec_dither = __lsx_vilvl_b(zero, vec_dither); - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - b = __lsx_vpackev_b(zero, tmp0); - r = __lsx_vpackod_b(zero, tmp0); - g = __lsx_vpackev_b(zero, tmp1); - b = __lsx_vadd_h(b, vec_dither); - g = __lsx_vadd_h(g, vec_dither); - r = __lsx_vadd_h(r, vec_dither); - DUP2_ARG1(__lsx_vclip255_h, b, g, b, g); - r = __lsx_vclip255_h(r); - b = __lsx_vsrai_h(b, 3); - g = __lsx_vsrai_h(g, 2); - r = __lsx_vsrai_h(r, 3); - g = __lsx_vslli_h(g, 5); - r = __lsx_vslli_h(r, 11); - dst0 = __lsx_vor_v(b, g); - dst0 = __lsx_vor_v(dst0, r); - __lsx_vst(dst0, dst_rgb, 0); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBShuffleRow_LSX(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - int x; - int len = width / 8; - __m128i src0, src1, dst0, dst1; - __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808}; - __m128i temp = __lsx_vldrepl_w(shuffler, 0); - - shuf = __lsx_vadd_b(shuf, temp); - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - dst0 = __lsx_vshuf_b(src0, src0, shuf); - dst1 = __lsx_vshuf_b(src1, src1, shuf); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBShadeRow_LSX(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - int x; - int len = width / 4; - __m128i src0, dst0, tmp0, tmp1; - __m128i vec_value = __lsx_vreplgr2vr_w(value); - - vec_value = __lsx_vilvl_b(vec_value, vec_value); - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_argb, 0); - tmp0 = __lsx_vilvl_b(src0, src0); - tmp1 = __lsx_vilvh_b(src0, src0); - tmp0 = __lsx_vmuh_hu(tmp0, vec_value); - tmp1 = __lsx_vmuh_hu(tmp1, vec_value); - dst0 = __lsx_vpickod_b(tmp1, tmp0); - __lsx_vst(dst0, dst_argb, 0); - src_argb += 16; - dst_argb += 16; - } -} - -void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - int x; - int len = width / 8; - __m128i src0, src1, tmp0, tmp1; - __m128i reg0, reg1, reg2, dst0, dst1; - __m128i const_128 = __lsx_vldi(0x480); - __m128i const_150 = __lsx_vldi(0x96); - __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - reg0 = __lsx_vdp2_h_bu(tmp0, const_br); - reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); - reg2 = __lsx_vadd_h(reg0, reg1); - tmp0 = __lsx_vpackod_b(reg2, reg2); - tmp1 = __lsx_vpackod_b(tmp1, reg2); - dst0 = __lsx_vilvl_h(tmp1, tmp0); - dst1 = __lsx_vilvh_h(tmp1, tmp0); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) { - int x; - int len = width / 8; - __m128i src0, src1, tmp0, tmp1; - __m128i reg0, reg1, spb, spg, spr; - __m128i dst0, dst1; - __m128i spb_g = __lsx_vldi(68); - __m128i spg_g = __lsx_vldi(88); - __m128i spr_g = __lsx_vldi(98); - __m128i spb_br = {0x2311231123112311, 0x2311231123112311}; - __m128i spg_br = {0x2D162D162D162D16, 
0x2D162D162D162D16}; - __m128i spr_br = {0x3218321832183218, 0x3218321832183218}; - __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg); - spr = __lsx_vdp2_h_bu(tmp0, spr_br); - spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g); - spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g); - spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g); - spb = __lsx_vsrli_h(spb, 7); - spg = __lsx_vsrli_h(spg, 7); - spr = __lsx_vsrli_h(spr, 7); - spg = __lsx_vsat_hu(spg, 7); - spr = __lsx_vsat_hu(spr, 7); - reg0 = __lsx_vpackev_b(spg, spb); - reg1 = __lsx_vshuf_b(tmp1, spr, shuff); - dst0 = __lsx_vilvl_h(reg1, reg0); - dst1 = __lsx_vilvh_h(reg1, reg0); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - dst_argb += 32; - } -} - -void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 16; - __m128i src0, src1; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, reg2, reg3; - __m128i dst0, dst1, dst2, dst3; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_argb4444, 0); - src1 = __lsx_vld(src_argb4444, 16); - tmp0 = __lsx_vandi_b(src0, 0x0F); - tmp1 = __lsx_vandi_b(src0, 0xF0); - tmp2 = __lsx_vandi_b(src1, 0x0F); - tmp3 = __lsx_vandi_b(src1, 0xF0); - reg0 = __lsx_vslli_b(tmp0, 4); - reg2 = __lsx_vslli_b(tmp2, 4); - reg1 = __lsx_vsrli_b(tmp1, 4); - reg3 = __lsx_vsrli_b(tmp3, 4); - DUP4_ARG2(__lsx_vor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, tmp0, - tmp1, tmp2, tmp3); - dst0 = __lsx_vilvl_b(tmp1, tmp0); - dst2 = __lsx_vilvl_b(tmp3, tmp2); - dst1 = __lsx_vilvh_b(tmp1, tmp0); - dst3 = __lsx_vilvh_b(tmp3, tmp2); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - __lsx_vst(dst2, dst_argb, 32); - __lsx_vst(dst3, dst_argb, 48); - dst_argb += 64; - src_argb4444 += 32; - } -} - -void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 16; - __m128i src0, src1; - __m128i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa; - __m128i reg0, reg1, reg2; - __m128i dst0, dst1, dst2, dst3; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_argb1555, 0); - src1 = __lsx_vld(src_argb1555, 16); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmpb = __lsx_vandi_b(tmp0, 0x1F); - tmpg = __lsx_vsrli_b(tmp0, 5); - reg0 = __lsx_vandi_b(tmp1, 0x03); - reg0 = __lsx_vslli_b(reg0, 3); - tmpg = __lsx_vor_v(tmpg, reg0); - reg1 = __lsx_vandi_b(tmp1, 0x7C); - tmpr = __lsx_vsrli_b(reg1, 2); - tmpa = __lsx_vsrli_b(tmp1, 7); - tmpa = __lsx_vneg_b(tmpa); - reg0 = __lsx_vslli_b(tmpb, 3); - reg1 = __lsx_vslli_b(tmpg, 3); - reg2 = __lsx_vslli_b(tmpr, 3); - tmpb = __lsx_vsrli_b(tmpb, 2); - tmpg = __lsx_vsrli_b(tmpg, 2); - tmpr = __lsx_vsrli_b(tmpr, 2); - tmpb = __lsx_vor_v(reg0, tmpb); - tmpg = __lsx_vor_v(reg1, tmpg); - tmpr = __lsx_vor_v(reg2, tmpr); - DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); - dst0 = __lsx_vilvl_h(reg1, reg0); - dst1 = __lsx_vilvh_h(reg1, reg0); - DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); - dst2 = __lsx_vilvl_h(reg1, reg0); - dst3 = __lsx_vilvh_h(reg1, reg0); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - __lsx_vst(dst2, dst_argb, 32); - __lsx_vst(dst3, dst_argb, 48); - dst_argb += 64; - src_argb1555 += 32; - } -} - -void RGB565ToARGBRow_LSX(const uint8_t* 
src_rgb565, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 16; - __m128i src0, src1; - __m128i tmp0, tmp1, tmpb, tmpg, tmpr; - __m128i reg0, reg1, dst0, dst1, dst2, dst3; - __m128i alpha = __lsx_vldi(0xFF); - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_rgb565, 0); - src1 = __lsx_vld(src_rgb565, 16); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmpb = __lsx_vandi_b(tmp0, 0x1F); - tmpr = __lsx_vandi_b(tmp1, 0xF8); - reg1 = __lsx_vandi_b(tmp1, 0x07); - reg0 = __lsx_vsrli_b(tmp0, 5); - reg1 = __lsx_vslli_b(reg1, 3); - tmpg = __lsx_vor_v(reg1, reg0); - reg0 = __lsx_vslli_b(tmpb, 3); - reg1 = __lsx_vsrli_b(tmpb, 2); - tmpb = __lsx_vor_v(reg1, reg0); - reg0 = __lsx_vslli_b(tmpg, 2); - reg1 = __lsx_vsrli_b(tmpg, 4); - tmpg = __lsx_vor_v(reg1, reg0); - reg0 = __lsx_vsrli_b(tmpr, 5); - tmpr = __lsx_vor_v(tmpr, reg0); - DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); - dst0 = __lsx_vilvl_h(reg1, reg0); - dst1 = __lsx_vilvh_h(reg1, reg0); - DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); - dst2 = __lsx_vilvl_h(reg1, reg0); - dst3 = __lsx_vilvh_h(reg1, reg0); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - __lsx_vst(dst2, dst_argb, 32); - __lsx_vst(dst3, dst_argb, 48); - dst_argb += 64; - src_rgb565 += 32; - } -} - -void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2; - __m128i tmp0, tmp1, tmp2; - __m128i dst0, dst1, dst2, dst3; - __m128i alpha = __lsx_vldi(0xFF); - __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514}; - __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100}; - __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C}; - __m128i shuf3 = {0x1005040310020100, 0x100B0A0910080706}; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_rgb24, 0); - src1 = __lsx_vld(src_rgb24, 16); - src2 = __lsx_vld(src_rgb24, 32); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); - tmp2 = __lsx_vshuf_b(src1, src2, shuf2); - DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, - tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - __lsx_vst(dst2, dst_argb, 32); - __lsx_vst(dst3, dst_argb, 48); - dst_argb += 64; - src_rgb24 += 48; - } -} - -void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2; - __m128i tmp0, tmp1, tmp2; - __m128i dst0, dst1, dst2, dst3; - __m128i alpha = __lsx_vldi(0xFF); - __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514}; - __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100}; - __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C}; - __m128i shuf3 = {0x1003040510000102, 0x10090A0B10060708}; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_raw, 0); - src1 = __lsx_vld(src_raw, 16); - src2 = __lsx_vld(src_raw, 32); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); - tmp2 = __lsx_vshuf_b(src1, src2, shuf2); - DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, - tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - __lsx_vst(dst2, dst_argb, 32); - __lsx_vst(dst3, dst_argb, 48); - dst_argb += 64; - src_raw += 48; - } -} - -void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - int x; - int len = width / 16; - __m128i 
src0, src1; - __m128i tmp0, tmp1, tmpb, tmpg, tmpr; - __m128i reg0, reg1, reg2, dst0; - __m128i const_66 = __lsx_vldi(66); - __m128i const_129 = __lsx_vldi(129); - __m128i const_25 = __lsx_vldi(25); - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_argb1555, 0); - src1 = __lsx_vld(src_argb1555, 16); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmpb = __lsx_vandi_b(tmp0, 0x1F); - tmpg = __lsx_vsrli_b(tmp0, 5); - reg0 = __lsx_vandi_b(tmp1, 0x03); - reg0 = __lsx_vslli_b(reg0, 3); - tmpg = __lsx_vor_v(tmpg, reg0); - reg1 = __lsx_vandi_b(tmp1, 0x7C); - tmpr = __lsx_vsrli_b(reg1, 2); - reg0 = __lsx_vslli_b(tmpb, 3); - reg1 = __lsx_vslli_b(tmpg, 3); - reg2 = __lsx_vslli_b(tmpr, 3); - tmpb = __lsx_vsrli_b(tmpb, 2); - tmpg = __lsx_vsrli_b(tmpg, 2); - tmpr = __lsx_vsrli_b(tmpr, 2); - tmpb = __lsx_vor_v(reg0, tmpb); - tmpg = __lsx_vor_v(reg1, tmpg); - tmpr = __lsx_vor_v(reg2, tmpr); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25); - reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25); - reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129); - reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129); - reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66); - reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66); - dst0 = __lsx_vpackod_b(reg1, reg0); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_argb1555 += 32; - } -} - -void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 16; - const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; - __m128i src0, src1, src2, src3; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m128i reg0, reg1, reg2, reg3, dst0; - __m128i const_112 = __lsx_vldi(0x438); - __m128i const_74 = __lsx_vldi(0x425); - __m128i const_38 = __lsx_vldi(0x413); - __m128i const_94 = __lsx_vldi(0x42F); - __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0, - next_argb1555, 16, src0, src1, src2, src3); - DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2); - DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3); - tmpb = __lsx_vandi_b(tmp0, 0x1F); - nexb = __lsx_vandi_b(tmp2, 0x1F); - tmpg = __lsx_vsrli_b(tmp0, 5); - nexg = __lsx_vsrli_b(tmp2, 5); - reg0 = __lsx_vandi_b(tmp1, 0x03); - reg2 = __lsx_vandi_b(tmp3, 0x03); - reg0 = __lsx_vslli_b(reg0, 3); - reg2 = __lsx_vslli_b(reg2, 3); - tmpg = __lsx_vor_v(tmpg, reg0); - nexg = __lsx_vor_v(nexg, reg2); - reg1 = __lsx_vandi_b(tmp1, 0x7C); - reg3 = __lsx_vandi_b(tmp3, 0x7C); - tmpr = __lsx_vsrli_b(reg1, 2); - nexr = __lsx_vsrli_b(reg3, 2); - reg0 = __lsx_vslli_b(tmpb, 3); - reg1 = __lsx_vslli_b(tmpg, 3); - reg2 = __lsx_vslli_b(tmpr, 3); - tmpb = __lsx_vsrli_b(tmpb, 2); - tmpg = __lsx_vsrli_b(tmpg, 2); - tmpr = __lsx_vsrli_b(tmpr, 2); - tmpb = __lsx_vor_v(reg0, tmpb); - tmpg = __lsx_vor_v(reg1, tmpg); - tmpr = __lsx_vor_v(reg2, tmpr); - reg0 = __lsx_vslli_b(nexb, 3); - reg1 = __lsx_vslli_b(nexg, 3); - reg2 = __lsx_vslli_b(nexr, 3); - nexb = __lsx_vsrli_b(nexb, 2); - nexg = __lsx_vsrli_b(nexg, 2); - nexr = __lsx_vsrli_b(nexr, 2); - nexb = __lsx_vor_v(reg0, nexb); - nexg = __lsx_vor_v(reg1, nexg); - nexr = __lsx_vor_v(reg2, nexr); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); 
- __lsx_vstelm_d(dst0, dst_v, 0, 1); - dst_u += 8; - dst_v += 8; - src_argb1555 += 32; - next_argb1555 += 32; - } -} - -void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1; - __m128i tmp0, tmp1, tmpb, tmpg, tmpr; - __m128i reg0, reg1, dst0; - __m128i const_66 = __lsx_vldi(66); - __m128i const_129 = __lsx_vldi(129); - __m128i const_25 = __lsx_vldi(25); - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_rgb565, 0); - src1 = __lsx_vld(src_rgb565, 16); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmpb = __lsx_vandi_b(tmp0, 0x1F); - tmpr = __lsx_vandi_b(tmp1, 0xF8); - reg1 = __lsx_vandi_b(tmp1, 0x07); - reg0 = __lsx_vsrli_b(tmp0, 5); - reg1 = __lsx_vslli_b(reg1, 3); - tmpg = __lsx_vor_v(reg1, reg0); - reg0 = __lsx_vslli_b(tmpb, 3); - reg1 = __lsx_vsrli_b(tmpb, 2); - tmpb = __lsx_vor_v(reg1, reg0); - reg0 = __lsx_vslli_b(tmpg, 2); - reg1 = __lsx_vsrli_b(tmpg, 4); - tmpg = __lsx_vor_v(reg1, reg0); - reg0 = __lsx_vsrli_b(tmpr, 5); - tmpr = __lsx_vor_v(tmpr, reg0); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25); - reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25); - reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129); - reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129); - reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66); - reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66); - dst0 = __lsx_vpackod_b(reg1, reg0); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_rgb565 += 32; - } -} - -void RGB565ToUVRow_LSX(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 16; - const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; - __m128i src0, src1, src2, src3; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m128i reg0, reg1, reg2, reg3, dst0; - __m128i const_112 = __lsx_vldi(0x438); - __m128i const_74 = __lsx_vldi(0x425); - __m128i const_38 = __lsx_vldi(0x413); - __m128i const_94 = __lsx_vldi(0x42F); - __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0, - next_rgb565, 16, src0, src1, src2, src3); - DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2); - DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3); - tmpb = __lsx_vandi_b(tmp0, 0x1F); - tmpr = __lsx_vandi_b(tmp1, 0xF8); - nexb = __lsx_vandi_b(tmp2, 0x1F); - nexr = __lsx_vandi_b(tmp3, 0xF8); - reg1 = __lsx_vandi_b(tmp1, 0x07); - reg3 = __lsx_vandi_b(tmp3, 0x07); - reg0 = __lsx_vsrli_b(tmp0, 5); - reg1 = __lsx_vslli_b(reg1, 3); - reg2 = __lsx_vsrli_b(tmp2, 5); - reg3 = __lsx_vslli_b(reg3, 3); - tmpg = __lsx_vor_v(reg1, reg0); - nexg = __lsx_vor_v(reg2, reg3); - reg0 = __lsx_vslli_b(tmpb, 3); - reg1 = __lsx_vsrli_b(tmpb, 2); - reg2 = __lsx_vslli_b(nexb, 3); - reg3 = __lsx_vsrli_b(nexb, 2); - tmpb = __lsx_vor_v(reg1, reg0); - nexb = __lsx_vor_v(reg2, reg3); - reg0 = __lsx_vslli_b(tmpg, 2); - reg1 = __lsx_vsrli_b(tmpg, 4); - reg2 = __lsx_vslli_b(nexg, 2); - reg3 = __lsx_vsrli_b(nexg, 4); - tmpg = __lsx_vor_v(reg1, reg0); - nexg = __lsx_vor_v(reg2, reg3); - reg0 = __lsx_vsrli_b(tmpr, 5); - reg2 = __lsx_vsrli_b(nexr, 5); - tmpr = __lsx_vor_v(tmpr, reg0); - nexr = __lsx_vor_v(nexr, reg2); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); - __lsx_vstelm_d(dst0, 
dst_u, 0, 0); - __lsx_vstelm_d(dst0, dst_v, 0, 1); - dst_u += 8; - dst_v += 8; - src_rgb565 += 32; - next_rgb565 += 32; - } -} - -void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24; - int len = width / 16; - __m128i src0, src1, src2; - __m128i nex0, nex1, nex2, dst0; - __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m128i const_112 = __lsx_vldi(0x438); - __m128i const_74 = __lsx_vldi(0x425); - __m128i const_38 = __lsx_vldi(0x413); - __m128i const_94 = __lsx_vldi(0x42F); - __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18}; - __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908}; - __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19}; - __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908}; - __m128i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A}; - __m128i shuff1_r = {0x0706050403020100, 0x1F1C191613100908}; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_rgb24, 0); - src1 = __lsx_vld(src_rgb24, 16); - src2 = __lsx_vld(src_rgb24, 32); - nex0 = __lsx_vld(next_rgb24, 0); - nex1 = __lsx_vld(next_rgb24, 16); - nex2 = __lsx_vld(next_rgb24, 32); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, - nexb); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, - nexg); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, - nexr); - DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, - nexb); - DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, - nexg); - DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, - nexr); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst0, dst_v, 0, 1); - dst_u += 8; - dst_v += 8; - src_rgb24 += 48; - next_rgb24 += 48; - } -} - -void RAWToUVRow_LSX(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* next_raw = src_raw + src_stride_raw; - int len = width / 16; - __m128i src0, src1, src2; - __m128i nex0, nex1, nex2, dst0; - __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m128i const_112 = __lsx_vldi(0x438); - __m128i const_74 = __lsx_vldi(0x425); - __m128i const_38 = __lsx_vldi(0x413); - __m128i const_94 = __lsx_vldi(0x42F); - __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18}; - __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908}; - __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19}; - __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908}; - __m128i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A}; - __m128i shuff1_b = {0x0706050403020100, 0x1F1C191613100908}; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_raw, 0); - src1 = __lsx_vld(src_raw, 16); - src2 = __lsx_vld(src_raw, 32); - nex0 = __lsx_vld(next_raw, 0); - nex1 = __lsx_vld(next_raw, 16); - nex2 = __lsx_vld(next_raw, 32); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, - nexb); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, - nexg); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, - nexr); - 
DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, - nexb); - DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, - nexg); - DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, - nexr); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst0, dst_v, 0, 1); - dst_u += 8; - dst_v += 8; - src_raw += 48; - next_raw += 48; - } -} - -void NV12ToARGBRow_LSX(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 8; - __m128i vec_y, vec_vu; - __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; - __m128i vec_vrub, vec_vgug; - __m128i out_b, out_g, out_r; - __m128i const_80 = __lsx_vldi(0x480); - __m128i alpha = __lsx_vldi(0xFF); - __m128i zero = __lsx_vldi(0); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); - vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); - - for (x = 0; x < len; x++) { - vec_y = __lsx_vld(src_y, 0); - vec_vu = __lsx_vld(src_uv, 0); - YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, - out_r); - STOREARGB(alpha, out_r, out_g, out_b, dst_argb); - src_y += 8; - src_uv += 8; - } -} - -void NV12ToRGB565Row_LSX(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 8; - __m128i vec_y, vec_vu; - __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; - __m128i vec_vrub, vec_vgug; - __m128i out_b, out_g, out_r; - __m128i const_80 = __lsx_vldi(0x480); - __m128i zero = __lsx_vldi(0); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); - vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); - - for (x = 0; x < len; x++) { - vec_y = __lsx_vld(src_y, 0); - vec_vu = __lsx_vld(src_uv, 0); - YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, - out_r); - out_b = __lsx_vsrli_h(out_b, 3); - out_g = __lsx_vsrli_h(out_g, 2); - out_r = __lsx_vsrli_h(out_r, 3); - out_g = __lsx_vslli_h(out_g, 5); - out_r = __lsx_vslli_h(out_r, 11); - out_r = __lsx_vor_v(out_r, out_g); - out_r = __lsx_vor_v(out_r, out_b); - __lsx_vst(out_r, dst_rgb565, 0); - src_y += 8; - src_uv += 8; - dst_rgb565 += 16; - } -} - -void NV21ToARGBRow_LSX(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 8; - __m128i vec_y, vec_uv; - __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; - __m128i vec_ubvr, vec_ugvg; - __m128i out_b, out_g, out_r; - __m128i const_80 = __lsx_vldi(0x480); - __m128i alpha = __lsx_vldi(0xFF); - __m128i zero = __lsx_vldi(0); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); - vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - vec_y = __lsx_vld(src_y, 0); - vec_uv = __lsx_vld(src_vu, 0); - YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_r, out_g, - out_b); - STOREARGB(alpha, out_r, out_g, out_b, dst_argb); - src_y += 8; - src_vu += 8; - } -} - -void SobelRow_LSX(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 16; - __m128i src0, src1, tmp0; - __m128i out0, out1, out2, out3; - __m128i alpha = __lsx_vldi(0xFF); - __m128i 
shuff0 = {0x1001010110000000, 0x1003030310020202}; - __m128i shuff1 = __lsx_vaddi_bu(shuff0, 0x04); - __m128i shuff2 = __lsx_vaddi_bu(shuff1, 0x04); - __m128i shuff3 = __lsx_vaddi_bu(shuff2, 0x04); - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_sobelx, 0); - src1 = __lsx_vld(src_sobely, 0); - tmp0 = __lsx_vsadd_bu(src0, src1); - DUP4_ARG3(__lsx_vshuf_b, alpha, tmp0, shuff0, alpha, tmp0, shuff1, alpha, - tmp0, shuff2, alpha, tmp0, shuff3, out0, out1, out2, out3); - __lsx_vst(out0, dst_argb, 0); - __lsx_vst(out1, dst_argb, 16); - __lsx_vst(out2, dst_argb, 32); - __lsx_vst(out3, dst_argb, 48); - src_sobelx += 16; - src_sobely += 16; - dst_argb += 64; - } -} - -void SobelToPlaneRow_LSX(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - int x; - int len = width / 32; - __m128i src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_sobelx, 0, src_sobelx, 16, src0, src1); - DUP2_ARG2(__lsx_vld, src_sobely, 0, src_sobely, 16, src2, src3); - dst0 = __lsx_vsadd_bu(src0, src2); - dst1 = __lsx_vsadd_bu(src1, src3); - __lsx_vst(dst0, dst_y, 0); - __lsx_vst(dst1, dst_y, 16); - src_sobelx += 32; - src_sobely += 32; - dst_y += 32; - } -} - -void SobelXYRow_LSX(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 16; - __m128i src_r, src_b, src_g; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i dst0, dst1, dst2, dst3; - __m128i alpha = __lsx_vldi(0xFF); - - for (x = 0; x < len; x++) { - src_r = __lsx_vld(src_sobelx, 0); - src_b = __lsx_vld(src_sobely, 0); - src_g = __lsx_vsadd_bu(src_r, src_b); - tmp0 = __lsx_vilvl_b(src_g, src_b); - tmp1 = __lsx_vilvh_b(src_g, src_b); - tmp2 = __lsx_vilvl_b(alpha, src_r); - tmp3 = __lsx_vilvh_b(alpha, src_r); - dst0 = __lsx_vilvl_h(tmp2, tmp0); - dst1 = __lsx_vilvh_h(tmp2, tmp0); - dst2 = __lsx_vilvl_h(tmp3, tmp1); - dst3 = __lsx_vilvh_h(tmp3, tmp1); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - __lsx_vst(dst2, dst_argb, 32); - __lsx_vst(dst3, dst_argb, 48); - src_sobelx += 16; - src_sobely += 16; - dst_argb += 64; - } -} - -void BGRAToUVRow_LSX(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* next_bgra = src_bgra + src_stride_bgra; - int len = width / 16; - __m128i src0, src1, src2, src3; - __m128i nex0, nex1, nex2, nex3; - __m128i tmp0, tmp1, tmp2, tmp3, dst0; - __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m128i const_112 = __lsx_vldi(0x438); - __m128i const_74 = __lsx_vldi(0x425); - __m128i const_38 = __lsx_vldi(0x413); - __m128i const_94 = __lsx_vldi(0x42F); - __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48, - src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, next_bgra, 0, next_bgra, 16, next_bgra, 32, next_bgra, - 48, nex0, nex1, nex2, nex3); - tmp0 = __lsx_vpickod_b(src1, src0); - tmp1 = __lsx_vpickev_b(src1, src0); - tmp2 = __lsx_vpickod_b(src3, src2); - tmp3 = __lsx_vpickev_b(src3, src2); - tmpb = __lsx_vpickod_b(tmp2, tmp0); - tmpr = __lsx_vpickev_b(tmp2, tmp0); - tmpg = __lsx_vpickod_b(tmp3, tmp1); - tmp0 = __lsx_vpickod_b(nex1, nex0); - tmp1 = __lsx_vpickev_b(nex1, nex0); - tmp2 = __lsx_vpickod_b(nex3, nex2); - tmp3 = __lsx_vpickev_b(nex3, nex2); - nexb = __lsx_vpickod_b(tmp2, tmp0); - nexr = __lsx_vpickev_b(tmp2, tmp0); - nexg = 
__lsx_vpickod_b(tmp3, tmp1); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst0, dst_v, 0, 1); - dst_u += 8; - dst_v += 8; - src_bgra += 64; - next_bgra += 64; - } -} - -void ABGRToUVRow_LSX(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* next_abgr = src_abgr + src_stride_abgr; - int len = width / 16; - __m128i src0, src1, src2, src3; - __m128i nex0, nex1, nex2, nex3; - __m128i tmp0, tmp1, tmp2, tmp3, dst0; - __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m128i const_112 = __lsx_vldi(0x438); - __m128i const_74 = __lsx_vldi(0x425); - __m128i const_38 = __lsx_vldi(0x413); - __m128i const_94 = __lsx_vldi(0x42F); - __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48, - src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, next_abgr, 0, next_abgr, 16, next_abgr, 32, next_abgr, - 48, nex0, nex1, nex2, nex3); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp2 = __lsx_vpickev_b(src3, src2); - tmp3 = __lsx_vpickod_b(src3, src2); - tmpb = __lsx_vpickod_b(tmp2, tmp0); - tmpr = __lsx_vpickev_b(tmp2, tmp0); - tmpg = __lsx_vpickev_b(tmp3, tmp1); - tmp0 = __lsx_vpickev_b(nex1, nex0); - tmp1 = __lsx_vpickod_b(nex1, nex0); - tmp2 = __lsx_vpickev_b(nex3, nex2); - tmp3 = __lsx_vpickod_b(nex3, nex2); - nexb = __lsx_vpickod_b(tmp2, tmp0); - nexr = __lsx_vpickev_b(tmp2, tmp0); - nexg = __lsx_vpickev_b(tmp3, tmp1); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst0, dst_v, 0, 1); - dst_u += 8; - dst_v += 8; - src_abgr += 64; - next_abgr += 64; - } -} - -void RGBAToUVRow_LSX(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* next_rgba = src_rgba + src_stride_rgba; - int len = width / 16; - __m128i src0, src1, src2, src3; - __m128i nex0, nex1, nex2, nex3; - __m128i tmp0, tmp1, tmp2, tmp3, dst0; - __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m128i const_112 = __lsx_vldi(0x438); - __m128i const_74 = __lsx_vldi(0x425); - __m128i const_38 = __lsx_vldi(0x413); - __m128i const_94 = __lsx_vldi(0x42F); - __m128i const_18 = __lsx_vldi(0x409); - __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48, - src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, next_rgba, 0, next_rgba, 16, next_rgba, 32, next_rgba, - 48, nex0, nex1, nex2, nex3); - tmp0 = __lsx_vpickod_b(src1, src0); - tmp1 = __lsx_vpickev_b(src1, src0); - tmp2 = __lsx_vpickod_b(src3, src2); - tmp3 = __lsx_vpickev_b(src3, src2); - tmpr = __lsx_vpickod_b(tmp2, tmp0); - tmpb = __lsx_vpickev_b(tmp2, tmp0); - tmpg = __lsx_vpickod_b(tmp3, tmp1); - tmp0 = __lsx_vpickod_b(nex1, nex0); - tmp1 = __lsx_vpickev_b(nex1, nex0); - tmp2 = __lsx_vpickod_b(nex3, nex2); - tmp3 = __lsx_vpickev_b(nex3, nex2); - nexr = __lsx_vpickod_b(tmp2, tmp0); - nexb = __lsx_vpickev_b(tmp2, tmp0); - nexg = __lsx_vpickod_b(tmp3, tmp1); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst0, dst_v, 0, 1); - dst_u += 8; - dst_v += 8; - src_rgba += 64; - next_rgba += 64; - } -} - -void ARGBToUVJRow_LSX(const uint8_t* src_argb, - int src_stride_argb, - 
uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* next_argb = src_argb + src_stride_argb; - int len = width / 16; - __m128i src0, src1, src2, src3; - __m128i nex0, nex1, nex2, nex3; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, dst0; - __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; - __m128i const_63 = __lsx_vldi(0x43F); - __m128i const_42 = __lsx_vldi(0x42A); - __m128i const_21 = __lsx_vldi(0x415); - __m128i const_53 = __lsx_vldi(0x435); - __m128i const_10 = __lsx_vldi(0x40A); - __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, next_argb, 0, next_argb, 16, next_argb, 32, next_argb, - 48, nex0, nex1, nex2, nex3); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp2 = __lsx_vpickev_b(src3, src2); - tmp3 = __lsx_vpickod_b(src3, src2); - tmpr = __lsx_vpickod_b(tmp2, tmp0); - tmpb = __lsx_vpickev_b(tmp2, tmp0); - tmpg = __lsx_vpickev_b(tmp3, tmp1); - tmp0 = __lsx_vpickev_b(nex1, nex0); - tmp1 = __lsx_vpickod_b(nex1, nex0); - tmp2 = __lsx_vpickev_b(nex3, nex2); - tmp3 = __lsx_vpickod_b(nex3, nex2); - nexr = __lsx_vpickod_b(tmp2, tmp0); - nexb = __lsx_vpickev_b(tmp2, tmp0); - nexg = __lsx_vpickev_b(tmp3, tmp1); - tmp0 = __lsx_vaddwev_h_bu(tmpb, nexb); - tmp1 = __lsx_vaddwod_h_bu(tmpb, nexb); - tmp2 = __lsx_vaddwev_h_bu(tmpg, nexg); - tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg); - reg0 = __lsx_vaddwev_h_bu(tmpr, nexr); - reg1 = __lsx_vaddwod_h_bu(tmpr, nexr); - tmpb = __lsx_vavgr_hu(tmp0, tmp1); - tmpg = __lsx_vavgr_hu(tmp2, tmp3); - tmpr = __lsx_vavgr_hu(reg0, reg1); - reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb); - reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr); - reg0 = __lsx_vmsub_h(reg0, const_42, tmpg); - reg1 = __lsx_vmsub_h(reg1, const_53, tmpg); - reg0 = __lsx_vmsub_h(reg0, const_21, tmpr); - reg1 = __lsx_vmsub_h(reg1, const_10, tmpb); - dst0 = __lsx_vpickod_b(reg1, reg0); - __lsx_vstelm_d(dst0, dst_u, 0, 0); - __lsx_vstelm_d(dst0, dst_v, 0, 1); - dst_u += 8; - dst_v += 8; - src_argb += 64; - next_argb += 64; - } -} - -void I444ToARGBRow_LSX(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m128i vec_y, vec_u, vec_v, out_b, out_g, out_r; - __m128i vec_yl, vec_yh, vec_ul, vec_vl, vec_uh, vec_vh; - __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb, vec_ugvg; - __m128i const_80 = __lsx_vldi(0x480); - __m128i alpha = __lsx_vldi(0xFF); - __m128i zero = __lsx_vldi(0); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); - - for (x = 0; x < len; x++) { - vec_y = __lsx_vld(src_y, 0); - vec_u = __lsx_vld(src_u, 0); - vec_v = __lsx_vld(src_v, 0); - vec_yl = __lsx_vilvl_b(vec_y, vec_y); - vec_ul = __lsx_vilvl_b(zero, vec_u); - vec_vl = __lsx_vilvl_b(zero, vec_v); - I444TORGB(vec_yl, vec_ul, vec_vl, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb, - out_b, out_g, out_r); - STOREARGB(alpha, out_r, out_g, out_b, dst_argb); - vec_yh = __lsx_vilvh_b(vec_y, vec_y); - vec_uh = __lsx_vilvh_b(zero, vec_u); - vec_vh = __lsx_vilvh_b(zero, vec_v); - I444TORGB(vec_yh, vec_uh, vec_vh, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb, - out_b, out_g, out_r); - STOREARGB(alpha, out_r, out_g, out_b, dst_argb); - src_y += 16; - src_u += 16; - src_v += 16; - } -} - -void 
I400ToARGBRow_LSX(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 16; - __m128i vec_y, vec_yl, vec_yh, out0; - __m128i y_ev, y_od, dst0, dst1, dst2, dst3; - __m128i temp0, temp1; - __m128i alpha = __lsx_vldi(0xFF); - __m128i vec_yg = __lsx_vreplgr2vr_h(yuvconstants->kYToRgb[0]); - __m128i vec_yb = __lsx_vreplgr2vr_w(yuvconstants->kYBiasToRgb[0]); - - for (x = 0; x < len; x++) { - vec_y = __lsx_vld(src_y, 0); - vec_yl = __lsx_vilvl_b(vec_y, vec_y); - y_ev = __lsx_vmulwev_w_hu_h(vec_yl, vec_yg); - y_od = __lsx_vmulwod_w_hu_h(vec_yl, vec_yg); - y_ev = __lsx_vsrai_w(y_ev, 16); - y_od = __lsx_vsrai_w(y_od, 16); - y_ev = __lsx_vadd_w(y_ev, vec_yb); - y_od = __lsx_vadd_w(y_od, vec_yb); - y_ev = __lsx_vsrai_w(y_ev, 6); - y_od = __lsx_vsrai_w(y_od, 6); - y_ev = __lsx_vclip255_w(y_ev); - y_od = __lsx_vclip255_w(y_od); - out0 = __lsx_vpackev_h(y_od, y_ev); - temp0 = __lsx_vpackev_b(out0, out0); - temp1 = __lsx_vpackev_b(alpha, out0); - dst0 = __lsx_vilvl_h(temp1, temp0); - dst1 = __lsx_vilvh_h(temp1, temp0); - vec_yh = __lsx_vilvh_b(vec_y, vec_y); - y_ev = __lsx_vmulwev_w_hu_h(vec_yh, vec_yg); - y_od = __lsx_vmulwod_w_hu_h(vec_yh, vec_yg); - y_ev = __lsx_vsrai_w(y_ev, 16); - y_od = __lsx_vsrai_w(y_od, 16); - y_ev = __lsx_vadd_w(y_ev, vec_yb); - y_od = __lsx_vadd_w(y_od, vec_yb); - y_ev = __lsx_vsrai_w(y_ev, 6); - y_od = __lsx_vsrai_w(y_od, 6); - y_ev = __lsx_vclip255_w(y_ev); - y_od = __lsx_vclip255_w(y_od); - out0 = __lsx_vpackev_h(y_od, y_ev); - temp0 = __lsx_vpackev_b(out0, out0); - temp1 = __lsx_vpackev_b(alpha, out0); - dst2 = __lsx_vilvl_h(temp1, temp0); - dst3 = __lsx_vilvh_h(temp1, temp0); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - __lsx_vst(dst2, dst_argb, 32); - __lsx_vst(dst3, dst_argb, 48); - dst_argb += 64; - src_y += 16; - } -} - -void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width) { - int x; - int len = width / 16; - __m128i vec_y, dst0, dst1, dst2, dst3; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i alpha = __lsx_vldi(0xFF); - - for (x = 0; x < len; x++) { - vec_y = __lsx_vld(src_y, 0); - tmp0 = __lsx_vilvl_b(vec_y, vec_y); - tmp1 = __lsx_vilvh_b(vec_y, vec_y); - tmp2 = __lsx_vilvl_b(alpha, vec_y); - tmp3 = __lsx_vilvh_b(alpha, vec_y); - dst0 = __lsx_vilvl_h(tmp2, tmp0); - dst1 = __lsx_vilvh_h(tmp2, tmp0); - dst2 = __lsx_vilvl_h(tmp3, tmp1); - dst3 = __lsx_vilvh_h(tmp3, tmp1); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - __lsx_vst(dst2, dst_argb, 32); - __lsx_vst(dst3, dst_argb, 48); - dst_argb += 64; - src_y += 16; - } -} - -void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 8; - __m128i src0, vec_y, vec_vu; - __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; - __m128i vec_vrub, vec_vgug; - __m128i out_b, out_g, out_r; - __m128i const_80 = __lsx_vldi(0x480); - __m128i zero = __lsx_vldi(0); - __m128i alpha = __lsx_vldi(0xFF); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); - vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_yuy2, 0); - vec_y = __lsx_vpickev_b(src0, src0); - vec_vu = __lsx_vpickod_b(src0, src0); - YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, - out_r); - STOREARGB(alpha, out_r, out_g, out_b, dst_argb); - src_yuy2 += 16; - } -} - -void UYVYToARGBRow_LSX(const 
uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int len = width / 8; - __m128i src0, vec_y, vec_vu; - __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; - __m128i vec_vrub, vec_vgug; - __m128i out_b, out_g, out_r; - __m128i const_80 = __lsx_vldi(0x480); - __m128i zero = __lsx_vldi(0); - __m128i alpha = __lsx_vldi(0xFF); - - YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); - vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); - vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_uyvy, 0); - vec_y = __lsx_vpickod_b(src0, src0); - vec_vu = __lsx_vpickev_b(src0, src0); - YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, - out_r); - STOREARGB(alpha, out_r, out_g, out_b, dst_argb); - src_uyvy += 16; - } -} - -void InterpolateRow_LSX(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int32_t source_y_fraction) { - int x; - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t* nex_ptr = src_ptr + src_stride; - uint16_t y_fractions; - int len = width / 32; - __m128i src0, src1, nex0, nex1; - __m128i dst0, dst1, y_frac; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i const_128 = __lsx_vldi(0x480); - - if (y1_fraction == 0) { - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); - __lsx_vst(src0, dst_ptr, 0); - __lsx_vst(src1, dst_ptr, 16); - src_ptr += 32; - dst_ptr += 32; - } - return; - } - - if (y1_fraction == 128) { - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); - DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1); - dst0 = __lsx_vavgr_bu(src0, nex0); - dst1 = __lsx_vavgr_bu(src1, nex1); - __lsx_vst(dst0, dst_ptr, 0); - __lsx_vst(dst1, dst_ptr, 16); - src_ptr += 32; - nex_ptr += 32; - dst_ptr += 32; - } - return; - } - - y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); - y_frac = __lsx_vreplgr2vr_h(y_fractions); - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); - DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1); - tmp0 = __lsx_vilvl_b(nex0, src0); - tmp1 = __lsx_vilvh_b(nex0, src0); - tmp2 = __lsx_vilvl_b(nex1, src1); - tmp3 = __lsx_vilvh_b(nex1, src1); - tmp0 = __lsx_vdp2add_h_bu(const_128, tmp0, y_frac); - tmp1 = __lsx_vdp2add_h_bu(const_128, tmp1, y_frac); - tmp2 = __lsx_vdp2add_h_bu(const_128, tmp2, y_frac); - tmp3 = __lsx_vdp2add_h_bu(const_128, tmp3, y_frac); - dst0 = __lsx_vsrlni_b_h(tmp1, tmp0, 8); - dst1 = __lsx_vsrlni_b_h(tmp3, tmp2, 8); - __lsx_vst(dst0, dst_ptr, 0); - __lsx_vst(dst1, dst_ptr, 16); - src_ptr += 32; - nex_ptr += 32; - dst_ptr += 32; - } -} - -void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width) { - int x; - int len = width / 4; - __m128i dst0 = __lsx_vreplgr2vr_w(v32); - - for (x = 0; x < len; x++) { - __lsx_vst(dst0, dst_argb, 0); - dst_argb += 16; - } -} - -void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2; - __m128i dst0, dst1, dst2; - __m128i shuf0 = {0x0708030405000102, 0x110C0D0E090A0B06}; - __m128i shuf1 = {0x1516171213140F10, 0x1F1E1B1C1D18191A}; - __m128i shuf2 = {0x090405060102031E, 0x0D0E0F0A0B0C0708}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_raw, 0, src_raw, 16, src0, src1); - src2 = __lsx_vld(src_raw, 32); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src0, shuf1, 
dst0, dst1); - dst2 = __lsx_vshuf_b(src1, src2, shuf2); - dst1 = __lsx_vinsgr2vr_b(dst1, src_raw[32], 0x0E); - __lsx_vst(dst0, dst_rgb24, 0); - __lsx_vst(dst1, dst_rgb24, 16); - __lsx_vst(dst2, dst_rgb24, 32); - dst_rgb24 += 48; - src_raw += 48; - } -} - -void MergeUVRow_LSX(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - int x; - int len = width / 16; - __m128i src0, src1, dst0, dst1; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src0, src1); - dst0 = __lsx_vilvl_b(src1, src0); - dst1 = __lsx_vilvh_b(src1, src0); - __lsx_vst(dst0, dst_uv, 0); - __lsx_vst(dst1, dst_uv, 16); - src_u += 16; - src_v += 16; - dst_uv += 32; - } -} - -void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, tmp0, tmp1, dst0; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickod_b(src1, src0); - tmp1 = __lsx_vpickod_b(src3, src2); - dst0 = __lsx_vpickod_b(tmp1, tmp0); - __lsx_vst(dst0, dst_a, 0); - src_argb += 64; - dst_a += 16; - } -} - -void ARGBBlendRow_LSX(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - int len = width / 8; - __m128i src0, src1, src2, src3; - __m128i tmp0, tmp1, dst0, dst1; - __m128i reg0, reg1, reg2, reg3; - __m128i a0, a1, a2, a3; - __m128i const_256 = __lsx_vldi(0x500); - __m128i zero = __lsx_vldi(0); - __m128i alpha = __lsx_vldi(0xFF); - __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16, - src0, src1, src2, src3); - tmp0 = __lsx_vshuf4i_b(src0, 0xFF); - tmp1 = __lsx_vshuf4i_b(src1, 0xFF); - a0 = __lsx_vilvl_b(zero, tmp0); - a1 = __lsx_vilvh_b(zero, tmp0); - a2 = __lsx_vilvl_b(zero, tmp1); - a3 = __lsx_vilvh_b(zero, tmp1); - reg0 = __lsx_vilvl_b(zero, src2); - reg1 = __lsx_vilvh_b(zero, src2); - reg2 = __lsx_vilvl_b(zero, src3); - reg3 = __lsx_vilvh_b(zero, src3); - DUP4_ARG2(__lsx_vsub_h, const_256, a0, const_256, a1, const_256, a2, - const_256, a3, a0, a1, a2, a3); - DUP4_ARG2(__lsx_vmul_h, a0, reg0, a1, reg1, a2, reg2, a3, reg3, reg0, reg1, - reg2, reg3); - DUP2_ARG3(__lsx_vsrani_b_h, reg1, reg0, 8, reg3, reg2, 8, dst0, dst1); - dst0 = __lsx_vsadd_bu(dst0, src0); - dst1 = __lsx_vsadd_bu(dst1, src1); - dst0 = __lsx_vbitsel_v(dst0, alpha, control); - dst1 = __lsx_vbitsel_v(dst1, alpha, control); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBQuantizeRow_LSX(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - __m128i vec_size = __lsx_vreplgr2vr_b(interval_size); - __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset); - __m128i vec_scale = __lsx_vreplgr2vr_w(scale); - __m128i zero = __lsx_vldi(0); - __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48, - src0, src1, src2, src3); - reg0 = __lsx_vilvl_b(zero, src0); - reg1 = __lsx_vilvh_b(zero, src0); - reg2 = __lsx_vilvl_b(zero, 
src1); - reg3 = __lsx_vilvh_b(zero, src1); - reg4 = __lsx_vilvl_b(zero, src2); - reg5 = __lsx_vilvh_b(zero, src2); - reg6 = __lsx_vilvl_b(zero, src3); - reg7 = __lsx_vilvh_b(zero, src3); - tmp0 = __lsx_vilvl_h(zero, reg0); - tmp1 = __lsx_vilvh_h(zero, reg0); - tmp2 = __lsx_vilvl_h(zero, reg1); - tmp3 = __lsx_vilvh_h(zero, reg1); - tmp4 = __lsx_vilvl_h(zero, reg2); - tmp5 = __lsx_vilvh_h(zero, reg2); - tmp6 = __lsx_vilvl_h(zero, reg3); - tmp7 = __lsx_vilvh_h(zero, reg3); - DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale, - tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale, - tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7); - DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16, - tmp7, tmp6, 16, reg0, reg1, reg2, reg3); - dst0 = __lsx_vpickev_b(reg1, reg0); - dst1 = __lsx_vpickev_b(reg3, reg2); - tmp0 = __lsx_vilvl_h(zero, reg4); - tmp1 = __lsx_vilvh_h(zero, reg4); - tmp2 = __lsx_vilvl_h(zero, reg5); - tmp3 = __lsx_vilvh_h(zero, reg5); - tmp4 = __lsx_vilvl_h(zero, reg6); - tmp5 = __lsx_vilvh_h(zero, reg6); - tmp6 = __lsx_vilvl_h(zero, reg7); - tmp7 = __lsx_vilvh_h(zero, reg7); - DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale, - tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale, - tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7); - DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16, - tmp7, tmp6, 16, reg0, reg1, reg2, reg3); - dst2 = __lsx_vpickev_b(reg1, reg0); - dst3 = __lsx_vpickev_b(reg3, reg2); - DUP4_ARG2(__lsx_vmul_b, dst0, vec_size, dst1, vec_size, dst2, vec_size, - dst3, vec_size, dst0, dst1, dst2, dst3); - DUP4_ARG2(__lsx_vadd_b, dst0, vec_offset, dst1, vec_offset, dst2, - vec_offset, dst3, vec_offset, dst0, dst1, dst2, dst3); - DUP4_ARG3(__lsx_vbitsel_v, dst0, src0, control, dst1, src1, control, dst2, - src2, control, dst3, src3, control, dst0, dst1, dst2, dst3); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - __lsx_vst(dst2, dst_argb, 32); - __lsx_vst(dst3, dst_argb, 48); - dst_argb += 64; - } -} - -void ARGBColorMatrixRow_LSX(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - int x; - int len = width / 8; - __m128i src0, src1, tmp0, tmp1, dst0, dst1; - __m128i tmp_b, tmp_g, tmp_r, tmp_a; - __m128i reg_b, reg_g, reg_r, reg_a; - __m128i matrix_b = __lsx_vldrepl_w(matrix_argb, 0); - __m128i matrix_g = __lsx_vldrepl_w(matrix_argb, 4); - __m128i matrix_r = __lsx_vldrepl_w(matrix_argb, 8); - __m128i matrix_a = __lsx_vldrepl_w(matrix_argb, 12); - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - DUP4_ARG2(__lsx_vdp2_h_bu_b, src0, matrix_b, src0, matrix_g, src0, matrix_r, - src0, matrix_a, tmp_b, tmp_g, tmp_r, tmp_a); - DUP4_ARG2(__lsx_vdp2_h_bu_b, src1, matrix_b, src1, matrix_g, src1, matrix_r, - src1, matrix_a, reg_b, reg_g, reg_r, reg_a); - DUP4_ARG2(__lsx_vhaddw_w_h, tmp_b, tmp_b, tmp_g, tmp_g, tmp_r, tmp_r, tmp_a, - tmp_a, tmp_b, tmp_g, tmp_r, tmp_a); - DUP4_ARG2(__lsx_vhaddw_w_h, reg_b, reg_b, reg_g, reg_g, reg_r, reg_r, reg_a, - reg_a, reg_b, reg_g, reg_r, reg_a); - DUP4_ARG2(__lsx_vsrai_w, tmp_b, 6, tmp_g, 6, tmp_r, 6, tmp_a, 6, tmp_b, - tmp_g, tmp_r, tmp_a); - DUP4_ARG2(__lsx_vsrai_w, reg_b, 6, reg_g, 6, reg_r, 6, reg_a, 6, reg_b, - reg_g, reg_r, reg_a); - DUP4_ARG1(__lsx_vclip255_w, tmp_b, tmp_g, tmp_r, tmp_a, tmp_b, tmp_g, tmp_r, - tmp_a) - 
DUP4_ARG1(__lsx_vclip255_w, reg_b, reg_g, reg_r, reg_a, reg_b, reg_g, reg_r, - reg_a) - DUP4_ARG2(__lsx_vpickev_h, reg_b, tmp_b, reg_g, tmp_g, reg_r, tmp_r, reg_a, - tmp_a, tmp_b, tmp_g, tmp_r, tmp_a); - tmp0 = __lsx_vpackev_b(tmp_g, tmp_b); - tmp1 = __lsx_vpackev_b(tmp_a, tmp_r); - dst0 = __lsx_vilvl_h(tmp1, tmp0); - dst1 = __lsx_vilvh_h(tmp1, tmp0); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void SplitUVRow_LSX(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 32; - __m128i src0, src1, src2, src3; - __m128i dst0, dst1, dst2, dst3; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src0, - src1, src2, src3); - DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, dst0, dst1); - DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst2, dst3); - __lsx_vst(dst0, dst_u, 0); - __lsx_vst(dst1, dst_u, 16); - __lsx_vst(dst2, dst_v, 0); - __lsx_vst(dst3, dst_v, 16); - src_uv += 64; - dst_u += 32; - dst_v += 32; - } -} - -void SetRow_LSX(uint8_t* dst, uint8_t v8, int width) { - int x; - int len = width / 16; - __m128i dst0 = __lsx_vreplgr2vr_b(v8); - - for (x = 0; x < len; x++) { - __lsx_vst(dst0, dst, 0); - dst += 16; - } -} - -void MirrorSplitUVRow_LSX(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - int len = width / 32; - __m128i src0, src1, src2, src3; - __m128i dst0, dst1, dst2, dst3; - __m128i shuff0 = {0x10121416181A1C1E, 0x00020406080A0C0E}; - __m128i shuff1 = {0x11131517191B1D1F, 0x01030507090B0D0F}; - - src_uv += (width << 1); - for (x = 0; x < len; x++) { - src_uv -= 64; - DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src2, - src3, src0, src1); - DUP4_ARG3(__lsx_vshuf_b, src1, src0, shuff1, src3, src2, shuff1, src1, src0, - shuff0, src3, src2, shuff0, dst0, dst1, dst2, dst3); - __lsx_vst(dst0, dst_v, 0); - __lsx_vst(dst1, dst_v, 16); - __lsx_vst(dst2, dst_u, 0); - __lsx_vst(dst3, dst_u, 16); - dst_u += 32; - dst_v += 32; - } -} - -void HalfFloatRow_LSX(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - int x; - int len = width / 32; - float mult = 1.9259299444e-34f * scale; - __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - __m128 vec_mult = (__m128)__lsx_vldrepl_w(&mult, 0); - __m128i zero = __lsx_vldi(0); - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, - src3); - DUP4_ARG2(__lsx_vilvl_h, zero, src0, zero, src1, zero, src2, zero, src3, - tmp0, tmp2, tmp4, tmp6); - DUP4_ARG2(__lsx_vilvh_h, zero, src0, zero, src1, zero, src2, zero, src3, - tmp1, tmp3, tmp5, tmp7); - DUP4_ARG1(__lsx_vffint_s_wu, tmp0, tmp2, tmp4, tmp6, reg0, reg2, reg4, - reg6); - DUP4_ARG1(__lsx_vffint_s_wu, tmp1, tmp3, tmp5, tmp7, reg1, reg3, reg5, - reg7); - DUP4_ARG2(__lsx_vfmul_s, reg0, vec_mult, reg1, vec_mult, reg2, vec_mult, - reg3, vec_mult, reg0, reg1, reg2, reg3); - DUP4_ARG2(__lsx_vfmul_s, reg4, vec_mult, reg5, vec_mult, reg6, vec_mult, - reg7, vec_mult, reg4, reg5, reg6, reg7); - DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg0, 13, (v4u32)reg1, 13, (v4u32)reg2, 13, - (v4u32)reg3, 13, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg4, 13, (v4u32)reg5, 13, (v4u32)reg6, 13, - (v4u32)reg7, 13, tmp4, tmp5, tmp6, tmp7); - DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, 
tmp6, - dst0, dst1, dst2, dst3); - __lsx_vst(dst0, dst, 0); - __lsx_vst(dst1, dst, 16); - __lsx_vst(dst2, dst, 32); - __lsx_vst(dst3, dst, 48); - src += 32; - dst += 32; - } -} - -struct RgbConstants { - uint8_t kRGBToY[4]; - uint16_t kAddY; - uint16_t pad; -}; - -// RGB to JPeg coefficients -// B * 0.1140 coefficient = 29 -// G * 0.5870 coefficient = 150 -// R * 0.2990 coefficient = 77 -// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; - -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; - -// RGB to BT.601 coefficients -// B * 0.1016 coefficient = 25 -// G * 0.5078 coefficient = 129 -// R * 0.2578 coefficient = 66 -// Add 16.5 = 0x1080 - -static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; - -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; - -// ARGB expects first 3 values to contain RGB and 4th value is ignored. -static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - asm volatile ( - "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants - "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants - "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants - "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants - "1: \n\t" - "vld $vr4, %0, 0 \n\t" - "vld $vr5, %0, 16 \n\t" - "vld $vr6, %0, 32 \n\t" - "vld $vr7, %0, 48 \n\t" // load 16 pixels of - // ARGB - "vor.v $vr12, $vr3, $vr3 \n\t" - "vor.v $vr13, $vr3, $vr3 \n\t" - "addi.d %2, %2, -16 \n\t" // 16 processed per - // loop. - "vpickev.b $vr8, $vr5, $vr4 \n\t" // BR - "vpickev.b $vr10, $vr7, $vr6 \n\t" - "vpickod.b $vr9, $vr5, $vr4 \n\t" // GA - "vpickod.b $vr11, $vr7, $vr6 \n\t" - "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" // B - "vmaddwev.h.bu $vr13, $vr10, $vr0 \n\t" - "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" // G - "vmaddwev.h.bu $vr13, $vr11, $vr1 \n\t" - "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" // R - "vmaddwod.h.bu $vr13, $vr10, $vr2 \n\t" - "addi.d %0, %0, 64 \n\t" - "vpickod.b $vr10, $vr13, $vr12 \n\t" - "vst $vr10, %1, 0 \n\t" - "addi.d %1, %1, 16 \n\t" - "bnez %2, 1b \n\t" - : "+&r"(src_argb), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(rgbconstants) - : "memory"); -} - -void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_LSX(src_argb, dst_y, width, &kRgb24I601Constants); -} - -void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_LSX(src_argb, dst_yj, width, &kRgb24JPEGConstants); -} - -void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_LSX(src_abgr, dst_y, width, &kRawI601Constants); -} - -void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_LSX(src_abgr, dst_yj, width, &kRawJPEGConstants); -} - -// RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
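In scalar terms, the luma path selected by the *ToYRow_LSX wrappers above is a fixed-point dot product against the RgbConstants table: a minimal C sketch, assuming the BT.601 constants {25, 129, 66} with bias 0x1080 (the JPEG variant uses {29, 150, 77} with bias 0x80); the helper name and loop are illustrative only, not part of libyuv or this driver.

#include <stdint.h>

/* Y = (25*B + 129*G + 66*R + 0x1080) >> 8 for one ARGB pixel per iteration.
 * The LSX rows seed 16-bit lanes with the bias, multiply-accumulate the three
 * weights, and keep the high byte of each lane, which matches the >> 8 here. */
static void ARGBToYRow_Scalar(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t b = src_argb[4 * x + 0];
    uint32_t g = src_argb[4 * x + 1];
    uint32_t r = src_argb[4 * x + 2]; /* byte 3 (alpha) is ignored */
    dst_y[x] = (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
  }
}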
-// Same code as ARGB, except the LD4 -static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - asm volatile ( - "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants - "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants - "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants - "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants - "1: \n\t" - "vld $vr4, %0, 0 \n\t" - "vld $vr5, %0, 16 \n\t" - "vld $vr6, %0, 32 \n\t" - "vld $vr7, %0, 48 \n\t" // load 16 pixels of - // RGBA - "vor.v $vr12, $vr3, $vr3 \n\t" - "vor.v $vr13, $vr3, $vr3 \n\t" - "addi.d %2, %2, -16 \n\t" // 16 processed per - // loop. - "vpickev.b $vr8, $vr5, $vr4 \n\t" // AG - "vpickev.b $vr10, $vr7, $vr6 \n\t" - "vpickod.b $vr9, $vr5, $vr4 \n\t" // BR - "vpickod.b $vr11, $vr7, $vr6 \n\t" - "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B - "vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t" - "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G - "vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t" - "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R - "vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t" - "addi.d %0, %0, 64 \n\t" - "vpickod.b $vr10, $vr13, $vr12 \n\t" - "vst $vr10, %1, 0 \n\t" - "addi.d %1, %1, 16 \n\t" - "bnez %2, 1b \n\t" - : "+&r"(src_rgba), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(rgbconstants) - : "memory"); -} - -void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_LSX(src_rgba, dst_y, width, &kRgb24I601Constants); -} - -void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { - RGBAToYMatrixRow_LSX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); -} - -void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_LSX(src_bgra, dst_y, width, &kRawI601Constants); -} - -static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, - 20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, - 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, - 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, - 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile ( - "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants - "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants - "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants - "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants - "vld $vr4, %4, 0 \n\t" // load shuff - "vld $vr5, %4, 16 \n\t" - "vld $vr6, %4, 32 \n\t" - "vld $vr7, %4, 48 \n\t" - "1: \n\t" - "vld $vr8, %0, 0 \n\t" - "vld $vr9, %0, 16 \n\t" - "vld $vr10, %0, 32 \n\t" // load 16 pixels of - // RGB - "vor.v $vr12, $vr3, $vr3 \n\t" - "vor.v $vr13, $vr3, $vr3 \n\t" - "addi.d %2, %2, -16 \n\t" // 16 processed per - // loop. 
- "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t" - "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t" - "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t" - "vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t" - "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G - "vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t" - "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B - "vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t" - "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R - "vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t" - "addi.d %0, %0, 48 \n\t" - "vpickod.b $vr10, $vr13, $vr12 \n\t" - "vst $vr10, %1, 0 \n\t" - "addi.d %1, %1, 16 \n\t" - "bnez %2, 1b \n\t" - : "+&r"(src_rgba), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(rgbconstants), // %3 - "r"(shuff) // %4 - : "memory"); -} - -void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); -} - -void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants); -} - -void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants); -} - -void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) { - RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) diff --git a/drivers/media/pci/tbscapture2/row_msa.c b/drivers/media/pci/tbscapture2/row_msa.c deleted file mode 100644 index fa40213043f5..000000000000 --- a/drivers/media/pci/tbscapture2/row_msa.c +++ /dev/null @@ -1,3597 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include - -#include "row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define ALPHA_VAL (-1) - -// Fill YUV -> RGB conversion constants into vectors -#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \ - { \ - ub = __msa_fill_w(yuvconst->kUVToB[0]); \ - vr = __msa_fill_w(yuvconst->kUVToR[1]); \ - ug = __msa_fill_w(yuvconst->kUVToG[0]); \ - vg = __msa_fill_w(yuvconst->kUVToG[1]); \ - yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ - yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \ - } - -// Load YUV 422 pixel data -#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ - { \ - uint64_t y_m; \ - uint32_t u_m, v_m; \ - v4i32 zero_m = {0}; \ - y_m = LD(psrc_y); \ - u_m = LW(psrc_u); \ - v_m = LW(psrc_v); \ - out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ - out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ - out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ - } - -// Clip input vector elements between 0 to 255 -#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ - { \ - v4i32 max_m = __msa_ldi_w(0xFF); \ - \ - in0 = __msa_maxi_s_w(in0, 0); \ - in1 = __msa_maxi_s_w(in1, 0); \ - in2 = __msa_maxi_s_w(in2, 0); \ - in3 = __msa_maxi_s_w(in3, 0); \ - in4 = __msa_maxi_s_w(in4, 0); \ - in5 = __msa_maxi_s_w(in5, 0); \ - in0 = __msa_min_s_w(max_m, in0); \ - in1 = __msa_min_s_w(max_m, in1); \ - in2 = __msa_min_s_w(max_m, in2); \ - in3 = __msa_min_s_w(max_m, in3); \ - in4 = __msa_min_s_w(max_m, in4); \ - in5 = __msa_min_s_w(max_m, in5); \ - } - -// Convert 8 pixels of YUV 420 to RGB. -#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \ - { \ - v8i16 vec0_m, vec1_m; \ - v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ - v4i32 reg5_m, reg6_m, reg7_m; \ - v16i8 temp_m, zero_m = {0}; \ - \ - vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ - vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ - reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ - reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ - vec1_m = (v8i16)__msa_subv_h(vec1_m, const_0x80); \ - temp_m = (v16i8)__msa_clti_s_h(vec1_m, 0); \ - reg2_m = (v4i32)__msa_ilvr_h((v8i16)temp_m, (v8i16)vec1_m); \ - reg3_m = (v4i32)__msa_ilvl_h((v8i16)temp_m, (v8i16)vec1_m); \ - reg0_m *= yg; \ - reg1_m *= yg; \ - reg2_m *= ubvr; \ - reg3_m *= ubvr; \ - reg0_m = __msa_srai_w(reg0_m, 16); \ - reg1_m = __msa_srai_w(reg1_m, 16); \ - reg0_m += yb; \ - reg1_m += yb; \ - reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ - reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ - reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ - reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ - reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ - reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ - reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ - reg5_m = reg0_m + reg5_m; \ - reg6_m = reg1_m + reg6_m; \ - reg2_m = reg0_m + reg2_m; \ - reg3_m = reg1_m + reg3_m; \ - reg7_m = reg0_m - reg7_m; \ - reg4_m = reg1_m - reg4_m; \ - reg5_m = __msa_srai_w(reg5_m, 6); \ - reg6_m = __msa_srai_w(reg6_m, 6); \ - reg7_m = __msa_srai_w(reg7_m, 6); \ - reg4_m = __msa_srai_w(reg4_m, 6); \ - reg2_m = __msa_srai_w(reg2_m, 6); \ - reg3_m = __msa_srai_w(reg3_m, 6); \ - CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ - out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ - out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ - out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ - } - -// 
Pack and Store 8 ARGB values. -#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ - { \ - v8i16 vec0_m, vec1_m; \ - v16u8 dst0_m, dst1_m; \ - vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ - vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ - dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ - dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ - ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ - } - -// Takes ARGB input and calculates Y. -#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ - y_out) \ - { \ - v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ - v8u16 reg0_m, reg1_m; \ - \ - vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ - vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ - vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ - vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ - reg0_m = __msa_dotp_u_h(vec0_m, const0); \ - reg1_m = __msa_dotp_u_h(vec1_m, const0); \ - reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ - reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ - reg0_m += const2; \ - reg1_m += const2; \ - reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ - reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ - y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ - } - -// Loads current and next row of ARGB input and averages it to calculate U and V -#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \ - { \ - v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ - v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ - v8u16 reg8_m, reg9_m; \ - \ - src0_m = (v16u8)__msa_ld_b((void*)s, 0); \ - src1_m = (v16u8)__msa_ld_b((void*)s, 16); \ - src2_m = (v16u8)__msa_ld_b((void*)s, 32); \ - src3_m = (v16u8)__msa_ld_b((void*)s, 48); \ - src4_m = (v16u8)__msa_ld_b((void*)t, 0); \ - src5_m = (v16u8)__msa_ld_b((void*)t, 16); \ - src6_m = (v16u8)__msa_ld_b((void*)t, 32); \ - src7_m = (v16u8)__msa_ld_b((void*)t, 48); \ - vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ - vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ - vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ - vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ - vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ - vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ - vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ - vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ - reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ - reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ - reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ - reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ - reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ - reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ - reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ - reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ - reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg8_m += const_0x0101; \ - reg9_m += const_0x0101; \ - reg0_m += const_0x0101; \ - 
reg1_m += const_0x0101; \ - argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \ - argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \ - argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \ - argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \ - } - -#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ - shf0, shf1, shf2, shf3, shift, u_out, v_out) \ - { \ - v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ - \ - vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ - vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ - vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ - vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ - vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ - vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ - vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ - vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ - reg0_m = __msa_dotp_u_w(vec0_m, const0); \ - reg1_m = __msa_dotp_u_w(vec1_m, const0); \ - reg2_m = __msa_dotp_u_w(vec4_m, const0); \ - reg3_m = __msa_dotp_u_w(vec5_m, const0); \ - reg0_m += const1; \ - reg1_m += const1; \ - reg2_m += const1; \ - reg3_m += const1; \ - reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \ - reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \ - reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \ - reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \ - reg0_m = __msa_srl_w(reg0_m, shift); \ - reg1_m = __msa_srl_w(reg1_m, shift); \ - reg2_m = __msa_srl_w(reg2_m, shift); \ - reg3_m = __msa_srl_w(reg3_m, shift); \ - u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ - v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ - } - -// Takes ARGB input and calculates U and V. 
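For reference, the chroma math behind these UV macros (the LSX UV rows earlier in this patch build the same expression from their const_112/const_74/const_38/const_94/const_18/const_8080 vectors) reduces to the following scalar sketch on one 2x2 block of ARGB; the helper name and the averaging loop are illustrative only, and the SIMD rounding may differ by one LSB.

#include <stdint.h>

/* U = (112*B - 74*G - 38*R + 0x8080) >> 8
 * V = (112*R - 94*G - 18*B + 0x8080) >> 8
 * computed on the rounded average of a 2x2 block (two pixels from each row). */
static void ARGBToUV2x2_Scalar(const uint8_t* row0, const uint8_t* row1,
                               uint8_t* dst_u, uint8_t* dst_v) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *dst_u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *dst_v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}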
-#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ - shf0, shf1, shf2, shf3, v_out, u_out) \ - { \ - v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ - \ - vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ - vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ - vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ - vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ - vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ - vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ - vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ - vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ - reg0_m = __msa_dotp_u_w(vec0_m, const1); \ - reg1_m = __msa_dotp_u_w(vec1_m, const1); \ - reg2_m = __msa_dotp_u_w(vec4_m, const1); \ - reg3_m = __msa_dotp_u_w(vec5_m, const1); \ - reg0_m += (v4u32)const3; \ - reg1_m += (v4u32)const3; \ - reg2_m += (v4u32)const3; \ - reg3_m += (v4u32)const3; \ - reg0_m -= __msa_dotp_u_w(vec2_m, const0); \ - reg1_m -= __msa_dotp_u_w(vec3_m, const0); \ - reg2_m -= __msa_dotp_u_w(vec6_m, const2); \ - reg3_m -= __msa_dotp_u_w(vec7_m, const2); \ - u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ - v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ - u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \ - v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \ - } - -// Load I444 pixel data -#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ - { \ - uint64_t y_m, u_m, v_m; \ - v2i64 zero_m = {0}; \ - y_m = LD(psrc_y); \ - u_m = LD(psrc_u); \ - v_m = LD(psrc_v); \ - out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ - out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ - out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ - } - -#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ - { \ - v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \ - v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \ - _tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \ - _tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \ - _tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \ - _tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \ - _tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \ - _tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \ - _reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \ - _reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \ - _reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \ - _reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \ - _reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \ - _reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \ - _reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \ - _reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \ - _reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \ - _reg1 = const_8080 + const_112 * _reg0; \ - _reg3 = const_8080 + const_112 * _reg4; \ - _reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \ - _reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \ - _reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \ - _reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \ - _dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \ - } - -void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { - int x; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; - src += width - 64; - - for (x = 0; x < width; x += 64) { - LD_UB4(src, 16, src3, src2, src1, src0); - VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); - 
VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); - ST_UB4(dst0, dst1, dst2, dst3, dst, 16); - dst += 64; - src -= 64; - } -} - -void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - int x; - v8u16 src, dst; - v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0}; - src_uv += (width - 8) << 1; - for (x = 0; x < width; x += 8) { - src = LD_UH(src_uv); - dst = __msa_vshf_h(shuffler, src, src); - ST_UH(dst, dst_uv); - src_uv -= 16; - dst_uv += 16; - } -} - -void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { - int x; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; - src += width * 4 - 64; - - for (x = 0; x < width; x += 16) { - LD_UB4(src, 16, src3, src2, src1, src0); - VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); - VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); - ST_UB4(dst0, dst1, dst2, dst3, dst, 16); - dst += 64; - src -= 64; - } -} - -void I422ToYUY2Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - int x; - v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; - v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; - - for (x = 0; x < width; x += 32) { - src_u0 = LD_UB(src_u); - src_v0 = LD_UB(src_v); - LD_UB2(src_y, 16, src_y0, src_y1); - ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); - ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); - ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); - ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); - src_u += 16; - src_v += 16; - src_y += 32; - dst_yuy2 += 64; - } -} - -void I422ToUYVYRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - int x; - v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; - v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; - - for (x = 0; x < width; x += 32) { - src_u0 = LD_UB(src_u); - src_v0 = LD_UB(src_v); - LD_UB2(src_y, 16, src_y0, src_y1); - ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); - ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); - ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); - ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); - src_u += 16; - src_v += 16; - src_y += 32; - dst_uyvy += 64; - } -} - -void I422ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 const_0x80 = __msa_ldi_h(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, dst_argb); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb += 32; - } -} - -void I422ToRGBARow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - 
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 const_0x80 = __msa_ldi_h(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - STOREARGB(alpha, vec0, vec1, vec2, dst_argb); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb += 32; - } -} - -void I422AlphaToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int64_t data_a; - v16u8 src0, src1, src2, src3; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v4i32 zero = {0}; - v8i16 const_0x80 = __msa_ldi_h(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - data_a = LD(src_a); - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); - STOREARGB(vec0, vec1, vec2, src3, dst_argb); - src_y += 8; - src_u += 4; - src_v += 4; - src_a += 8; - dst_argb += 32; - } -} - -void I422ToRGB24Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int32_t width) { - int x; - int64_t data_u, data_v; - v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v16u8 reg0, reg1, reg2, reg3; - v2i64 zero = {0}; - v8i16 const_0x80 = __msa_ldi_h(0x80); - v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; - v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; - v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, - 11, 29, 12, 13, 30, 14, 15, 31}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); - data_u = LD(src_u); - data_v = LD(src_v); - src1 = (v16u8)__msa_insert_d(zero, 0, data_u); - src2 = (v16u8)__msa_insert_d(zero, 0, data_v); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); - src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec3, vec4, vec5); - reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); - reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); - reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); - dst0 = (v16u8)__msa_vshf_b(shuffler0, 
(v16i8)reg3, (v16i8)reg0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); - ST_UB2(dst0, dst1, dst_argb, 16); - ST_UB(dst2, (dst_argb + 32)); - src_y += 16; - src_u += 8; - src_v += 8; - dst_argb += 48; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. -void I422ToRGB565Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v8i16 const_0x80 = __msa_ldi_h(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - vec0 = __msa_srli_h(vec0, 3); - vec1 = __msa_srli_h(vec1, 2); - vec2 = __msa_srli_h(vec2, 3); - vec2 = __msa_slli_h(vec2, 11); - vec1 = __msa_slli_h(vec1, 5); - vec0 |= vec1; - dst0 = (v16u8)(vec2 | vec0); - ST_UB(dst0, dst_rgb565); - src_y += 8; - src_u += 4; - src_v += 4; - dst_rgb565 += 16; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. -void I422ToARGB4444Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0; - v8i16 vec0, vec1, vec2; - v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); - v8u16 mask = (v8u16)__msa_fill_h(0x00F0); - v8i16 const_0x80 = __msa_ldi_h(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - reg0 = (v8u16)__msa_srli_h(vec0, 4); - reg2 = (v8u16)__msa_srli_h(vec2, 4); - reg1 = (v8u16)__msa_and_v(vec1, mask); - reg2 = (v8u16)__msa_slli_h(reg2, 8); - reg1 |= const_0xF000; - reg0 |= reg2; - dst0 = (v16u8)(reg1 | reg0); - ST_UB(dst0, dst_argb4444); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb4444 += 16; - } -} - -void I422ToARGB1555Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0; - v8i16 vec0, vec1, vec2; - v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); - v8i16 const_0x80 = __msa_ldi_h(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = 
(v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - reg0 = (v8u16)__msa_srli_h(vec0, 3); - reg1 = (v8u16)__msa_srli_h(vec1, 3); - reg2 = (v8u16)__msa_srli_h(vec2, 3); - reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); - reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); - reg1 |= const_0x8000; - reg0 |= reg2; - dst0 = (v16u8)(reg1 | reg0); - ST_UB(dst0, dst_argb1555); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb1555 += 16; - } -} - -void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_yuy2, 16, src0, src1, src2, src3); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_y, 16); - src_yuy2 += 64; - dst_y += 32; - } -} - -void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_yuy2, 16, src0, src1, src2, src3); - LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); - src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); - src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); - vec0 = __msa_aver_u_b(src0, src2); - vec1 = __msa_aver_u_b(src1, src3); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_yuy2 += 64; - src_yuy2_next += 64; - dst_u += 16; - dst_v += 16; - } -} - -void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_yuy2, 16, src0, src1, src2, src3); - src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_yuy2 += 64; - dst_u += 16; - dst_v += 16; - } -} - -void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_uyvy, 16, src0, src1, src2, src3); - dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_y, 16); - src_uyvy += 64; - dst_y += 32; - } -} - -void UYVYToUVRow_MSA(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_uyvy, 16, src0, src1, src2, src3); - LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); - src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); - src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); - vec0 = __msa_aver_u_b(src0, src2); - vec1 = __msa_aver_u_b(src1, 
src3); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_uyvy += 64; - src_uyvy_next += 64; - dst_u += 16; - dst_v += 16; - } -} - -void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_uyvy, 16, src0, src1, src2, src3); - src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_uyvy += 64; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16i8 zero = {0}; - v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); - v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); - v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); - reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); - reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); - reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); - reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); - reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); - reg0 *= const_0x19; - reg1 *= const_0x19; - reg2 *= const_0x81; - reg3 *= const_0x81; - reg4 *= const_0x42; - reg5 *= const_0x42; - reg0 += reg2; - reg1 += reg3; - reg0 += reg4; - reg1 += reg5; - reg0 += const_0x1080; - reg1 += const_0x1080; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void ARGBToUVRow_MSA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* src_argb_next = src_argb + src_stride_argb; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; - v16u8 dst0, dst1; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80); - src6 = 
(v16u8)__msa_ld_b((v16u8*)src_argb, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); - vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); - vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); - vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); - vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); - reg0 = __msa_hadd_u_h(vec8, vec8); - reg1 = __msa_hadd_u_h(vec9, vec9); - reg2 = __msa_hadd_u_h(vec4, vec4); - reg3 = __msa_hadd_u_h(vec5, vec5); - reg4 = __msa_hadd_u_h(vec0, vec0); - reg5 = __msa_hadd_u_h(vec1, vec1); - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); - vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); - vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); - vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); - vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); - reg0 += __msa_hadd_u_h(vec8, vec8); - reg1 += __msa_hadd_u_h(vec9, vec9); - reg2 += __msa_hadd_u_h(vec4, vec4); - reg3 += __msa_hadd_u_h(vec5, vec5); - reg4 += __msa_hadd_u_h(vec0, vec0); - reg5 += __msa_hadd_u_h(vec1, vec1); - reg0 += const_0x0001; - reg1 += const_0x0001; - reg2 += const_0x0001; - reg3 += const_0x0001; - reg4 += const_0x0001; - reg5 += const_0x0001; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1); - reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1); - reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1); - reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1); - reg6 = reg0 * const_0x70; - reg7 = reg1 * const_0x70; - reg8 = reg2 * const_0x4A; - reg9 = reg3 * const_0x4A; - reg6 += const_0x8080; - reg7 += const_0x8080; - reg8 += reg4 * const_0x26; - reg9 += reg5 * const_0x26; - reg0 *= const_0x12; - reg1 *= const_0x12; - reg2 *= const_0x5E; - reg3 *= const_0x5E; - reg4 *= const_0x70; - reg5 *= const_0x70; - reg2 += reg0; - reg3 += reg1; - reg4 += const_0x8080; - reg5 += const_0x8080; - reg6 -= reg8; - reg7 -= reg9; - reg4 -= reg2; - reg5 -= reg3; - reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); - reg7 = 
(v8u16)__msa_srai_h((v8i16)reg7, 8); - reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); - reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); - dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_argb += 128; - src_argb_next += 128; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2; - v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; - v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, - 16, 17, 18, 20, 21, 22, 24, 25}; - v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, - 21, 22, 24, 25, 26, 28, 29, 30}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_rgb, 16); - ST_UB(dst2, (dst_rgb + 32)); - src_argb += 64; - dst_rgb += 48; - } -} - -void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2; - v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; - v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, - 18, 17, 16, 22, 21, 20, 26, 25}; - v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, - 21, 20, 26, 25, 24, 30, 29, 28}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_rgb, 16); - ST_UB(dst2, (dst_rgb + 32)); - src_argb += 64; - dst_rgb += 48; - } -} - -void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - v16u8 src0, src1, dst0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); - vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); - vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); - vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); - vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); - vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); - vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); - vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); - vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); - vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); - vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); - vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); - vec0 = __msa_binsli_b(vec0, vec1, 2); - vec1 = __msa_binsli_b(vec2, vec3, 4); - vec4 = __msa_binsli_b(vec4, vec5, 2); - vec5 = __msa_binsli_b(vec6, vec7, 4); - vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); - dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void 
ARGBToARGB1555Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - v16u8 src0, src1, dst0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); - vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); - vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); - vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); - vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); - vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); - vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3); - vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); - vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); - vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); - vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); - vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); - vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); - vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); - vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); - vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); - vec0 = __msa_binsli_b(vec0, vec1, 2); - vec5 = __msa_binsli_b(vec5, vec6, 2); - vec1 = __msa_binsli_b(vec2, vec3, 5); - vec6 = __msa_binsli_b(vec7, vec8, 5); - vec1 = __msa_binsli_b(vec1, vec4, 0); - vec6 = __msa_binsli_b(vec6, vec9, 0); - vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); - dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - v16u8 src0, src1; - v16u8 vec0, vec1; - v16u8 dst0; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); - vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); - src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); - src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); - vec0 = __msa_binsli_b(vec0, src0, 3); - vec1 = __msa_binsli_b(vec1, src1, 3); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBToUV444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int32_t width) { - int32_t x; - v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 vec8, vec9, vec10, vec11; - v8u16 const_112 = (v8u16)__msa_ldi_h(112); - v8u16 const_74 = (v8u16)__msa_ldi_h(74); - v8u16 const_38 = (v8u16)__msa_ldi_h(38); - v8u16 const_94 = (v8u16)__msa_ldi_h(94); - v8u16 const_18 = (v8u16)__msa_ldi_h(18); - v8u16 const_32896 = (v8u16)__msa_fill_h(32896); - v16i8 zero = {0}; - - for (x = width; x > 0; x -= 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); - src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); - vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); - 
vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); - vec10 = vec0 * const_18; - vec11 = vec1 * const_18; - vec8 = vec2 * const_94; - vec9 = vec3 * const_94; - vec6 = vec4 * const_112; - vec7 = vec5 * const_112; - vec0 *= const_112; - vec1 *= const_112; - vec2 *= const_74; - vec3 *= const_74; - vec4 *= const_38; - vec5 *= const_38; - vec8 += vec10; - vec9 += vec11; - vec6 += const_32896; - vec7 += const_32896; - vec0 += const_32896; - vec1 += const_32896; - vec2 += vec4; - vec3 += vec5; - vec0 -= vec2; - vec1 -= vec3; - vec6 -= vec8; - vec7 -= vec9; - vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); - vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); - vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_argb += 64; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBMultiplyRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, dst0; - v8u16 vec0, vec1, vec2, vec3; - v4u32 reg0, reg1, reg2, reg3; - v8i16 zero = {0}; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); - reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); - reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); - reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); - reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); - reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); - reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); - reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); - reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); - reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); - reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); - reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); - reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_argb); - src_argb += 16; - src_argb1 += 16; - dst_argb += 16; - } -} - -void ARGBAddRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); - dst0 = __msa_adds_u_b(src0, src2); - dst1 = __msa_adds_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBSubtractRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); - 
dst0 = __msa_subs_u_b(src0, src2); - dst1 = __msa_subs_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBAttenuateRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v8i16 zero = {0}; - v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); - vec4 = (v8u16)__msa_fill_h(vec0[3]); - vec5 = (v8u16)__msa_fill_h(vec0[7]); - vec6 = (v8u16)__msa_fill_h(vec1[3]); - vec7 = (v8u16)__msa_fill_h(vec1[7]); - vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); - vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - vec6 = (v8u16)__msa_fill_h(vec2[3]); - vec7 = (v8u16)__msa_fill_h(vec2[7]); - vec8 = (v8u16)__msa_fill_h(vec3[3]); - vec9 = (v8u16)__msa_fill_h(vec3[7]); - vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); - reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4); - reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); - reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); - reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); - reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); - reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); - reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); - reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); - reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); - reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); - reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); - reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); - reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); - reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); - reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); - reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); - reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); - reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); - reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); - reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); - reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); - reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); - reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); - reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); - vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - dst0 = __msa_bmnz_v(dst0, src0, mask); - dst1 = __msa_bmnz_v(dst1, src1, mask); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - int x; - v16u8 src0, src1, dst0, vec0, vec1; - v8i16 vec_d0; - v8i16 reg0, reg1, reg2; - v16i8 zero = {0}; - v8i16 max = __msa_ldi_h(0xFF); - - vec_d0 = (v8i16)__msa_fill_w(dither4); - vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = 
(v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); - reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); - reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); - reg0 += vec_d0; - reg1 += vec_d0; - reg2 += vec_d0; - reg0 = __msa_maxi_s_h((v8i16)reg0, 0); - reg1 = __msa_maxi_s_h((v8i16)reg1, 0); - reg2 = __msa_maxi_s_h((v8i16)reg2, 0); - reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); - reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); - reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); - reg0 = __msa_srai_h(reg0, 3); - reg2 = __msa_srai_h(reg2, 3); - reg1 = __msa_srai_h(reg1, 2); - reg2 = __msa_slli_h(reg2, 11); - reg1 = __msa_slli_h(reg1, 5); - reg0 |= reg1; - dst0 = (v16u8)(reg0 | reg2); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBShuffleRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - int x; - v16u8 src0, src1, dst0, dst1; - v16i8 vec0; - v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - int32_t val = LW((int32_t*)shuffler); - - vec0 = (v16i8)__msa_fill_w(val); - shuffler_vec += vec0; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBShadeRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - int x; - v16u8 src0, dst0; - v8u16 vec0, vec1; - v4u32 reg0, reg1, reg2, reg3, rgba_scale; - v8i16 zero = {0}; - - rgba_scale[0] = value; - rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); - rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); - reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); - reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); - reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); - reg0 *= rgba_scale; - reg1 *= rgba_scale; - reg2 *= rgba_scale; - reg3 *= rgba_scale; - reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); - reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); - reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); - reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_argb); - src_argb += 16; - dst_argb += 16; - } -} - -void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - int x; - v16u8 src0, src1, vec0, vec1, dst0, dst1; - v8u16 reg0; - v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D); - v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); - vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); - reg0 = __msa_dotp_u_h(vec0, const_0x961D); - reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D); - reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8); - vec0 = 
(v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); - vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { - int x; - v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2; - v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); - v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); - v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); - v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); - v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); - v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); - v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); - vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); - vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); - vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); - reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); - reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); - reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); - reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); - reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); - reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); - reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); - reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); - vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); - vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); - vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); - vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); - vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); - ST_UB2(dst0, dst1, dst_argb, 16); - dst_argb += 32; - } -} - -void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1; - v8u16 vec0, vec1, vec2, vec3; - v16u8 dst0, dst1, dst2, dst3; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16); - vec0 = (v8u16)__msa_andi_b(src0, 0x0F); - vec1 = (v8u16)__msa_andi_b(src1, 0x0F); - vec2 = (v8u16)__msa_andi_b(src0, 0xF0); - vec3 = (v8u16)__msa_andi_b(src1, 0xF0); - vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); - vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); - vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); - vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); - dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_argb4444 += 32; - dst_argb += 64; - } -} - -void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - int x; - v8u16 src0, src1; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6; - v16u8 dst0, dst1, dst2, dst3; - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0); - src1 = 
(v8u16)__msa_ld_h((void*)src_argb1555, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); - reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); - reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); - reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); - reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); - reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); - reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); - reg3 = -reg3; - reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); - reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); - reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); - reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); - dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); - dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_argb1555 += 32; - dst_argb += 64; - } -} - -void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); - v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0); - src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16); - vec0 = src0 & const_0x1F; - vec1 = src0 & const_0x7E0; - vec2 = src0 & const_0xF800; - vec3 = src1 & const_0x1F; - vec4 = src1 & const_0x7E0; - vec5 = src1 & const_0xF800; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); - reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); - reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); - reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); - reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); - reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); - reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); - res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); - res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); - res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); - res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); - dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_rgb565 += 32; - dst_argb += 64; - } -} - -void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2; - 
v16u8 vec0, vec1, vec2; - v16u8 dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0); - src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16); - src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32); - vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); - vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); - vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); - dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); - dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); - dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_rgb24 += 48; - dst_argb += 64; - } -} - -void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - int x; - v16u8 src0, src1, src2; - v16u8 vec0, vec1, vec2; - v16u8 dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); - src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); - src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); - vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); - vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); - vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); - dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); - dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); - dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_raw += 48; - dst_argb += 64; - } -} - -void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - int x; - v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr; - v16u8 reg0, reg1, reg2, dst; - v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; - v8i16 res0, res1; - v8i16 const_66 = (v8i16)__msa_ldi_h(66); - v8i16 const_129 = (v8i16)__msa_ldi_h(129); - v8i16 const_25 = (v8i16)__msa_ldi_h(25); - v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080); - v16u8 zero = (v16u8)__msa_ldi_b(0); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb1555, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb1555, 16); - tmp0 = (v16u8)__msa_pckev_b(src1, src0); - tmp1 = (v16u8)__msa_pckod_b(src1, src0); - tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); - tmpg = (v16u8)__msa_srli_b(tmp0, 5); - reg0 = (v16u8)__msa_andi_b(tmp1, 0x03); - reg0 = (v16u8)__msa_slli_b(reg0, 3); - tmpg = (v16u8)__msa_or_v(tmpg, reg0); - reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C); - tmpr = (v16u8)__msa_srli_b(reg1, 2); - reg0 = (v16u8)__msa_slli_b(tmpb, 3); - reg1 = (v16u8)__msa_slli_b(tmpg, 3); - reg2 = (v16u8)__msa_slli_b(tmpr, 3); - tmpb = (v16u8)__msa_srli_b(tmpb, 2); - tmpg = (v16u8)__msa_srli_b(tmpg, 2); - tmpr = (v16u8)__msa_srli_b(tmpr, 2); - tmpb = (v16u8)__msa_or_v(reg0, tmpb); - tmpg = (v16u8)__msa_or_v(reg1, tmpg); - tmpr = (v16u8)__msa_or_v(reg2, tmpr); - tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb); - tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb); - tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg); - tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg); - tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr); - tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr); - res0 = const_1080 + const_25 * tmpb_r; - res1 = 
const_1080 + const_25 * tmpb_l; - res0 += const_129 * tmpg_r; - res1 += const_129 * tmpg_l; - res0 += const_66 * tmpr_r; - res1 += const_66 * tmpr_l; - dst = (v16u8)__msa_pckod_b(res1, res0); - ST_UB(dst, dst_y); - src_argb1555 += 32; - dst_y += 16; - } -} - -void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr; - v16u8 reg0, reg1, dst; - v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; - v8i16 res0, res1; - v8i16 const_66 = (v8i16)__msa_ldi_h(66); - v8i16 const_129 = (v8i16)__msa_ldi_h(129); - v8i16 const_25 = (v8i16)__msa_ldi_h(25); - v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080); - v16u8 zero = __msa_ldi_b(0); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_rgb565, 0); - src1 = (v16u8)__msa_ld_b((void*)src_rgb565, 16); - tmp0 = (v16u8)__msa_pckev_b(src1, src0); - tmp1 = (v16u8)__msa_pckod_b(src1, src0); - tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); - tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8); - reg1 = (v16u8)__msa_andi_b(tmp1, 0x07); - reg0 = (v16u8)__msa_srli_b(tmp0, 5); - reg1 = (v16u8)__msa_slli_b(reg1, 3); - tmpg = (v16u8)__msa_or_v(reg1, reg0); - reg0 = (v16u8)__msa_slli_b(tmpb, 3); - reg1 = (v16u8)__msa_srli_b(tmpb, 2); - tmpb = (v16u8)__msa_or_v(reg1, reg0); - reg0 = (v16u8)__msa_slli_b(tmpg, 2); - reg1 = (v16u8)__msa_srli_b(tmpg, 4); - tmpg = (v16u8)__msa_or_v(reg1, reg0); - reg0 = (v16u8)__msa_srli_b(tmpr, 5); - tmpr = (v16u8)__msa_or_v(tmpr, reg0); - tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb); - tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb); - tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg); - tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg); - tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr); - tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr); - res0 = const_1080 + const_25 * tmpb_r; - res1 = const_1080 + const_25 * tmpb_l; - res0 += const_129 * tmpg_r; - res1 += const_129 * tmpg_l; - res0 += const_66 * tmpr_r; - res1 += const_66 * tmpr_l; - dst = (v16u8)__msa_pckod_b(res1, res0); - ST_UB(dst, dst_y); - src_rgb565 += 32; - dst_y += 16; - } -} - -void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; - v8u16 vec0, vec1, vec2, vec3; - v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); - v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; - v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, - 18, 19, 20, 21, 21, 22, 23, 24}; - v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; - v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); - reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); - reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); - vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); - vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); - vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); - vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); - 
vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); - vec0 += const_0x1080; - vec1 += const_0x1080; - vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); - src_argb += 48; - dst_y += 16; - } -} - -void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; - v8u16 vec0, vec1, vec2, vec3; - v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); - v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; - v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, - 18, 19, 20, 21, 21, 22, 23, 24}; - v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; - v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); - reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); - reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); - vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); - vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); - vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); - vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); - vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); - vec0 += const_0x1080; - vec1 += const_0x1080; - vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); - src_argb += 48; - dst_y += 16; - } -} - -void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint16_t* s = (const uint16_t*)src_argb1555; - const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); - int64_t res0, res1; - v16u8 src0, src1, src2, src3, dst; - v16u8 tmp0, tmp1, tmp2, tmp3; - v16u8 reg0, reg1, reg2, reg3; - v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; - v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); - v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); - v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); - v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); - v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); - v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)s, 0); - src1 = (v8u16)__msa_ld_b((void*)s, 16); - src2 = (v8u16)__msa_ld_b((void*)t, 0); - src3 = (v8u16)__msa_ld_b((void*)t, 16); - tmp0 = (v16u8)__msa_pckev_b(src1, src0); - tmp1 = (v16u8)__msa_pckod_b(src1, src0); - tmp2 = (v16u8)__msa_pckev_b(src3, src2); - tmp3 = (v16u8)__msa_pckod_b(src3, src2); - tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); - nexb = (v16u8)__msa_andi_b(tmp2, 0x1F); - tmpg = (v16u8)__msa_srli_b(tmp0, 5); - nexg = (v16u8)__msa_srli_b(tmp2, 5); - reg0 = (v16u8)__msa_andi_b(tmp1, 0x03); - reg2 = (v16u8)__msa_andi_b(tmp3, 0x03); - reg0 = (v16u8)__msa_slli_b(reg0, 3); - reg2 = (v16u8)__msa_slli_b(reg2, 3); - tmpg = 
(v16u8)__msa_or_v(tmpg, reg0); - nexg = (v16u8)__msa_or_v(nexg, reg2); - reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C); - reg3 = (v16u8)__msa_andi_b(tmp3, 0x7C); - tmpr = (v16u8)__msa_srli_b(reg1, 2); - nexr = (v16u8)__msa_srli_b(reg3, 2); - reg0 = (v16u8)__msa_slli_b(tmpb, 3); - reg1 = (v16u8)__msa_slli_b(tmpg, 3); - reg2 = (v16u8)__msa_slli_b(tmpr, 3); - tmpb = (v16u8)__msa_srli_b(tmpb, 2); - tmpg = (v16u8)__msa_srli_b(tmpg, 2); - tmpr = (v16u8)__msa_srli_b(tmpr, 2); - tmpb = (v16u8)__msa_or_v(reg0, tmpb); - tmpg = (v16u8)__msa_or_v(reg1, tmpg); - tmpr = (v16u8)__msa_or_v(reg2, tmpr); - reg0 = (v16u8)__msa_slli_b(nexb, 3); - reg1 = (v16u8)__msa_slli_b(nexg, 3); - reg2 = (v16u8)__msa_slli_b(nexr, 3); - nexb = (v16u8)__msa_srli_b(nexb, 2); - nexg = (v16u8)__msa_srli_b(nexg, 2); - nexr = (v16u8)__msa_srli_b(nexr, 2); - nexb = (v16u8)__msa_or_v(reg0, nexb); - nexg = (v16u8)__msa_or_v(reg1, nexg); - nexr = (v16u8)__msa_or_v(reg2, nexr); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst); - res0 = __msa_copy_u_d((v2i64)dst, 0); - res1 = __msa_copy_u_d((v2i64)dst, 1); - SD(res0, dst_u); - SD(res1, dst_v); - s += 16; - t += 16; - dst_u += 8; - dst_v += 8; - } -} - -void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint16_t* s = (const uint16_t*)src_rgb565; - const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); - int64_t res0, res1; - v16u8 src0, src1, src2, src3, dst; - v16u8 tmp0, tmp1, tmp2, tmp3; - v16u8 reg0, reg1, reg2, reg3; - v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; - v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); - v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); - v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); - v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); - v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); - v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)t, 0); - src3 = (v16u8)__msa_ld_b((void*)t, 16); - tmp0 = (v16u8)__msa_pckev_b(src1, src0); - tmp1 = (v16u8)__msa_pckod_b(src1, src0); - tmp2 = (v16u8)__msa_pckev_b(src3, src2); - tmp3 = (v16u8)__msa_pckod_b(src3, src2); - tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); - tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8); - nexb = (v16u8)__msa_andi_b(tmp2, 0x1F); - nexr = (v16u8)__msa_andi_b(tmp3, 0xF8); - reg1 = (v16u8)__msa_andi_b(tmp1, 0x07); - reg3 = (v16u8)__msa_andi_b(tmp3, 0x07); - reg0 = (v16u8)__msa_srli_b(tmp0, 5); - reg1 = (v16u8)__msa_slli_b(reg1, 3); - reg2 = (v16u8)__msa_srli_b(tmp2, 5); - reg3 = (v16u8)__msa_slli_b(reg3, 3); - tmpg = (v16u8)__msa_or_v(reg1, reg0); - nexg = (v16u8)__msa_or_v(reg2, reg3); - reg0 = (v16u8)__msa_slli_b(tmpb, 3); - reg1 = (v16u8)__msa_srli_b(tmpb, 2); - reg2 = (v16u8)__msa_slli_b(nexb, 3); - reg3 = (v16u8)__msa_srli_b(nexb, 2); - tmpb = (v16u8)__msa_or_v(reg1, reg0); - nexb = (v16u8)__msa_or_v(reg2, reg3); - reg0 = (v16u8)__msa_slli_b(tmpg, 2); - reg1 = (v16u8)__msa_srli_b(tmpg, 4); - reg2 = (v16u8)__msa_slli_b(nexg, 2); - reg3 = (v16u8)__msa_srli_b(nexg, 4); - tmpg = (v16u8)__msa_or_v(reg1, reg0); - nexg = (v16u8)__msa_or_v(reg2, reg3); - reg0 = (v16u8)__msa_srli_b(tmpr, 5); - reg2 = (v16u8)__msa_srli_b(nexr, 5); - tmpr = (v16u8)__msa_or_v(tmpr, reg0); - nexr = (v16u8)__msa_or_v(nexr, reg2); - RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst); - res0 = __msa_copy_u_d((v2i64)dst, 0); - res1 = __msa_copy_u_d((v2i64)dst, 1); - SD(res0, dst_u); - SD(res1, dst_v); - s += 16; - t += 
16; - dst_u += 8; - dst_v += 8; - } -} - -void RGB24ToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - int64_t res0, res1; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 inp0, inp1, inp2, inp3, inp4, inp5; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 reg0, reg1, reg2, reg3; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); - v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - inp0 = (v16u8)__msa_ld_b((void*)s, 0); - inp1 = (v16u8)__msa_ld_b((void*)s, 16); - inp2 = (v16u8)__msa_ld_b((void*)s, 32); - inp3 = (v16u8)__msa_ld_b((void*)t, 0); - inp4 = (v16u8)__msa_ld_b((void*)t, 16); - inp5 = (v16u8)__msa_ld_b((void*)t, 32); - src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); - src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); - src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); - src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); - src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); - src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); - src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); - src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); - src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); - src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); - src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); - src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); - src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); - src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); - vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); - vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); - vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); - vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); - vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); - reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); - reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); - reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); - reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); - reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); - reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); - reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 += const_0x0001; - reg1 += const_0x0001; - reg2 += const_0x0001; - reg3 += const_0x0001; - reg0 = __msa_srai_h((v8i16)reg0, 1); - 
reg1 = __msa_srai_h((v8i16)reg1, 1); - reg2 = __msa_srai_h((v8i16)reg2, 1); - reg3 = __msa_srai_h((v8i16)reg3, 1); - vec4 = (v8u16)__msa_pckev_h(reg1, reg0); - vec5 = (v8u16)__msa_pckev_h(reg3, reg2); - vec6 = (v8u16)__msa_pckod_h(reg1, reg0); - vec7 = (v8u16)__msa_pckod_h(reg3, reg2); - vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); - vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); - vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); - vec3 = vec0 * const_0x70; - vec4 = vec1 * const_0x4A; - vec5 = vec2 * const_0x26; - vec2 *= const_0x70; - vec1 *= const_0x5E; - vec0 *= const_0x12; - reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); - reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); - reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); - reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); - reg0 += reg1; - reg2 += reg3; - reg0 = __msa_srai_h(reg0, 8); - reg2 = __msa_srai_h(reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - t += 48; - s += 48; - dst_u += 8; - dst_v += 8; - } -} - -void RAWToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - int64_t res0, res1; - v16u8 inp0, inp1, inp2, inp3, inp4, inp5; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 reg0, reg1, reg2, reg3; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); - v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - inp0 = (v16u8)__msa_ld_b((void*)s, 0); - inp1 = (v16u8)__msa_ld_b((void*)s, 16); - inp2 = (v16u8)__msa_ld_b((void*)s, 32); - inp3 = (v16u8)__msa_ld_b((void*)t, 0); - inp4 = (v16u8)__msa_ld_b((void*)t, 16); - inp5 = (v16u8)__msa_ld_b((void*)t, 32); - src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); - src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); - src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); - src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); - src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); - src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); - src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); - src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); - src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); - src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); - src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); - src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); - src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); - src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); - vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); - vec7 
= (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); - vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); - vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); - vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); - reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); - reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); - reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); - reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); - reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); - reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); - reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 += const_0x0001; - reg1 += const_0x0001; - reg2 += const_0x0001; - reg3 += const_0x0001; - reg0 = __msa_srai_h(reg0, 1); - reg1 = __msa_srai_h(reg1, 1); - reg2 = __msa_srai_h(reg2, 1); - reg3 = __msa_srai_h(reg3, 1); - vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); - vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); - vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); - vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); - vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); - vec3 = vec0 * const_0x70; - vec4 = vec1 * const_0x4A; - vec5 = vec2 * const_0x26; - vec2 *= const_0x70; - vec1 *= const_0x5E; - vec0 *= const_0x12; - reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); - reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); - reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); - reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); - reg0 += reg1; - reg2 += reg3; - reg0 = __msa_srai_h(reg0, 8); - reg2 = __msa_srai_h(reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - t += 48; - s += 48; - dst_u += 8; - dst_v += 8; - } -} - -void NV12ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint64_t val0, val1; - v16u8 src0, src1, res0, res1, dst0, dst1; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v16u8 zero = {0}; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 const_0x80 = __msa_ldi_h(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - val0 = LD(src_y); - val1 = LD(src_uv); - src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); - src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); - res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_y += 8; - src_uv += 8; - dst_argb += 32; - } -} - -void NV12ToRGB565Row_MSA(const uint8_t* src_y, 
- const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint64_t val0, val1; - v16u8 src0, src1, dst0; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v8i16 const_0x80 = __msa_ldi_h(0x80); - v16u8 zero = {0}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - val0 = LD(src_y); - val1 = LD(src_uv); - src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); - src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - vec0 = vec0 >> 3; - vec1 = (vec1 >> 2) << 5; - vec2 = (vec2 >> 3) << 11; - dst0 = (v16u8)(vec0 | vec1 | vec2); - ST_UB(dst0, dst_rgb565); - src_y += 8; - src_uv += 8; - dst_rgb565 += 16; - } -} - -void NV21ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint64_t val0, val1; - v16u8 src0, src1, res0, res1, dst0, dst1; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v16u8 zero = {0}; - v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - v8i16 const_0x80 = __msa_ldi_h(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - val0 = LD(src_y); - val1 = LD(src_vu); - src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); - src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); - res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_y += 8; - src_vu += 8; - dst_argb += 32; - } -} - -void SobelRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; - v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; - v16i8 const_0x4 = __msa_ldi_b(0x4); - v16i8 mask1 = mask0 + const_0x4; - v16i8 mask2 = mask1 + const_0x4; - v16i8 mask3 = mask2 + const_0x4; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); - vec0 = __msa_adds_u_b(src0, src1); - dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); - dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); - dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); - dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_sobelx += 16; - src_sobely += 16; - dst_argb += 64; - } -} - -void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - src0 = 
(v16u8)__msa_ld_b((void*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16); - src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0); - src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16); - dst0 = __msa_adds_u_b(src0, src2); - dst1 = __msa_adds_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_y, 16); - src_sobelx += 32; - src_sobely += 32; - dst_y += 32; - } -} - -void SobelXYRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, vec0, vec1, vec2; - v16u8 reg0, reg1, dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); - vec0 = __msa_adds_u_b(src0, src1); - vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); - vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); - reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); - reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); - dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); - dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); - dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_sobelx += 16; - src_sobely += 16; - dst_argb += 64; - } -} - -void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); - v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D); - v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); - v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); - v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); - v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); - 
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void ARGBToUVJRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - v8u16 src0, src1, src2, src3, src4, src5, src6, src7; - v8u16 vec0, vec1, vec2, vec3; - v8u16 dst0, dst1, dst2, dst3; - v16u8 zero = {0}; - v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15}; - v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; - v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15}; - v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; - v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f); - v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080); - v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a); - v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a); - v4i32 shift = __msa_fill_w(0x00000008); - - for (x = 0; x < width; x += 32) { - src1 = __msa_ld_b((void*)s, 0); - src3 = __msa_ld_b((void*)s, 16); - src5 = __msa_ld_b((void*)t, 0); - src7 = __msa_ld_b((void*)t, 16); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec0 = __msa_aver_u_h(src4, src5); - vec1 = __msa_aver_u_h(src6, src7); - - src1 = __msa_ld_b((void*)s, 32); - src3 = __msa_ld_b((void*)s, 48); - src5 = __msa_ld_b((void*)t, 32); - src7 = __msa_ld_b((void*)t, 48); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec2 = __msa_aver_u_h(src4, src5); - vec3 = __msa_aver_u_h(src6, src7); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, - const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, - shuffler2, shuffler3, shift, dst0, dst1); - - src1 = __msa_ld_b((void*)s, 64); - src3 = __msa_ld_b((void*)s, 80); - src5 = __msa_ld_b((void*)t, 64); - src7 = __msa_ld_b((void*)t, 80); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec0 = __msa_aver_u_h(src4, src5); - vec1 = 
__msa_aver_u_h(src6, src7); - - src1 = __msa_ld_b((void*)s, 96); - src3 = __msa_ld_b((void*)s, 112); - src5 = __msa_ld_b((void*)t, 96); - src7 = __msa_ld_b((void*)t, 112); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec2 = __msa_aver_u_h(src4, src5); - vec3 = __msa_aver_u_h(src6, src7); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, - const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, - shuffler2, shuffler3, shift, dst2, dst3); - - dst0 = (v8u16)__msa_pckev_b(dst2, dst0); - dst1 = (v8u16)__msa_pckev_b(dst3, dst1); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - s += 128; - t += 128; - dst_v += 16; - dst_u += 16; - } -} - -void BGRAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - const uint8_t unused = 0xf; - v8u16 src0, src1, src2, src3; - v16u8 dst0, dst1; - v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused}; - v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15}; - v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused}; - v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14}; - v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); - v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); - v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); - v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 16) { - READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); - ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, - const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, - shuffler3, dst0, dst1); - *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); - *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); - s += 64; - t += 64; - dst_u += 8; - dst_v += 8; - } -} - -void ABGRToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - const uint8_t unused = 0xf; - v8u16 src0, src1, src2, src3; - v16u8 dst0, dst1; - v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused}; - v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; - v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused}; - v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; - v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); - v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); - v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); - v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 16) { - READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); - ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, - const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, - shuffler3, dst0, dst1); - *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); - *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); - s += 64; - t += 64; - dst_u += 8; - dst_v += 8; - } -} - -void 
RGBAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - const uint8_t unused = 0xf; - v8u16 src0, src1, src2, src3; - v16u8 dst0, dst1; - v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused}; - v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13}; - v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused}; - v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14}; - v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); - v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); - v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); - v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 16) { - READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); - ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, - const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, - shuffler3, dst0, dst1); - *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); - *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); - s += 64; - t += 64; - dst_u += 8; - dst_v += 8; - } -} - -void I444ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0, dst1; - v8i16 vec0, vec1, vec2; - v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 zero = {0}; - v4i32 const_0x80 = __msa_fill_w(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - - for (x = 0; x < width; x += 8) { - READI444(src_y, src_u, src_v, src0, src1, src2); - vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); - reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); - reg0 *= vec_yg; - reg1 *= vec_yg; - reg0 = __msa_srai_w(reg0, 16); - reg1 = __msa_srai_w(reg1, 16); - reg0 += vec_yb; - reg1 += vec_yb; - vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); - vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); - reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); - reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); - reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); - reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); - reg6 -= const_0x80; - reg7 -= const_0x80; - reg8 -= const_0x80; - reg9 -= const_0x80; - tmp0 = reg0 + reg6 * vec_ub; - tmp1 = reg1 + reg7 * vec_ub; - tmp2 = reg0 + reg8 * vec_vr; - tmp3 = reg1 + reg9 * vec_vr; - tmp4 = reg6 * vec_ug; - tmp5 = reg7 * vec_ug; - tmp4 += reg8 * vec_vg; - tmp5 += reg9 * vec_vg; - tmp4 = reg0 - tmp4; - tmp5 = reg1 - tmp5; - reg0 = __msa_srai_w(tmp0, 6); - reg1 = __msa_srai_w(tmp1, 6); - reg2 = __msa_srai_w(tmp2, 6); - reg3 = __msa_srai_w(tmp3, 6); - reg4 = __msa_srai_w(tmp4, 6); - reg5 = __msa_srai_w(tmp5, 6); - CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); - vec2 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); - dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); - dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); - 
ST_UB2(dst0, dst1, dst_argb, 16); - src_y += 8; - src_u += 8; - src_v += 8; - dst_argb += 32; - } -} - -// TODO - respect YuvConstants -void I400ToARGBRow_MSA(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; -#if defined(__aarch64__) || defined(__arm__) - int ygb = yuvconstants->kUVBiasBGR[3]; - int yg = yuvconstants->kYToRgb[1]; -#else - int ygb = yuvconstants->kYBiasToRgb[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; - v8i16 vec0, vec1; - v4i32 reg0, reg1, reg2, reg3; - v4i32 vec_yg = __msa_fill_w(yg); - v8i16 vec_ygb = __msa_fill_h(ygb); - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 max = __msa_ldi_h(0xFF); - v8i16 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y, 0); - vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - reg0 = (v4i32)__msa_ilvr_h(zero, vec0); - reg1 = (v4i32)__msa_ilvl_h(zero, vec0); - reg2 = (v4i32)__msa_ilvr_h(zero, vec1); - reg3 = (v4i32)__msa_ilvl_h(zero, vec1); - reg0 *= vec_yg; - reg1 *= vec_yg; - reg2 *= vec_yg; - reg3 *= vec_yg; - reg0 = __msa_srai_w(reg0, 16); - reg1 = __msa_srai_w(reg1, 16); - reg2 = __msa_srai_w(reg2, 16); - reg3 = __msa_srai_w(reg3, 16); - vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec0 += vec_ygb; - vec1 += vec_ygb; - vec0 = __msa_srai_h(vec0, 6); - vec1 = __msa_srai_h(vec1, 6); - vec0 = __msa_maxi_s_h(vec0, 0); - vec1 = __msa_maxi_s_h(vec1, 0); - vec0 = __msa_min_s_h(max, vec0); - vec1 = __msa_min_s_h(max, vec1); - res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); - res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); - res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); - res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); - dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); - dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_y += 16; - dst_argb += 64; - } -} - -void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { - int x; - v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y, 0); - vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); - vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); - dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_y += 16; - dst_argb += 64; - } -} - -void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 const_0x80 = __msa_ldi_h(0x80); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, 
vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0); - src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); - src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, dst_argb); - src_yuy2 += 16; - dst_argb += 32; - } -} - -void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; - v4i32 vec_ubvr, vec_ugvg; - v8i16 const_0x80 = __msa_ldi_h(0x80); - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0); - src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); - src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, dst_argb); - src_uyvy += 16; - dst_argb += 32; - } -} - -void InterpolateRow_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int32_t source_y_fraction) { - int32_t y1_fraction = source_y_fraction; - int32_t y0_fraction = 256 - y1_fraction; - uint16_t y_fractions; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3, y_frac; - - if (0 == y1_fraction) { - memcpy(dst_ptr, src_ptr, width); - return; - } - - if (128 == y1_fraction) { - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)t, 0); - src3 = (v16u8)__msa_ld_b((void*)t, 16); - dst0 = __msa_aver_u_b(src0, src2); - dst1 = __msa_aver_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_ptr, 16); - s += 32; - t += 32; - dst_ptr += 32; - } - return; - } - - y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); - y_frac = (v8u16)__msa_fill_h(y_fractions); - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)t, 0); - src3 = (v16u8)__msa_ld_b((void*)t, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); - vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); - vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); - vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); - vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); - vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); - vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - ST_UB2(dst0, dst1, dst_ptr, 16); - s += 32; - t += 32; - dst_ptr += 32; - } -} - -void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { - int x; - v4i32 dst0 = 
__builtin_msa_fill_w(v32); - - for (x = 0; x < width; x += 4) { - ST_UB(dst0, dst_argb); - dst_argb += 16; - } -} - -void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - int x; - v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; - v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; - v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, - 18, 17, 16, 21, 20, 19, 24, 23}; - v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, - 24, 23, 28, 27, 26, 31, 30, 29}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); - src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); - src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); - src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); - src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); - ST_UB2(dst0, dst1, dst_rgb24, 16); - ST_UB(dst2, (dst_rgb24 + 32)); - src_raw += 48; - dst_rgb24 += 48; - } -} - -void MergeUVRow_MSA(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - int x; - v16u8 src0, src1, dst0, dst1; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_u, 0); - src1 = (v16u8)__msa_ld_b((void*)src_v, 0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); - ST_UB2(dst0, dst1, dst_uv, 16); - src_u += 16; - src_v += 16; - dst_uv += 32; - } -} - -void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - int i; - v16u8 src0, src1, src2, src3, vec0, vec1, dst0; - - for (i = 0; i < width; i += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_a); - src_argb += 64; - dst_a += 16; - } -} - -void ARGBBlendRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 vec8, vec9, vec10, vec11, vec12, vec13; - v8u16 const_256 = (v8u16)__msa_ldi_h(256); - v16u8 const_255 = (v16u8)__msa_ldi_b(255); - v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); - vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); - vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); - vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); - vec8 = (v8u16)__msa_fill_h(vec0[3]); - vec9 = (v8u16)__msa_fill_h(vec0[7]); - vec10 = (v8u16)__msa_fill_h(vec1[3]); - vec11 = (v8u16)__msa_fill_h(vec1[7]); - vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); - vec9 = 
(v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); - vec10 = (v8u16)__msa_fill_h(vec2[3]); - vec11 = (v8u16)__msa_fill_h(vec2[7]); - vec12 = (v8u16)__msa_fill_h(vec3[3]); - vec13 = (v8u16)__msa_fill_h(vec3[7]); - vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); - vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); - vec8 = const_256 - vec8; - vec9 = const_256 - vec9; - vec10 = const_256 - vec10; - vec11 = const_256 - vec11; - vec8 *= vec4; - vec9 *= vec5; - vec10 *= vec6; - vec11 *= vec7; - vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); - vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); - vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); - vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8); - dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); - dst0 = (v16u8)__msa_adds_u_b(dst0, dst2); - dst1 = (v16u8)__msa_adds_u_b(dst1, dst3); - dst0 = __msa_bmnz_v(dst0, const_255, mask); - dst1 = __msa_bmnz_v(dst1, const_255, mask); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBQuantizeRow_MSA(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; - v4i32 vec_scale = __msa_fill_w(scale); - v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); - v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); - v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48); - vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); - vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); - vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); - vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); - tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); - tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); - tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); - tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); - tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); - tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); - tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); - tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); - tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); - tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); - tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); - tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); - tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); - tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); - tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); - tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); - tmp0 *= vec_scale; - tmp1 *= vec_scale; - tmp2 *= vec_scale; - tmp3 *= vec_scale; - tmp4 *= vec_scale; - tmp5 *= vec_scale; - tmp6 *= vec_scale; - tmp7 *= vec_scale; - tmp8 *= vec_scale; - tmp9 *= vec_scale; - 
tmp10 *= vec_scale; - tmp11 *= vec_scale; - tmp12 *= vec_scale; - tmp13 *= vec_scale; - tmp14 *= vec_scale; - tmp15 *= vec_scale; - tmp0 >>= 16; - tmp1 >>= 16; - tmp2 >>= 16; - tmp3 >>= 16; - tmp4 >>= 16; - tmp5 >>= 16; - tmp6 >>= 16; - tmp7 >>= 16; - tmp8 >>= 16; - tmp9 >>= 16; - tmp10 >>= 16; - tmp11 >>= 16; - tmp12 >>= 16; - tmp13 >>= 16; - tmp14 >>= 16; - tmp15 >>= 16; - vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); - vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); - vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); - vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); - vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); - vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - dst0 *= vec_int_sz; - dst1 *= vec_int_sz; - dst2 *= vec_int_sz; - dst3 *= vec_int_sz; - dst0 += vec_int_ofst; - dst1 += vec_int_ofst; - dst2 += vec_int_ofst; - dst3 += vec_int_ofst; - dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); - dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); - dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); - dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - dst_argb += 64; - } -} - -void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - int32_t x; - v16i8 src0; - v16u8 src1, src2, dst0, dst1; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; - v16i8 zero = {0}; - v8i16 max = __msa_ldi_h(255); - - src0 = __msa_ld_b((void*)matrix_argb, 0); - vec0 = (v8i16)__msa_ilvr_b(zero, src0); - vec1 = (v8i16)__msa_ilvl_b(zero, src0); - - for (x = 0; x < width; x += 8) { - src1 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); - vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); - vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); - vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); - vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); - vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); - vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); - vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); - vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); - vec10 = vec2 * vec0; - vec11 = vec2 * vec1; - vec12 = vec6 * vec0; - vec13 = vec6 * vec1; - tmp0 = __msa_hadd_s_w(vec10, vec10); - tmp1 = __msa_hadd_s_w(vec11, vec11); - tmp2 = __msa_hadd_s_w(vec12, vec12); - tmp3 = __msa_hadd_s_w(vec13, vec13); - vec14 = vec3 * vec0; - vec15 = vec3 * vec1; - vec16 = vec7 * vec0; - vec17 = vec7 * vec1; - tmp4 = __msa_hadd_s_w(vec14, vec14); - tmp5 = __msa_hadd_s_w(vec15, vec15); - tmp6 = __msa_hadd_s_w(vec16, vec16); - tmp7 = __msa_hadd_s_w(vec17, vec17); - vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec11 = __msa_pckev_h((v8i16)tmp3, 
(v8i16)tmp2); - vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); - vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); - tmp0 = __msa_hadd_s_w(vec10, vec10); - tmp1 = __msa_hadd_s_w(vec11, vec11); - tmp2 = __msa_hadd_s_w(vec12, vec12); - tmp3 = __msa_hadd_s_w(vec13, vec13); - tmp0 = __msa_srai_w(tmp0, 6); - tmp1 = __msa_srai_w(tmp1, 6); - tmp2 = __msa_srai_w(tmp2, 6); - tmp3 = __msa_srai_w(tmp3, 6); - vec2 = vec4 * vec0; - vec6 = vec4 * vec1; - vec3 = vec8 * vec0; - vec7 = vec8 * vec1; - tmp8 = __msa_hadd_s_w(vec2, vec2); - tmp9 = __msa_hadd_s_w(vec6, vec6); - tmp10 = __msa_hadd_s_w(vec3, vec3); - tmp11 = __msa_hadd_s_w(vec7, vec7); - vec4 = vec5 * vec0; - vec8 = vec5 * vec1; - vec5 = vec9 * vec0; - vec9 = vec9 * vec1; - tmp12 = __msa_hadd_s_w(vec4, vec4); - tmp13 = __msa_hadd_s_w(vec8, vec8); - tmp14 = __msa_hadd_s_w(vec5, vec5); - tmp15 = __msa_hadd_s_w(vec9, vec9); - vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); - vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); - vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); - vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); - tmp4 = __msa_hadd_s_w(vec14, vec14); - tmp5 = __msa_hadd_s_w(vec15, vec15); - tmp6 = __msa_hadd_s_w(vec16, vec16); - tmp7 = __msa_hadd_s_w(vec17, vec17); - tmp4 = __msa_srai_w(tmp4, 6); - tmp5 = __msa_srai_w(tmp5, 6); - tmp6 = __msa_srai_w(tmp6, 6); - tmp7 = __msa_srai_w(tmp7, 6); - vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); - vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); - vec10 = __msa_maxi_s_h(vec10, 0); - vec11 = __msa_maxi_s_h(vec11, 0); - vec12 = __msa_maxi_s_h(vec12, 0); - vec13 = __msa_maxi_s_h(vec13, 0); - vec10 = __msa_min_s_h(vec10, max); - vec11 = __msa_min_s_h(vec11, max); - vec12 = __msa_min_s_h(vec12, max); - vec13 = __msa_min_s_h(vec13, max); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void SplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)src_uv, 0); - src1 = (v16u8)__msa_ld_b((void*)src_uv, 16); - src2 = (v16u8)__msa_ld_b((void*)src_uv, 32); - src3 = (v16u8)__msa_ld_b((void*)src_uv, 48); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_u, 16); - ST_UB2(dst2, dst3, dst_v, 16); - src_uv += 64; - dst_u += 32; - dst_v += 32; - } -} - -void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { - int x; - v16u8 dst0 = (v16u8)__msa_fill_b(v8); - - for (x = 0; x < width; x += 16) { - ST_UB(dst0, dst); - dst += 16; - } -} - -void MirrorSplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; - v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; - - src_uv += (2 * width); - - for (x = 0; x < width; x += 32) { - src_uv -= 64; - src2 = (v16u8)__msa_ld_b((void*)src_uv, 0); - src3 = (v16u8)__msa_ld_b((void*)src_uv, 16); - src0 = (v16u8)__msa_ld_b((void*)src_uv, 32); - src1 = 
(v16u8)__msa_ld_b((void*)src_uv, 48); - dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); - dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_v, 16); - ST_UB2(dst2, dst3, dst_u, 16); - dst_u += 32; - dst_v += 32; - } -} - -void SobelXRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int32_t width) { - int x; - v16u8 src0, src1, src2, src3, src4, src5, dst0; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; - v16i8 tmp = __msa_ldi_b(8); - v16i8 mask1 = mask0 + tmp; - v8i16 zero = {0}; - v8i16 max = __msa_ldi_h(255); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_y0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_y1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_y1, 16); - src4 = (v16u8)__msa_ld_b((void*)src_y2, 0); - src5 = (v16u8)__msa_ld_b((void*)src_y2, 16); - vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); - vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); - vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); - vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); - vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); - vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); - vec0 += vec2; - vec1 += vec3; - vec4 += vec2; - vec5 += vec3; - vec0 += vec4; - vec1 += vec5; - vec0 = __msa_add_a_h(zero, vec0); - vec1 = __msa_add_a_h(zero, vec1); - vec0 = __msa_maxi_s_h(vec0, 0); - vec1 = __msa_maxi_s_h(vec1, 0); - vec0 = __msa_min_s_h(max, vec0); - vec1 = __msa_min_s_h(max, vec1); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_sobelx); - src_y0 += 16; - src_y1 += 16; - src_y2 += 16; - dst_sobelx += 16; - } -} - -void SobelYRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int32_t width) { - int x; - v16u8 src0, src1, dst0; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; - v8i16 zero = {0}; - v8i16 max = __msa_ldi_h(255); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_y1, 0); - vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); - vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); - vec0 -= vec2; - vec1 -= vec3; - vec6[0] = src_y0[16] - src_y1[16]; - vec6[1] = src_y0[17] - src_y1[17]; - vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); - vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); - vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); - vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); - vec0 += vec2; - vec1 += vec3; - vec4 += vec2; - vec5 += vec3; - vec0 += vec4; - vec1 += vec5; - vec0 = __msa_add_a_h(zero, vec0); - vec1 = __msa_add_a_h(zero, vec1); - vec0 = __msa_maxi_s_h(vec0, 0); - vec1 = __msa_maxi_s_h(vec1, 0); - vec0 = __msa_min_s_h(max, vec0); - vec1 = 
__msa_min_s_h(max, vec1); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_sobely); - src_y0 += 16; - src_y1 += 16; - dst_sobely += 16; - } -} - -void HalfFloatRow_MSA(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - int i; - v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; - v4f32 mult_vec; - v8i16 zero = {0}; - mult_vec[0] = 1.9259299444e-34f * scale; - mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); - - for (i = 0; i < width; i += 32) { - src0 = (v8u16)__msa_ld_h((void*)src, 0); - src1 = (v8u16)__msa_ld_h((void*)src, 16); - src2 = (v8u16)__msa_ld_h((void*)src, 32); - src3 = (v8u16)__msa_ld_h((void*)src, 48); - vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); - vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); - vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); - vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); - vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); - vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); - vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); - vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); - fvec0 = __msa_ffint_u_w(vec0); - fvec1 = __msa_ffint_u_w(vec1); - fvec2 = __msa_ffint_u_w(vec2); - fvec3 = __msa_ffint_u_w(vec3); - fvec4 = __msa_ffint_u_w(vec4); - fvec5 = __msa_ffint_u_w(vec5); - fvec6 = __msa_ffint_u_w(vec6); - fvec7 = __msa_ffint_u_w(vec7); - fvec0 *= mult_vec; - fvec1 *= mult_vec; - fvec2 *= mult_vec; - fvec3 *= mult_vec; - fvec4 *= mult_vec; - fvec5 *= mult_vec; - fvec6 *= mult_vec; - fvec7 *= mult_vec; - vec0 = ((v4u32)fvec0) >> 13; - vec1 = ((v4u32)fvec1) >> 13; - vec2 = ((v4u32)fvec2) >> 13; - vec3 = ((v4u32)fvec3) >> 13; - vec4 = ((v4u32)fvec4) >> 13; - vec5 = ((v4u32)fvec5) >> 13; - vec6 = ((v4u32)fvec6) >> 13; - vec7 = ((v4u32)fvec7) >> 13; - dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); - dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); - dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); - dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); - ST_UH2(dst0, dst1, dst, 8); - ST_UH2(dst2, dst3, dst + 16, 8); - src += 32; - dst += 32; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/drivers/media/pci/tbscapture2/row_neon.c b/drivers/media/pci/tbscapture2/row_neon.c deleted file mode 100644 index 30985324d323..000000000000 --- a/drivers/media/pci/tbscapture2/row_neon.c +++ /dev/null @@ -1,3981 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are -// reserved. - -// q0: Y uint16x8_t -// d2: U uint8x8_t -// d3: V uint8x8_t - -// Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.32 {d2[0]}, [%[src_u]]! \n" \ - "vld1.32 {d2[1]}, [%[src_v]]! 
\n" \ - "vmov.u8 d1, d0 \n" \ - "vmovl.u8 q1, d2 \n" \ - "vzip.u8 d0, d1 \n" \ - "vsli.u16 q1, q1, #8 \n" - -// Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.8 {d2}, [%[src_u]]! \n" \ - "vmovl.u8 q0, d0 \n" \ - "vld1.8 {d3}, [%[src_v]]! \n" \ - "vsli.u16 q0, q0, #8 \n" - -// Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vmov.u8 q1, #128 \n" \ - "vmovl.u8 q0, d0 \n" \ - "vsli.u16 q0, q0, #8 \n" - -// Read 8 Y and 4 UV from NV12 -#define READNV12 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.8 {d2}, [%[src_uv]]! \n" \ - "vmov.u8 d1, d0 \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d0, d1 \n" \ - "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \ - "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */ - -// Read 8 Y and 4 VU from NV21 -#define READNV21 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.8 {d2}, [%[src_vu]]! \n" \ - "vmov.u8 d1, d0 \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d0, d1 \n" \ - "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \ - "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */ - -// Read 8 YUY2 -#define READYUY2 \ - "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \ - "vmovl.u8 q0, d0 \n" \ - "vmov.u8 d3, d2 \n" \ - "vsli.u16 q0, q0, #8 \n" \ - "vsli.u16 d2, d2, #8 \n" \ - "vsri.u16 d3, d3, #8 \n" - -// Read 8 UYVY -#define READUYVY \ - "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \ - "vmovl.u8 q0, d3 \n" \ - "vmov.u8 d3, d2 \n" \ - "vsli.u16 q0, q0, #8 \n" \ - "vsli.u16 d2, d2, #8 \n" \ - "vsri.u16 d3, d3, #8 \n" - -// TODO: Use single register for kUVCoeff and multiply by lane -#define YUVTORGB_SETUP \ - "vld1.16 {d31}, [%[kRGBCoeffBias]] \n" \ - "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ - "vdup.u16 q10, d31[1] \n" \ - "vdup.u16 q11, d31[2] \n" \ - "vdup.u16 q12, d31[3] \n" \ - "vdup.u16 d31, d31[0] \n" - -// q0: B uint16x8_t -// q1: G uint16x8_t -// q2: R uint16x8_t - -// Convert from YUV to 2.14 fixed point RGB -#define YUVTORGB \ - "vmull.u16 q2, d1, d31 \n" \ - "vmull.u8 q8, d3, d29 \n" /* DGV */ \ - "vmull.u16 q0, d0, d31 \n" \ - "vmlal.u8 q8, d2, d28 \n" /* DG */ \ - "vqshrn.u32 d0, q0, #16 \n" \ - "vqshrn.u32 d1, q2, #16 \n" /* Y */ \ - "vmull.u8 q9, d2, d26 \n" /* DB */ \ - "vmull.u8 q2, d3, d27 \n" /* DR */ \ - "vadd.u16 q4, q0, q11 \n" /* G */ \ - "vadd.u16 q2, q0, q2 \n" /* R */ \ - "vadd.u16 q0, q0, q9 \n" /* B */ \ - "vqsub.u16 q1, q4, q8 \n" /* G */ \ - "vqsub.u16 q0, q0, q10 \n" /* B */ \ - "vqsub.u16 q2, q2, q12 \n" /* R */ - -// Convert from 2.14 fixed point RGB To 8 bit RGB -#define RGBTORGB8 \ - "vqshrn.u16 d4, q2, #6 \n" /* R */ \ - "vqshrn.u16 d2, q1, #6 \n" /* G */ \ - "vqshrn.u16 d0, q0, #6 \n" /* B */ - -#define YUVTORGB_REGS \ - "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31" - -#define STORERGBA \ - "vmov.u8 d1, d0 \n" \ - "vmov.u8 d3, d4 \n" \ - "vmov.u8 d0, d6 \n" \ - "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n" - -void I444ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I444ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void I422ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I444AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "vld1.8 {d6}, [%[src_a]]! \n" - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "vld1.8 {d6}, [%[src_a]]! \n" - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422ToRGBARow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTORGB565 \ - "vshll.u8 q2, d4, #8 \n" /* R */ \ - "vshll.u8 q1, d2, #8 \n" /* G */ \ - "vshll.u8 q0, d0, #8 \n" /* B */ \ - "vsri.16 q2, q1, #5 \n" /* RG */ \ - "vsri.16 q2, q0, #11 \n" /* RGB */ - -void I422ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 - "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTOARGB1555 \ - "vshll.u8 q3, d6, #8 \n" /* A */ \ - "vshll.u8 q2, d4, #8 \n" /* R */ \ - "vshll.u8 q1, d2, #8 \n" /* G */ \ - "vshll.u8 q0, d0, #8 \n" /* B */ \ - "vsri.16 q3, q2, #1 \n" /* AR */ \ - "vsri.16 q3, q1, #6 \n" /* ARG */ \ - "vsri.16 q3, q0, #11 \n" /* ARGB */ - -void I422ToARGB1555Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 - "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. 
- "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "q3"); -} - -#define ARGBTOARGB4444 \ - "vshr.u8 d0, d0, #4 \n" /* B */ \ - "vbic.32 d2, d2, d7 \n" /* G */ \ - "vshr.u8 d4, d4, #4 \n" /* R */ \ - "vbic.32 d6, d6, d7 \n" /* A */ \ - "vorr d0, d0, d2 \n" /* BG */ \ - "vorr d1, d4, d6 \n" /* RA */ \ - "vzip.u8 d0, d1 \n" /* BGRA */ - -void I422ToARGB4444Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "vmov.u8 d7, #0x0f \n" // vbic bits to clear - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" ARGBTOARGB4444 - "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "q3"); -} - -void I400ToARGBRow_NEON(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV400 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d23, #255 \n" - "1: \n" - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d20", "d21", "d22", "d23"); -} - -void NV12ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void NV21ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_vu] "+r"(src_vu), // %[src_vu] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void NV12ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void NV21ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_vu] "+r"(src_vu), // %[src_vu] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void NV12ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" ARGBTORGB565 - "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUY2 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READUYVY YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" - "bgt 1b \n" - : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store U - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// Reads 16 byte Y's from tile and writes out 16 Y's. -// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes -// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes -// width measured in bytes so 8 UV = 16. -void DetileRow_NEON(const uint8_t* src, - ptrdiff_t src_tile_stride, - uint8_t* dst, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes - "subs %2, %2, #16 \n" // 16 processed per loop - "pld [%0, #1792] \n" - "vst1.8 {q0}, [%1]! \n" // store 16 bytes - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(src_tile_stride) // %3 - : "cc", "memory", "q0" // Clobber List - ); -} - -// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. -void DetileRow_16_NEON(const uint16_t* src, - ptrdiff_t src_tile_stride, - uint16_t* dst, - int width) { - asm volatile ( - "1: \n" - "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "pld [%0, #3584] \n" - "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(src_tile_stride * 2) // %3 - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. -void DetileSplitUVRow_NEON(const uint8_t* src_uv, - ptrdiff_t src_tile_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "1: \n" - "vld2.8 {d0, d1}, [%0], %4 \n" - "subs %3, %3, #16 \n" - "pld [%0, #1792] \n" - "vst1.8 {d0}, [%1]! \n" - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(src_tile_stride) // %4 - : "cc", "memory", "d0", "d1" // Clobber List - ); -} - -#if defined(LIBYUV_USE_ST2) -// Read 16 Y, 8 UV, and write 8 YUYV. -void DetileToYUY2_NEON(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y - "pld [%0, #1792] \n" - "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV - "pld [%1, #1792] \n" - "subs %3, %3, #16 \n" - "vst2.8 {q0, q1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_yuy2), // %2 - "+r"(width) // %3 - : "r"(src_y_tile_stride), // %4 - "r"(src_uv_tile_stride) // %5 - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber list - ); -} -#else -// Read 16 Y, 8 UV, and write 8 YUYV. 
-void DetileToYUY2_NEON(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y - "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV - "subs %3, %3, #16 \n" - "pld [%0, #1792] \n" - "vzip.8 q0, q1 \n" - "pld [%1, #1792] \n" - "vst1.8 {q0, q1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_yuy2), // %2 - "+r"(width) // %3 - : "r"(src_y_tile_stride), // %4 - "r"(src_uv_tile_stride) // %5 - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber list - ); -} -#endif - -void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile ( - "1: \n" - "vld1.8 {q14}, [%0]! \n" // Load lower bits. - "vld1.8 {q9}, [%0]! \n" // Load upper bits row - // by row. - "vld1.8 {q11}, [%0]! \n" - "vld1.8 {q13}, [%0]! \n" - "vld1.8 {q15}, [%0]! \n" - "vshl.u8 q8, q14, #6 \n" // Shift lower bit data - // appropriately. - "vshl.u8 q10, q14, #4 \n" - "vshl.u8 q12, q14, #2 \n" - "vzip.u8 q8, q9 \n" // Interleave upper and - // lower bits. - "vzip.u8 q10, q11 \n" - "vzip.u8 q12, q13 \n" - "vzip.u8 q14, q15 \n" - "vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits - // into lower 6 bits for - // better accuracy in - // conversions. - "vsri.u16 q9, q9, #10 \n" - "vsri.u16 q10, q10, #10 \n" - "vsri.u16 q11, q11, #10 \n" - "vsri.u16 q12, q12, #10 \n" - "vsri.u16 q13, q13, #10 \n" - "vsri.u16 q14, q14, #10 \n" - "vsri.u16 q15, q15, #10 \n" - "vstmia %1!, {q8-q15} \n" // Store pixel block (64 - // pixels). - "subs %2, %2, #80 \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(size) // %2 - : - : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load U - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. -void SplitRGBRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile ( - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB - "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB - "subs %4, %4, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store R - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%3]! \n" // store B - "bgt 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time -void MergeRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q2}, [%2]! \n" // load B - "subs %4, %4, #16 \n" // 16 processed per loop - "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB - "vst3.8 {d1, d3, d5}, [%3]! 
\n" // next 8 RGB - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. -void SplitARGBRow_NEON(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB - "subs %5, %5, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%3]! \n" // store B - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%1]! \n" // store R - "vst1.8 {q3}, [%4]! \n" // store A - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time -void MergeARGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q0}, [%2]! \n" // load B - "vld1.8 {q3}, [%3]! \n" // load A - "subs %5, %5, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. -void SplitXRGBRow_NEON(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB - "subs %4, %4, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%3]! \n" // store B - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%1]! \n" // store R - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time -void MergeXRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile ( - "vmov.u8 q3, #255 \n" // load A(255) - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q0}, [%2]! \n" // load B - "subs %4, %4, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void MergeXR30Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - int shift = 10 - depth; - asm volatile ( - "vmov.u32 q14, #1023 \n" - "vdup.32 q15, %5 \n" - "1: \n" - "vld1.16 {d4}, [%2]! \n" // B - "vld1.16 {d2}, [%1]! 
\n" // G - "vld1.16 {d0}, [%0]! \n" // R - "vmovl.u16 q2, d4 \n" // B - "vmovl.u16 q1, d2 \n" // G - "vmovl.u16 q0, d0 \n" // R - "vshl.u32 q2, q2, q15 \n" // 000B - "vshl.u32 q1, q1, q15 \n" - "vshl.u32 q0, q0, q15 \n" - "vmin.u32 q2, q2, q14 \n" - "vmin.u32 q1, q1, q14 \n" - "vmin.u32 q0, q0, q14 \n" - "vsli.u32 q2, q1, #10 \n" // 00GB - "vsli.u32 q2, q0, #20 \n" // 0RGB - "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) - "subs %4, %4, #4 \n" - "vst1.8 {q2}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); -} - -void MergeXR30Row_10_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int /* depth */, - int width) { - asm volatile ( - "vmov.u32 q14, #1023 \n" - "1: \n" - "vld1.16 {d4}, [%2]! \n" // B - "vld1.16 {d2}, [%1]! \n" // G - "vld1.16 {d0}, [%0]! \n" // R - "vmovl.u16 q2, d4 \n" // 000B - "vmovl.u16 q1, d2 \n" // G - "vmovl.u16 q0, d0 \n" // R - "vmin.u32 q2, q2, q14 \n" - "vmin.u32 q1, q1, q14 \n" - "vmin.u32 q0, q0, q14 \n" - "vsli.u32 q2, q1, #10 \n" // 00GB - "vsli.u32 q2, q0, #20 \n" // 0RGB - "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) - "subs %4, %4, #4 \n" - "vst1.8 {q2}, [%3]! \n" - "bgt 1b \n" - "3: \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "q0", "q1", "q2", "q14"); -} - -void MergeAR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile ( - - "vdup.u16 q15, %6 \n" - "vdup.u16 q14, %7 \n" - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vld1.16 {q3}, [%3]! \n" // A - "vmin.u16 q2, q2, q14 \n" - "vmin.u16 q1, q1, q14 \n" - "vmin.u16 q0, q0, q14 \n" - "vmin.u16 q3, q3, q14 \n" - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "vshl.u16 q3, q3, q15 \n" - "subs %5, %5, #8 \n" - "vst4.16 {d0, d2, d4, d6}, [%4]! \n" - "vst4.16 {d1, d3, d5, d7}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_ar64), // %4 - "+r"(width) // %5 - : "r"(shift), // %6 - "r"(mask) // %7 - : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); -} - -void MergeXR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile ( - - "vmov.u8 q3, #0xff \n" // A (0xffff) - "vdup.u16 q15, %5 \n" - "vdup.u16 q14, %6 \n" - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vmin.u16 q2, q2, q14 \n" - "vmin.u16 q1, q1, q14 \n" - "vmin.u16 q0, q0, q14 \n" - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "subs %4, %4, #8 \n" - "vst4.16 {d0, d2, d4, d6}, [%3]! \n" - "vst4.16 {d1, d3, d5, d7}, [%3]! 
\n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar64), // %3 - "+r"(width) // %4 - : "r"(shift), // %5 - "r"(mask) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); -} - -void MergeARGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile ( - - "vdup.16 q15, %6 \n" - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vld1.16 {q3}, [%3]! \n" // A - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "vshl.u16 q3, q3, q15 \n" - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d1, q1 \n" - "vqmovn.u16 d2, q2 \n" - "vqmovn.u16 d3, q3 \n" - "subs %5, %5, #8 \n" - "vst4.8 {d0, d1, d2, d3}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : "r"(shift) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); -} - -void MergeXRGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile ( - - "vdup.16 q15, %5 \n" - "vmov.u8 d6, #0xff \n" // A (0xff) - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "vqmovn.u16 d5, q2 \n" - "vqmovn.u16 d4, q1 \n" - "vqmovn.u16 d3, q0 \n" - "subs %4, %4, #8 \n" - "vst4.u8 {d3, d4, d5, d6}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "q0", "q1", "q2", "d6", "q15"); -} - -// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// SetRow writes 'width' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile ( - "vdup.8 q0, %2 \n" // duplicate 16 bytes - "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v8) // %2 - : "cc", "memory", "q0"); -} - -// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. -void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v32) // %2 - : "cc", "memory", "q0"); -} - -void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %2 \n" - "sub %0, %0, #32 \n" // 32 bytes per loop - - "1: \n" - "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 - "subs %2, #32 \n" // 32 pixels per loop. - "vrev64.8 q0, q2 \n" - "vrev64.8 q1, q1 \n" - "vswp d0, d1 \n" - "vswp d2, d3 \n" - "vst1.8 {q0, q1}, [%1]! 
\n" // dst += 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(-32) // %3 - : "cc", "memory", "q0", "q1", "q2"); -} - -void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile ( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %2, lsl #1 \n" - "sub %0, #16 \n" - - "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst2.8 {d0, d1}, [%1]! \n" // dst += 16 - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r12", "q0"); -} - -void MirrorSplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" - - "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst1.8 {d0}, [%1]! \n" // dst += 8 - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "r12", "q0"); -} - -void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( - "add %0, %0, %2, lsl #2 \n" - "sub %0, #32 \n" - - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 d0, d0 \n" - "vrev64.8 d1, d1 \n" - "vrev64.8 d2, d2 \n" - "vrev64.8 d3, d3 \n" - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32 - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(-32) // %3 - : "cc", "memory", "d0", "d1", "d2", "d3"); -} - -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - src_rgb24 += width * 3 - 24; - asm volatile ( - "1: \n" - "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 d0, d0 \n" - "vrev64.8 d1, d1 \n" - "vrev64.8 d2, d2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24 - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "r"(-24) // %3 - : "cc", "memory", "d0", "d1", "d2"); -} - -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile ( - "vmov.u8 d0, #255 \n" // Alpha - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d0, d1, d2, d3}, [%1]! 
\n" // store 8 pixels of RGBA. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} -void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile ( - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of - // RGB24. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3" // Clobber List - ); -} - -#define RGB565TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ - "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ - "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ - -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -#define ARGB1555TOARGB \ - "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ - "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ - "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ - "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ - "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ - "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ - "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ - "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ - "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ - "vorr.u8 q1, q1, q3 \n" /* R,A */ \ - "vorr.u8 q0, q0, q2 \n" /* B,G */ - -// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ - "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ - "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ - -void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -#define ARGB4444TOARGB \ - "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ - "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ - "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ - "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ - "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ - "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ - "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ - "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ - -void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst3.8 {d0, d2, d4}, [%1]! \n" // store 16 RGB24 pixels. - "vst3.8 {d1, d3, d5}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile ( - "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d1}, [%1]! \n" // store 8 U. - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d0}, [%1]! \n" // store 8 U. - "vst1.8 {d2}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - "vst1.8 {d1}, [%2]! \n" // store 8 U. - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(stride_yuy2), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", - "d7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_uyvy - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - "vst1.8 {d0}, [%2]! \n" // store 8 U. - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(stride_uyvy), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", - "d7" // Clobber List - ); -} - -void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_uv, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vld2.8 {q2, q3}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 q4, q1, q3 \n" // average rows of UV - "vst1.8 {q4}, [%2]! \n" // store 8 UV. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(stride_yuy2), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", - "d7" // Clobber List - ); -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile ( - "vld1.8 {q2}, [%3] \n" // shuffler - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile ( - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - "vld1.8 {d1}, [%1]! \n" // load 8 Us - "vld1.8 {d3}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. 
- "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3"); -} - -void I422ToUYVYRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile ( - "1: \n" - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - "vld1.8 {d0}, [%1]! \n" // load 8 Us - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3"); -} - -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb565, - int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "d6"); -} - -void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - asm volatile ( - "vdup.32 d7, %2 \n" // dither4 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d7 \n" - "vqadd.u8 d2, d2, d7 \n" - "vqadd.u8 d4, d4, d7 \n" // add for dither - ARGBTORGB565 - "vst1.8 {q2}, [%0]! \n" // store 8 RGB565. - "bgt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb1555, - int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb4444, - int width) { - asm volatile ( - "vmov.u8 d7, #0x0f \n" // bits to clear with - // vbic. - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q3}, [%1]! \n" // store 16 A's. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -struct RgbUVConstants { - uint8_t kRGBToU[4]; - uint8_t kRGBToV[4]; -}; - -// 8x1 pixels. 
-static void ARGBToUV444MatrixRow_NEON( - const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct RgbUVConstants* rgbuvconstants) { - asm volatile ( - - "vld1.8 {d0}, [%4] \n" // load rgbuvconstants - "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient - "vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient - "vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient - "vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient - "vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - - "vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned - "vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned - - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(rgbuvconstants) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", - "q15"); -} - -// RGB to bt601 coefficients -// UB 0.875 coefficient = 112 -// UG -0.5781 coefficient = 74 -// UR -0.2969 coefficient = 38 -// VB -0.1406 coefficient = 18 -// VG -0.7344 coefficient = 94 -// VR 0.875 coefficient = 112 (ignored) - -static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0}, - {18, 94, 112, 0}}; - -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kRgb24I601UVConstants); -} - -// clang-format off -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vaddhn.u16 d0, q8, q15 \n" /* +128 -> unsigned */ \ - "vaddhn.u16 d1, q9, q15 \n" /* +128 -> unsigned */ -// clang-format on - -// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. 
- - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -// TODO(fbarchard): Subsample match Intel code. -void ARGBToUVJRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void ABGRToUVJRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_uj, - uint8_t* dst_vj, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! 
\n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(src_stride_abgr), // %1 - "+r"(dst_uj), // %2 - "+r"(dst_vj), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -// TODO(fbarchard): Subsample match C code. -void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(src_stride_rgb24), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -// TODO(fbarchard): Subsample match C code. -void RAWToUVJRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(src_stride_raw), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void BGRAToUVRow_NEON(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_bgra - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. - "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. - "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q1, q1, #1 \n" // 2x average - "vrshr.u16 q2, q2, #1 \n" - "vrshr.u16 q3, q3, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q3, q2, q1) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(src_stride_bgra), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void ABGRToUVRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_abgr - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(src_stride_abgr), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RGBAToUVRow_NEON(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgba - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. - "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. - "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(src_stride_rgba), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(src_stride_rgb24), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RAWToUVRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(src_stride_raw), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. 
- "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_stride_rgb565), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_stride_argb1555), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
-void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q0, q4, #1 \n" // 2x average - "vrshr.u16 q1, q5, #1 \n" - "vrshr.u16 q2, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_stride_argb4444), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); -} - -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - asm volatile ( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
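/*
 * Scalar sketch of the Y computation used by the RGB565/ARGB1555/ARGB4444
 * ToYRow_NEON functions here: BT.601 limited-range weights 25/129/66 with
 * rounding (vqrshrn #8) and a saturating +16 (vqadd).  Illustrative only.
 */
#include <stdint.h>

static uint8_t rgb_to_y_bt601_sketch(uint8_t r, uint8_t g, uint8_t b) {
  int y = ((25 * b + 129 * g + 66 * r + 128) >> 8) + 16;
  return (uint8_t)(y > 255 ? 255 : y);
}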
- "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); -} - -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - asm volatile ( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); -} - -void ARGBToAR64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q2}, [%0]! \n" - "vmov.u8 q1, q0 \n" - "vmov.u8 q3, q2 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels - "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; - -void ARGBToAB64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile ( - "vld1.8 {q4}, [%3] \n" // shuffler - - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q2}, [%0]! \n" - "vtbl.8 d2, {d0, d1}, d8 \n" - "vtbl.8 d3, {d0, d1}, d9 \n" - "vtbl.8 d6, {d4, d5}, d8 \n" - "vtbl.8 d7, {d4, d5}, d9 \n" - "vmov.u8 q0, q1 \n" - "vmov.u8 q2, q3 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels - "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "r"(&kShuffleARGBToABGR) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -void AR64ToARGBRow_NEON(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile ( - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vld1.16 {q3}, [%0]! \n" - "vshrn.u16 d0, q0, #8 \n" - "vshrn.u16 d1, q1, #8 \n" - "vshrn.u16 d4, q2, #8 \n" - "vshrn.u16 d5, q3, #8 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 4 pixels - "vst1.8 {q2}, [%1]! \n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; - -void AB64ToARGBRow_NEON(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile ( - "vld1.8 {d8}, [%3] \n" // shuffler - - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vld1.16 {q3}, [%0]! \n" - "vtbl.8 d0, {d0, d1}, d8 \n" - "vtbl.8 d1, {d2, d3}, d8 \n" - "vtbl.8 d4, {d4, d5}, d8 \n" - "vtbl.8 d5, {d6, d7}, d8 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 4 pixels - "vst1.8 {q2}, [%1]! 
\n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(&kShuffleAB64ToARGB) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -struct RgbConstants { - uint8_t kRGBToY[4]; - uint16_t kAddY; -}; - -// RGB to JPeg coefficients -// B * 0.1140 coefficient = 29 -// G * 0.5870 coefficient = 150 -// R * 0.2990 coefficient = 77 -// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; - -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; - -// RGB to BT.601 coefficients -// B * 0.1016 coefficient = 25 -// G * 0.5078 coefficient = 129 -// R * 0.2578 coefficient = 66 -// Add 16.5 = 0x1080 - -static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080}; - -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; - -// ARGB expects first 3 values to contain RGB and 4th value is ignored. -static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - asm volatile ( - "vld1.8 {d0}, [%3] \n" // load rgbconstants - "vdup.u8 d20, d0[0] \n" - "vdup.u8 d21, d0[1] \n" - "vdup.u8 d22, d0[2] \n" - "vdup.u16 q12, d0[2] \n" - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop. - "vmull.u8 q8, d0, d20 \n" // B - "vmull.u8 q9, d1, d20 \n" - "vmlal.u8 q8, d2, d21 \n" // G - "vmlal.u8 q9, d3, d21 \n" - "vmlal.u8 q8, d4, d22 \n" // R - "vmlal.u8 q9, d5, d22 \n" - "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y - "vaddhn.u16 d1, q9, q12 \n" - "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(rgbconstants) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", - "q12"); -} - -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); -} - -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); -} - -void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); -} - -void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); -} - -// RGBA expects first value to be A and ignored, then 3 values to contain RGB. -// Same code as ARGB, except the LD4 -static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - asm volatile ( - "vld1.8 {d0}, [%3] \n" // load rgbconstants - "vdup.u8 d20, d0[0] \n" - "vdup.u8 d21, d0[1] \n" - "vdup.u8 d22, d0[2] \n" - "vdup.u16 q12, d0[2] \n" - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop. - "vmull.u8 q8, d2, d20 \n" // B - "vmull.u8 q9, d3, d20 \n" - "vmlal.u8 q8, d4, d21 \n" // G - "vmlal.u8 q9, d5, d21 \n" - "vmlal.u8 q8, d6, d22 \n" // R - "vmlal.u8 q9, d7, d22 \n" - "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y - "vaddhn.u16 d1, q9, q12 \n" - "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. 
- "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(rgbconstants) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", - "q12"); -} - -void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); -} - -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { - RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); -} - -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); -} - -static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - asm volatile ( - "vld1.8 {d0}, [%3] \n" // load rgbconstants - "vdup.u8 d20, d0[0] \n" - "vdup.u8 d21, d0[1] \n" - "vdup.u8 d22, d0[2] \n" - "vdup.u16 q12, d0[2] \n" - "1: \n" - "vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of - // RGB24. - "vld3.8 {d3, d5, d7}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop. - "vmull.u8 q8, d2, d20 \n" // B - "vmull.u8 q9, d3, d20 \n" - "vmlal.u8 q8, d4, d21 \n" // G - "vmlal.u8 q9, d5, d21 \n" - "vmlal.u8 q8, d6, d22 \n" // R - "vmlal.u8 q9, d7, d22 \n" - "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y - "vaddhn.u16 d1, q9, q12 \n" - "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(rgbconstants) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", - "q12"); -} - -void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); -} - -void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); -} - -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); -} - -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); -} - -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! 
\n" - "bgt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction) // %4 - : - : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); -} - -// Bilinear filter 8x2 -> 8x1 -void InterpolateRow_16_NEON(uint16_t* dst_ptr, - const uint16_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint16_t* src_ptr1 = src_ptr + src_stride; - - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "cmp %4, #128 \n" - "beq 50f \n" - - "vdup.16 d17, %4 \n" - "vdup.16 d16, %5 \n" - // General purpose row blend. - "1: \n" - "vld1.16 {q0}, [%1]! \n" - "vld1.16 {q1}, [%2]! \n" - "subs %3, %3, #8 \n" - "vmull.u16 q2, d0, d16 \n" - "vmull.u16 q3, d1, d16 \n" - "vmlal.u16 q2, d2, d17 \n" - "vmlal.u16 q3, d3, d17 \n" - "vrshrn.u32 d0, q2, #8 \n" - "vrshrn.u32 d1, q3, #8 \n" - "vst1.16 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "vld1.16 {q0}, [%1]! \n" - "vld1.16 {q1}, [%2]! \n" - "subs %3, %3, #8 \n" - "vrhadd.u16 q0, q1 \n" - "vst1.16 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "vld1.16 {q0}, [%1]! \n" - "subs %3, %3, #8 \n" - "vst1.16 {q0}, [%0]! \n" - "bgt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width) // %3 - : "r"(y1_fraction), // %4 - "r"(y0_fraction) // %5 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8"); -} - -// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile ( - "subs %3, #8 \n" - "blt 89f \n" - // Blend 8 pixels. - "8: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. - "bge 8b \n" - - "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" - - // Blend 1 pixels. - "1: \n" - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" - - "99: \n" - - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); -} - -// Attenuate 8 pixels at a time. 
-void ARGBAttenuateRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile ( - "vmov.u16 q15, #0x00ff \n" // 255 for rounding up - - // Attenuate 8 pixels. - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a - "vaddhn.u16 d0, q10, q15 \n" // (b + 255) >> 8 - "vaddhn.u16 d1, q11, q15 \n" // (g + 255) >> 8 - "vaddhn.u16 d2, q12, q15 \n" // (r + 255) >> 8 - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15"); -} - -// Quantize 8 ARGB pixels (32 bytes). -// dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile ( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add - - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" - "vqdmulh.s16 q0, q0, q8 \n" // b * scale - "vqdmulh.s16 q1, q1, q8 \n" // g - "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); -} - -// Shade 8 pixels at a time by specified value. -// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. -// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile ( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" - "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 - "vqrdmulh.s16 q11, q11, d0[1] \n" // g - "vqrdmulh.s16 q12, q12, d0[2] \n" // r - "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); -} - -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -// Similar to ARGBToYJ but stores ARGB. 
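/*
 * Scalar sketch of one channel of ARGBAttenuateRow_NEON above: premultiply
 * by alpha as (c * a + 255) >> 8, the vaddhn against the 0x00ff constant.
 */
#include <stdint.h>

static uint8_t attenuate_channel_sketch(uint8_t c, uint8_t a) {
  return (uint8_t)((c * a + 255) >> 8);
}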
-// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; -void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", - "q14", "q15"); -} - -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// TODO(fbarchard): Was same as Sepia except matrix is provided. This function -// needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile ( - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - - "1: \n" - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R - "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
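/*
 * Scalar sketch of ARGBColorMatrixRow_NEON above (channels in memory order
 * B,G,R,A): each output channel is a signed dot product of the input pixel
 * with one row of the 4x4 int8 matrix, then >>6 with unsigned saturation
 * (vqshrun #6).  This sketch accumulates in a plain int and so ignores the
 * intermediate 16-bit saturation of the vqadd steps.
 */
#include <stdint.h>

static void color_matrix_pixel_sketch(const uint8_t in[4], uint8_t out[4],
                                      const int8_t m[16]) {
  int i;
  for (i = 0; i < 4; ++i) {
    int acc = in[0] * m[4 * i + 0] + in[1] * m[4 * i + 1] +
              in[2] * m[4 * i + 2] + in[3] * m[4 * i + 3];
    acc >>= 6;
    out[i] = (uint8_t)(acc < 0 ? 0 : (acc > 255 ? 255 : acc));
  }
}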
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1"); -} - -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1"); -} - -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {d0}, [%0],%5 \n" // top - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%2],%5 \n" // bottom - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2), // %5 - "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {d0}, [%0],%4 \n" // left - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%0],%5 \n" // right - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%2]! \n" // store 8 sobely - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1), // %4 - "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// %y passes a float as a scalar vector for vector * scalar multiply. -// the regoster must be d0 to d15 and indexed with [0] or [1] to access -// the float in the first or second float of the d-reg - -void HalfFloat1Row_NEON(const uint16_t* src, - uint16_t* dst, - float /*unused*/, - int width) { - asm volatile ( - - "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(1.9259299444e-34f) // %3 - : "cc", "memory", "q1", "q2", "q3"); -} - -void HalfFloatRow_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - asm volatile ( - - "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "q1", "q2", "q3"); -} - -void ByteToFloatRow_NEON(const uint8_t* src, - float* dst, - float scale, - int width) { - asm volatile ( - - "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 bytes - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u8 q1, d2 \n" // 8 shorts - "vmovl.u16 q2, d2 \n" // 8 ints - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // scale - "vmul.f32 q3, q3, %y3 \n" - "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale) // %3 - : "cc", "memory", "q1", "q2", "q3"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
-void GaussCol_NEON(const uint16_t* src0, - const uint16_t* src1, - const uint16_t* src2, - const uint16_t* src3, - const uint16_t* src4, - uint32_t* dst, - int width) { - asm volatile ( - "vmov.u16 d6, #4 \n" // constant 4 - "vmov.u16 d7, #6 \n" // constant 6 - - "1: \n" - "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows - "vld1.16 {q2}, [%4]! \n" - "vaddl.u16 q0, d2, d4 \n" // * 1 - "vaddl.u16 q1, d3, d5 \n" // * 1 - "vld1.16 {q2}, [%1]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "vld1.16 {q2}, [%2]! \n" - "vmlal.u16 q0, d4, d7 \n" // * 6 - "vmlal.u16 q1, d5, d7 \n" // * 6 - "vld1.16 {q2}, [%3]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "subs %6, %6, #8 \n" // 8 processed per loop - "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples - "bgt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { - const uint32_t* src1 = src + 1; - const uint32_t* src2 = src + 2; - const uint32_t* src3 = src + 3; - asm volatile ( - "vmov.u32 q10, #4 \n" // constant 4 - "vmov.u32 q11, #6 \n" // constant 6 - - "1: \n" - "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples - "vld1.32 {q2}, [%0] \n" - "vadd.u32 q0, q0, q1 \n" // * 1 - "vadd.u32 q1, q1, q2 \n" // * 1 - "vld1.32 {q2, q3}, [%2]! \n" - "vmla.u32 q0, q2, q11 \n" // * 6 - "vmla.u32 q1, q3, q11 \n" // * 6 - "vld1.32 {q2, q3}, [%1]! \n" - "vld1.32 {q8, q9}, [%3]! \n" - "vadd.u32 q2, q2, q8 \n" // add rows for * 4 - "vadd.u32 q3, q3, q9 \n" - "vmla.u32 q0, q2, q10 \n" // * 4 - "vmla.u32 q1, q3, q10 \n" // * 4 - "subs %5, %5, #8 \n" // 8 processed per loop - "vqshrn.u32 d0, q0, #8 \n" // round and pack - "vqshrn.u32 d1, q1, #8 \n" - "vst1.u16 {q0}, [%4]! \n" // store 8 samples - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(dst), // %4 - "+r"(width) // %5 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); -} - -// Convert biplanar NV21 to packed YUV24 -void NV21ToYUV24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile ( - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load 16 Y values - "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values - "vmov d1, d0 \n" - "vzip.u8 d0, d1 \n" // VV - "vmov d3, d2 \n" - "vzip.u8 d2, d3 \n" // UU - "subs %3, %3, #16 \n" // 16 pixels per loop - "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels - "vst3.8 {d1, d3, d5}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2"); -} - -void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV - // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV - // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV - // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 
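/*
 * Scalar sketch of the 2x2 box average used by the AYUVToUVRow/AYUVToVURow
 * functions here and by HalfMergeUVRow_NEON below: four samples are summed
 * with vpaddl/vpadal, then narrowed with rounding by the #2 shift.
 */
#include <stdint.h>

static uint8_t average_2x2_sketch(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}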
- "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average - "vqrshrun.s16 d0, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_stride_ayuv), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -} - -void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV - // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV - // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV - // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average - "vqrshrun.s16 d1, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_stride_ayuv), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -} - -// Copy row of AYUV Y's into Y. -// Similar to ARGBExtractAlphaRow_NEON -void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q2}, [%1]! \n" // store 16 Y's. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Convert UV plane of NV12 to VU of NV21. -void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values - "vld2.8 {d1, d3}, [%0]! \n" - "vmov.u8 q2, q0 \n" // move U after V - "subs %2, %2, #16 \n" // 16 pixels per loop - "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2"); -} - -void HalfMergeUVRow_NEON(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - const uint8_t* src_u_1 = src_u + src_stride_u; - const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile ( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 U values - "vld1.8 {q1}, [%2]! \n" // load 16 V values - "vld1.8 {q2}, [%1]! \n" - "vld1.8 {q3}, [%3]! \n" - "vpaddl.u8 q0, q0 \n" // half size - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q1, q3 \n" - "vqrshrn.u16 d0, q0, #2 \n" - "vqrshrn.u16 d1, q1, #2 \n" - "subs %5, %5, #16 \n" // 16 src pixels per loop - "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels - "bgt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_u_1), // %1 - "+r"(src_v), // %2 - "+r"(src_v_1), // %3 - "+r"(dst_uv), // %4 - "+r"(width) // %5 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void SplitUVRow_16_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - int shift = depth - 16; // Negative for right shift. - asm volatile ( - "vdup.16 q2, %4 \n" - "1: \n" - "vld2.16 {q0, q1}, [%0]! 
\n" // load 8 UV - "vshl.u16 q0, q0, q2 \n" - "vshl.u16 q1, q1, q2 \n" - "subs %3, %3, #8 \n" // 8 src pixels per loop - "vst1.16 {q0}, [%1]! \n" // store 8 U pixels - "vst1.16 {q1}, [%2]! \n" // store 8 V pixels - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "q0", "q1", "q2"); -} - -void MergeUVRow_16_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - int shift = 16 - depth; - asm volatile ( - "vdup.16 q2, %4 \n" - "1: \n" - "vld1.16 {q0}, [%0]! \n" // load 8 U - "vld1.16 {q1}, [%1]! \n" // load 8 V - "vshl.u16 q0, q0, q2 \n" - "vshl.u16 q1, q1, q2 \n" - "subs %3, %3, #8 \n" // 8 src pixels per loop - "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels - "bgt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "q0", "q1", "q2"); -} - -void MultiplyRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile ( - "vdup.16 q2, %3 \n" - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vmul.u16 q0, q0, q2 \n" - "vmul.u16 q1, q1, q2 \n" - "vst1.16 {q0}, [%1]! \n" - "vst1.16 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" // 16 src pixels per loop - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "cc", "memory", "q0", "q1", "q2"); -} - -void DivideRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile ( - "vdup.16 d8, %3 \n" - "1: \n" - "vld1.16 {q2, q3}, [%0]! \n" - "vmull.u16 q0, d4, d8 \n" - "vmull.u16 q1, d5, d8 \n" - "vmull.u16 q2, d6, d8 \n" - "vmull.u16 q3, d7, d8 \n" - "vshrn.u32 d0, q0, #16 \n" - "vshrn.u32 d1, q1, #16 \n" - "vshrn.u32 d2, q2, #16 \n" - "vshrn.u32 d3, q3, #16 \n" - "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels - "subs %2, %2, #16 \n" // 16 src pixels per loop - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "d8"); -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits = shr 1 -// 16384 = 10 bits = shr 2 -// 4096 = 12 bits = shr 4 -// 256 = 16 bits = shr 8 -void Convert16To8Row_NEON(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - asm volatile ( - "vdup.16 q2, %3 \n" - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vshl.u16 q0, q0, q2 \n" // shr = q2 is negative - "vshl.u16 q1, q1, q2 \n" - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d1, q1 \n" - "subs %2, %2, #16 \n" // 16 src pixels per loop - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(shift) // %3 - : "cc", "memory", "q0", "q1", "q2"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/row_neon64.c b/drivers/media/pci/tbscapture2/row_neon64.c deleted file mode 100644 index 5e332b8e76a0..000000000000 --- a/drivers/media/pci/tbscapture2/row_neon64.c +++ /dev/null @@ -1,5364 +0,0 @@ -/* - * Copyright 2014 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer -// STn over ZIP1+ST1 -// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions. - -// This module is for GCC Neon armv8 64 bit. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -// v0.8h: Y -// v1.16b: 8U, 8V - -// Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - "ldr d0, [%[src_y]], #8 \n" \ - "ldr s1, [%[src_u]], #4 \n" \ - "ldr s2, [%[src_v]], #4 \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "zip1 v1.8b, v1.8b, v1.8b \n" \ - "zip1 v2.8b, v2.8b, v2.8b \n" \ - "prfm pldl1keep, [%[src_u], 128] \n" \ - "prfm pldl1keep, [%[src_v], 128] \n" - -// Read 8 Y, 4 U and 4 V from 210 -#define READYUV210 \ - "ldr q2, [%[src_y]], #16 \n" \ - "ldr d1, [%[src_u]], #8 \n" \ - "ldr d3, [%[src_v]], #8 \n" \ - "shl v0.8h, v2.8h, #6 \n" \ - "usra v0.8h, v2.8h, #4 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "zip1 v2.8h, v3.8h, v3.8h \n" \ - "zip1 v3.8h, v1.8h, v1.8h \n" \ - "uqshrn v1.8b, v3.8h, #2 \n" \ - "uqshrn2 v1.16b, v2.8h, #2 \n" \ - "prfm pldl1keep, [%[src_u], 128] \n" \ - "prfm pldl1keep, [%[src_v], 128] \n" - -// Read 8 Y, 4 U and 4 V interleaved from 210 -#define READYUVP210 \ - "ldr q0, [%[src_y]], #16 \n" \ - "ldr q1, [%[src_uv]], #16 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "tbl v1.16b, {v1.16b}, v2.16b \n" - -// Read 8 Y, 4 U and 4 V from 212 -#define READYUV212 \ - "ldr q2, [%[src_y]], #16 \n" \ - "ldr d1, [%[src_u]], #8 \n" \ - "ldr d3, [%[src_v]], #8 \n" \ - "shl v0.8h, v2.8h, #4 \n" \ - "usra v0.8h, v2.8h, #8 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "zip1 v2.8h, v3.8h, v3.8h \n" \ - "zip1 v3.8h, v1.8h, v1.8h \n" \ - "uqshrn v1.8b, v3.8h, #4 \n" \ - "uqshrn2 v1.16b, v2.8h, #4 \n" \ - "prfm pldl1keep, [%[src_u], 128] \n" \ - "prfm pldl1keep, [%[src_v], 128] \n" - -// Read 8 Y, 8 U and 8 V from 410 -#define READYUV410 \ - "ldr q1, [%[src_y]], #16 \n" \ - "ldr q2, [%[src_u]], #16 \n" \ - "ldr q3, [%[src_v]], #16 \n" \ - "shl v0.8h, v1.8h, #6 \n" \ - "usra v0.8h, v1.8h, #4 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "uqshrn v1.8b, v2.8h, #2 \n" \ - "uqshrn2 v1.16b, v3.8h, #2 \n" \ - "prfm pldl1keep, [%[src_u], 128] \n" \ - "prfm pldl1keep, [%[src_v], 128] \n" - -// Read 8 Y, 8 U and 8 V interleaved from 410 -#define READYUVP410 \ - "ldr q0, [%[src_y]], #16 \n" \ - "ldp q4, q5, [%[src_uv]], #32 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "tbl v1.16b, {v4.16b, v5.16b}, v2.16b \n" - -// Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - "ldr d0, [%[src_y]], #8 \n" \ - "ldr d1, [%[src_u]], #8 \n" \ - "ldr d2, [%[src_v]], #8 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "prfm pldl1keep, [%[src_u], 448] \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_v], 448] \n" - -// Read 8 Y -#define READYUV400 \ - "ldr d0, [%[src_y]], #8 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" - -static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6, - 1, 1, 3, 3, 5, 5, 7, 7}; -static const uvec8 kNV12InterleavedTable = {0, 0, 4, 4, 8, 8, 12, 12, - 2, 2, 6, 6, 10, 10, 14, 14}; -static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, - 0, 0, 2, 2, 4, 4, 6, 6}; -static const uvec8 kNV21InterleavedTable = {1, 1, 
5, 5, 9, 9, 13, 13, - 3, 3, 7, 7, 11, 11, 15, 15}; - -// Read 8 Y and 4 UV from NV12 or NV21 -#define READNV12 \ - "ldr d0, [%[src_y]], #8 \n" \ - "ldr d1, [%[src_uv]], #8 \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "tbl v1.16b, {v1.16b}, v2.16b \n" \ - "prfm pldl1keep, [%[src_uv], 448] \n" - -// Read 8 YUY2 -#define READYUY2 \ - "ld1 {v3.16b}, [%[src_yuy2]], #16 \n" \ - "trn1 v0.16b, v3.16b, v3.16b \n" \ - "prfm pldl1keep, [%[src_yuy2], 448] \n" \ - "tbl v1.16b, {v3.16b}, v2.16b \n" - -// Read 8 UYVY -#define READUYVY \ - "ld1 {v3.16b}, [%[src_uyvy]], #16 \n" \ - "trn2 v0.16b, v3.16b, v3.16b \n" \ - "prfm pldl1keep, [%[src_uyvy], 448] \n" \ - "tbl v1.16b, {v3.16b}, v2.16b \n" - -// UB VR UG VG -// YG BB BG BR -#define YUVTORGB_SETUP \ - "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \ - "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n" - -// v16.8h: B -// v17.8h: G -// v18.8h: R - -// Convert from YUV (NV12 or NV21) to 2.14 fixed point RGB. -// Similar to I4XXTORGB but U/V components are in the low/high halves of v1. -#define NVTORGB \ - "umull2 v3.4s, v0.8h, v24.8h \n" \ - "umull v6.8h, v1.8b, v30.8b \n" \ - "umull v0.4s, v0.4h, v24.4h \n" \ - "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \ - "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \ - "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \ - "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \ - "add v17.8h, v0.8h, v26.8h \n" /* G */ \ - "add v16.8h, v0.8h, v4.8h \n" /* B */ \ - "add v18.8h, v0.8h, v5.8h \n" /* R */ \ - "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \ - "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \ - "uqsub v18.8h, v18.8h, v27.8h \n" /* R */ - -// Convert from YUV (I444 or I420) to 2.14 fixed point RGB. -// Similar to NVTORGB but U/V components are in v1/v2. -#define I4XXTORGB \ - "umull2 v3.4s, v0.8h, v24.8h \n" \ - "umull v6.8h, v1.8b, v30.8b \n" \ - "umull v0.4s, v0.4h, v24.4h \n" \ - "umlal v6.8h, v2.8b, v31.8b \n" /* DG */ \ - "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \ - "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \ - "umull v5.8h, v2.8b, v29.8b \n" /* DR */ \ - "add v17.8h, v0.8h, v26.8h \n" /* G */ \ - "add v16.8h, v0.8h, v4.8h \n" /* B */ \ - "add v18.8h, v0.8h, v5.8h \n" /* R */ \ - "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \ - "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \ - "uqsub v18.8h, v18.8h, v27.8h \n" /* R */ - -// Convert from YUV I400 to 2.14 fixed point RGB -#define I400TORGB \ - "umull2 v3.4s, v0.8h, v24.8h \n" \ - "umull v0.4s, v0.4h, v24.4h \n" \ - "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \ - "add v17.8h, v0.8h, v26.8h \n" /* G */ \ - "add v16.8h, v0.8h, v4.8h \n" /* B */ \ - "add v18.8h, v0.8h, v5.8h \n" /* R */ \ - "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \ - "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \ - "uqsub v18.8h, v18.8h, v27.8h \n" /* R */ - -// Convert from 2.14 fixed point RGB To 8 bit RGB -#define RGBTORGB8 \ - "uqshrn v17.8b, v17.8h, #6 \n" \ - "uqshrn v16.8b, v16.8h, #6 \n" \ - "uqshrn v18.8b, v18.8h, #6 \n" - -// Convert from 2.14 fixed point RGB to 8 bit RGB, placing the results in the -// top half of each lane. 
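/*
 * Scalar sketch of the fixed-point output stages: RGBTORGB8 above narrows a
 * 2.14 channel to 8 bits with a saturating >>6, and the STOREAR30 macro
 * below packs three 10-bit channels (a saturating >>4 of the same 2.14
 * values) with a 2-bit alpha of 3 in the top bits.  Names are illustrative;
 * the packing shown assumes the usual little-endian AR30 layout.
 */
#include <stdint.h>

static uint8_t fix214_to_u8_sketch(uint16_t v) {
  unsigned c = v >> 6;
  return (uint8_t)(c > 255 ? 255 : c);
}

static uint32_t pack_ar30_sketch(uint16_t b, uint16_t g, uint16_t r) {
  uint32_t b10 = (b >> 4) > 1023 ? 1023 : (uint32_t)(b >> 4);
  uint32_t g10 = (g >> 4) > 1023 ? 1023 : (uint32_t)(g >> 4);
  uint32_t r10 = (r >> 4) > 1023 ? 1023 : (uint32_t)(r >> 4);
  return b10 | (g10 << 10) | (r10 << 20) | (3u << 30);
}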
-#define RGBTORGB8_TOP \ - "uqshl v17.8h, v17.8h, #2 \n" \ - "uqshl v16.8h, v16.8h, #2 \n" \ - "uqshl v18.8h, v18.8h, #2 \n" - -// Store 2.14 fixed point RGB as AR30 elements -#define STOREAR30 \ - /* Inputs: \ - * v16.8h: xxbbbbbbbbbbxxxx \ - * v17.8h: xxggggggggggxxxx \ - * v18.8h: xxrrrrrrrrrrxxxx \ - * v22.8h: 0011111111110000 (umin limit) \ - * v23.8h: 1100000000000000 (alpha) \ - */ \ - "uqshl v0.8h, v16.8h, #2 \n" /* bbbbbbbbbbxxxxxx */ \ - "uqshl v1.8h, v17.8h, #2 \n" /* ggggggggggxxxxxx */ \ - "umin v6.8h, v18.8h, v22.8h \n" /* 00rrrrrrrrrrxxxx */ \ - "shl v4.8h, v1.8h, #4 \n" /* ggggggxxxxxx0000 */ \ - "orr v5.16b, v6.16b, v23.16b \n" /* 11rrrrrrrrrrxxxx */ \ - "sri v4.8h, v0.8h, #6 \n" /* ggggggbbbbbbbbbb */ \ - "sri v5.8h, v1.8h, #12 \n" /* 11rrrrrrrrrrgggg */ \ - "st2 {v4.8h, v5.8h}, [%[dst_ar30]], #32 \n" - -#define YUVTORGB_REGS \ - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \ - "v25", "v26", "v27", "v28", "v29", "v30", "v31" - -void I444ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV444 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I444ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" READYUV444 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void I210ToAR30Row_NEON(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - const uvec8* uv_coeff = &yuvconstants->kUVCoeff; - const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - uint16_t limit = 0x3ff0; - uint16_t alpha = 0xc000; - asm volatile (YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV210 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [alpha] "r"(alpha) // %[alpha] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); -} - -void I410ToAR30Row_NEON(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* dst_ar30, - const struct YuvConstants* 
yuvconstants, - int width) { - const uvec8* uv_coeff = &yuvconstants->kUVCoeff; - const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - uint16_t limit = 0x3ff0; - uint16_t alpha = 0xc000; - asm volatile (YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV410 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [alpha] "r"(alpha) // %[alpha] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); -} - -void I212ToAR30Row_NEON(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - const uvec8* uv_coeff = &yuvconstants->kUVCoeff; - const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - const uint16_t limit = 0x3ff0; - asm volatile ( - YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV212 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit) // %[limit] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); -} - -void I210ToARGBRow_NEON(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile (YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV210 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I410ToARGBRow_NEON(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile (YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV410 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I212ToARGBRow_NEON(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - const uvec8* uv_coeff = &yuvconstants->kUVCoeff; - const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - asm volatile ( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV212 NVTORGB RGBTORGB8 - "subs 
%w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I422ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -uint8_t kP210LoadShuffleIndices[] = {1, 1, 5, 5, 9, 9, 13, 13, - 3, 3, 7, 7, 11, 11, 15, 15}; - -void P210ToARGBRow_NEON(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - const uvec8* uv_coeff = &yuvconstants->kUVCoeff; - const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kIndices]] \n" - "1: \n" // - READYUVP210 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -uint8_t kP410LoadShuffleIndices[] = {1, 5, 9, 13, 17, 21, 25, 29, - 3, 7, 11, 15, 19, 23, 27, 31}; - -void P410ToARGBRow_NEON(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - const uvec8* uv_coeff = &yuvconstants->kUVCoeff; - const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kIndices]] \n" - "1: \n" // - READYUVP410 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void P210ToAR30Row_NEON(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - const uvec8* uv_coeff = &yuvconstants->kUVCoeff; - const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - const uint16_t limit = 0x3ff0; - asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - 
"1: \n" READYUVP210 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); -} - -void P410ToAR30Row_NEON(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - const uvec8* uv_coeff = &yuvconstants->kUVCoeff; - const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - uint16_t limit = 0x3ff0; - asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP410 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); -} - -void I422ToAR30Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - const uvec8* uv_coeff = &yuvconstants->kUVCoeff; - const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - const uint16_t limit = 0x3ff0; - asm volatile ( - YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV422 I4XXTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit) // %[limit] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); -} - -void I444AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 - "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I410AlphaToARGBRow_NEON(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile (YUVTORGB_SETUP - "1: \n" - "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 - "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b, v17.8b, 
v18.8b, v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I210AlphaToARGBRow_NEON(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile (YUVTORGB_SETUP - "1: \n" - "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210 - "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I422AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 - "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I422ToRGBARow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v15.8b, #255 \n" /* A */ - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v15"); -} - -void I422ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - 
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTORGB565 \ - /* Inputs: \ - * v16: bbbbbxxx \ - * v17: ggggggxx \ - * v18: rrrrrxxx */ \ - "shll v18.8h, v18.8b, #8 \n" /* rrrrrrxx00000000 */ \ - "shll v17.8h, v17.8b, #8 \n" /* gggggxxx00000000 */ \ - "shll v16.8h, v16.8b, #8 \n" /* bbbbbbxx00000000 */ \ - "sri v18.8h, v17.8h, #5 \n" /* rrrrrgggggg00000 */ \ - "sri v18.8h, v16.8h, #11 \n" /* rrrrrggggggbbbbb */ - -#define ARGBTORGB565_FROM_TOP \ - /* Inputs: \ - * v16: bbbbbxxxxxxxxxxx \ - * v17: ggggggxxxxxxxxxx \ - * v18: rrrrrxxxxxxxxxxx */ \ - "sri v18.8h, v17.8h, #5 \n" /* rrrrrgggggg00000 */ \ - "sri v18.8h, v16.8h, #11 \n" /* rrrrrggggggbbbbb */ - -void I422ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8_TOP - "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP - "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTOARGB1555 \ - /* Inputs: \ - * v16: gggggxxxbbbbbxxx v17: axxxxxxxrrrrrxxx */ \ - "shl v1.8h, v16.8h, #8 \n" /* bbbbbxxx00000000 */ \ - "shl v2.8h, v17.8h, #8 \n" /* rrrrrxxx00000000 */ \ - "sri v17.8h, v2.8h, #1 \n" /* arrrrrxxxrrrrxxx */ \ - "sri v17.8h, v16.8h, #6 \n" /* arrrrrgggggxxxbb */ \ - "sri v17.8h, v1.8h, #11 \n" /* arrrrrgggggbbbbb */ - -#define ARGBTOARGB1555_FROM_TOP \ - /* Inputs: \ - * v16: bbbbbxxxxxxxxxxx v17: gggggxxxxxxxxxxx \ - * v18: rrrrrxxxxxxxxxxx v19: axxxxxxxxxxxxxxx */ \ - "sri v19.8h, v18.8h, #1 \n" /* arrrrrxxxxxxxxxx */ \ - "sri v19.8h, v17.8h, #6 \n" /* arrrrrgggggxxxxx */ \ - "sri v19.8h, v16.8h, #11 \n" /* arrrrrgggggbbbbb */ - -void I422ToARGB1555Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile (YUVTORGB_SETUP - "movi v19.8h, #0x80, lsl #8 \n" - "1: \n" // - READYUV422 I4XXTORGB RGBTORGB8_TOP - "subs %w[width], %w[width], #8 \n" // - ARGBTOARGB1555_FROM_TOP - "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555. 
- "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -#define ARGBTOARGB4444 \ - /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A */ \ - "sri v17.8b, v16.8b, #4 \n" /* BG */ \ - "sri v19.8b, v18.8b, #4 \n" /* RA */ \ - "zip1 v0.16b, v17.16b, v19.16b \n" /* BGRA */ - -void I422ToARGB4444Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "movi v19.8b, #255 \n" ARGBTOARGB4444 - "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 - // pixels - // ARGB4444. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I400ToARGBRow_NEON(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v1.16b, #128 \n" - "movi v19.8b, #255 \n" - "umull v6.8h, v1.8b, v30.8b \n" - "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ - "umull v4.8h, v1.8b, v28.8b \n" /* DB */ - "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ - "1: \n" READYUV400 I400TORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -#if defined(LIBYUV_USE_ST4) -void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( - "movi v23.8b, #255 \n" - "1: \n" - "ld1 {v20.8b}, [%0], #8 \n" - "prfm pldl1keep, [%0, 448] \n" - "mov v21.8b, v20.8b \n" - "mov v22.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v20", "v21", "v22", "v23"); -} -#else -void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( - "movi v20.8b, #255 \n" - "1: \n" - "ldr d16, [%0], #8 \n" - "subs %w2, %w2, #8 \n" - "zip1 v18.16b, v16.16b, v16.16b \n" // YY - "zip1 v19.16b, v16.16b, v20.16b \n" // YA - "prfm pldl1keep, [%0, 448] \n" - "zip1 v16.16b, v18.16b, v19.16b \n" // YYYA - "zip2 v17.16b, v18.16b, v19.16b \n" - "stp q16, q17, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v16", "v17", "v18", "v19", "v20"); -} -#endif // LIBYUV_USE_ST4 - -void NV12ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr 
q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -void NV21ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_vu), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV21Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -void NV12ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2"); -} - -void NV21ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_vu), // %[src_uv] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV21Table) - : "cc", "memory", YUVTORGB_REGS, "v2"); -} - -void NV12ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB - RGBTORGB8_TOP - "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP - "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 - // pixels - // RGB565. 
- "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2"); -} - -void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV21InterleavedTable]] \n" - "1: \n" READYUY2 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV21InterleavedTable] "r"(&kNV21InterleavedTable) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12InterleavedTable]] \n" - "1: \n" READUYVY NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12InterleavedTable] "r"(&kNV12InterleavedTable) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %w3, %w3, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%1], #16 \n" // store U - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// Reads 16 byte Y's from tile and writes out 16 Y's. -// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes -// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes -// width measured in bytes so 8 UV = 16. -void DetileRow_NEON(const uint8_t* src, - ptrdiff_t src_tile_stride, - uint8_t* dst, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 1792] \n" // 7 tiles of 256b ahead - "st1 {v0.16b}, [%1], #16 \n" // store 16 bytes - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(src_tile_stride) // %3 - : "cc", "memory", "v0" // Clobber List - ); -} - -// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. 
-void DetileRow_16_NEON(const uint16_t* src, - ptrdiff_t src_tile_stride, - uint16_t* dst, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead - "st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(src_tile_stride * 2) // %3 - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. -void DetileSplitUVRow_NEON(const uint8_t* src_uv, - ptrdiff_t src_tile_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "1: \n" - "ld2 {v0.8b,v1.8b}, [%0], %4 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%0, 1792] \n" - "st1 {v0.8b}, [%1], #8 \n" - "st1 {v1.8b}, [%2], #8 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(src_tile_stride) // %4 - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -#if defined(LIBYUV_USE_ST2) -// Read 16 Y, 8 UV, and write 8 YUY2 -void DetileToYUY2_NEON(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys - "prfm pldl1keep, [%0, 1792] \n" - "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs - "prfm pldl1keep, [%1, 1792] \n" - "subs %w3, %w3, #16 \n" // store 8 YUY2 - "st2 {v0.16b,v1.16b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_yuy2), // %2 - "+r"(width) // %3 - : "r"(src_y_tile_stride), // %4 - "r"(src_uv_tile_stride) // %5 - : "cc", "memory", "v0", "v1" // Clobber list - ); -} -#else -// Read 16 Y, 8 UV, and write 8 YUY2 -void DetileToYUY2_NEON(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys - "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%0, 1792] \n" - "zip1 v2.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 1792] \n" - "zip2 v3.16b, v0.16b, v1.16b \n" - "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 YUY2 - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_yuy2), // %2 - "+r"(width) // %3 - : "r"(src_y_tile_stride), // %4 - "r"(src_uv_tile_stride) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list - ); -} -#endif - -// Unpack MT2T into tiled P010 64 pixels at a time. See -// tinyurl.com/mtk-10bit-video-format for format documentation. 
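As I read the deleted UnpackMT2T_NEON, each 80-byte MT2T group carries 16 bytes of packed 2-bit low fields followed by 64 high bytes for 64 pixels, and the output is P010-style (10 bits at the top of each 16-bit sample, top bits replicated into the low 6). A scalar sketch under that reading, not the upstream C fallback:

#include <stddef.h>
#include <stdint.h>

static void unpack_mt2t_sketch(const uint8_t* src, uint16_t* dst, size_t size) {
  while (size >= 80) {
    const uint8_t* lo = src;       /* 16 bytes: 2-bit LSBs, 4 pixels per byte */
    const uint8_t* hi = src + 16;  /* 64 bytes: 8-bit MSBs, one per pixel     */
    int p;
    for (p = 0; p < 64; ++p) {
      unsigned low2 = (lo[p % 16] >> (2 * (p / 16))) & 3;
      uint16_t v = (uint16_t)((hi[p] << 8) | (low2 << 6)); /* 10 bits at 15:6 */
      dst[p] = (uint16_t)(v | (v >> 10));                  /* like "sri #10"  */
    }
    src += 80;
    dst += 64;
    size -= 80;
  }
}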
-void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile ( - "1: \n" - "ld1 {v7.16b}, [%0], #16 \n" - "ld1 {v0.16b-v3.16b}, [%0], #64 \n" - "shl v4.16b, v7.16b, #6 \n" - "shl v5.16b, v7.16b, #4 \n" - "shl v6.16b, v7.16b, #2 \n" - "subs %2, %2, #80 \n" - "zip1 v16.16b, v4.16b, v0.16b \n" - "zip1 v18.16b, v5.16b, v1.16b \n" - "zip1 v20.16b, v6.16b, v2.16b \n" - "zip1 v22.16b, v7.16b, v3.16b \n" - "zip2 v17.16b, v4.16b, v0.16b \n" - "zip2 v19.16b, v5.16b, v1.16b \n" - "zip2 v21.16b, v6.16b, v2.16b \n" - "zip2 v23.16b, v7.16b, v3.16b \n" - "sri v16.8h, v16.8h, #10 \n" - "sri v17.8h, v17.8h, #10 \n" - "sri v18.8h, v18.8h, #10 \n" - "sri v19.8h, v19.8h, #10 \n" - "st1 {v16.8h-v19.8h}, [%1], #64 \n" - "sri v20.8h, v20.8h, #10 \n" - "sri v21.8h, v21.8h, #10 \n" - "sri v22.8h, v22.8h, #10 \n" - "sri v23.8h, v23.8h, #10 \n" - "st1 {v20.8h-v23.8h}, [%1], #64 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(size) // %2 - : - : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); -} - -#if defined(LIBYUV_USE_ST2) -// Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load U - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void MergeUVRow_16_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - int shift = 16 - depth; - asm volatile ( - "dup v2.8h, %w4 \n" - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "subs %w3, %w3, #8 \n" // 8 src pixels per loop - "ld1 {v1.8h}, [%1], #16 \n" // load 8 V - "ushl v0.8h, v0.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v2.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "v0", "v1", "v2"); -} -#else -// Reads 16 U's and V's and writes out 16 pairs of UV. 
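Both MergeUVRow_NEON variants (the st2 path above and the zip path below) are a plain interleave of the U and V planes into NV12-style packed UV; the MergeUVRow_16_NEON versions additionally shift each sample left by 16 - depth first. A scalar sketch with my own naming:

#include <stdint.h>

static void merge_uv_row_sketch(const uint8_t* src_u, const uint8_t* src_v,
                                uint8_t* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];  /* U first ...            */
    dst_uv[2 * x + 1] = src_v[x];  /* ... then V, per sample */
  }
}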
-void MergeUVRow_NEON(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load U - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - "zip1 v2.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "zip2 v3.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void MergeUVRow_16_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - int shift = 16 - depth; - asm volatile ( - "dup v4.8h, %w4 \n" - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "subs %w3, %w3, #8 \n" // 8 src pixels per loop - "ld1 {v1.8h}, [%1], #16 \n" // load 8 V - "ushl v0.8h, v0.8h, v4.8h \n" - "ushl v1.8h, v1.8h, v4.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "zip1 v2.8h, v0.8h, v1.8h \n" - "zip2 v3.8h, v0.8h, v1.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4"); -} -#endif // LIBYUV_USE_ST2 - -// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. -void SplitRGBRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile ( - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%1], #16 \n" // store R - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%3], #16 \n" // store B - "b.gt 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time -void MergeRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v2.16b}, [%2], #16 \n" // load B - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. 
-void SplitARGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile ( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "subs %w5, %w5, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%3], #16 \n" // store B - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%1], #16 \n" // store R - "st1 {v3.16b}, [%4], #16 \n" // store A - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -#if defined(LIBYUV_USE_ST4) -// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time -void MergeARGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%2], #16 \n" // load B - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v2.16b}, [%0], #16 \n" // load R - "ld1 {v3.16b}, [%3], #16 \n" // load A - "subs %w5, %w5, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "prfm pldl1keep, [%3, 448] \n" - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} -#else -// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time -void MergeARGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%2], #16 \n" // load B - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v2.16b}, [%0], #16 \n" // load R - "ld1 {v3.16b}, [%3], #16 \n" // load A - "subs %w5, %w5, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%2, 448] \n" - "zip1 v4.16b, v0.16b, v1.16b \n" // BG - "zip1 v5.16b, v2.16b, v3.16b \n" // RA - "prfm pldl1keep, [%1, 448] \n" - "zip2 v6.16b, v0.16b, v1.16b \n" // BG - "zip2 v7.16b, v2.16b, v3.16b \n" // RA - "prfm pldl1keep, [%0, 448] \n" - "zip1 v0.8h, v4.8h, v5.8h \n" // BGRA - "zip2 v1.8h, v4.8h, v5.8h \n" - "prfm pldl1keep, [%3, 448] \n" - "zip1 v2.8h, v6.8h, v7.8h \n" - "zip2 v3.8h, v6.8h, v7.8h \n" - "st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7" // Clobber List - ); -} -#endif // LIBYUV_USE_ST4 - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. 
-void SplitXRGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile ( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%3], #16 \n" // store B - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%1], #16 \n" // store R - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time -void MergeXRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile ( - "movi v3.16b, #255 \n" // load A(255) - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v0.16b}, [%2], #16 \n" // load B - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void MergeXR30Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - int shift = 10 - depth; - asm volatile ( - "movi v30.16b, #255 \n" - "ushr v30.4s, v30.4s, #22 \n" // 1023 - "dup v31.4s, %w5 \n" - "1: \n" - "ldr d2, [%2], #8 \n" // B - "ldr d1, [%1], #8 \n" // G - "ldr d0, [%0], #8 \n" // R - "ushll v2.4s, v2.4h, #0 \n" // B - "ushll v1.4s, v1.4h, #0 \n" // G - "ushll v0.4s, v0.4h, #0 \n" // R - "ushl v2.4s, v2.4s, v31.4s \n" // 000B - "ushl v1.4s, v1.4s, v31.4s \n" // G - "ushl v0.4s, v0.4s, v31.4s \n" // R - "umin v2.4s, v2.4s, v30.4s \n" - "umin v1.4s, v1.4s, v30.4s \n" - "umin v0.4s, v0.4s, v30.4s \n" - "sli v2.4s, v1.4s, #10 \n" // 00GB - "sli v2.4s, v0.4s, #20 \n" // 0RGB - "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) - "subs %w4, %w4, #4 \n" - "str q2, [%3], #16 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "v0", "v1", "v2", "v30", "v31"); -} - -void MergeXR30Row_10_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int /* depth */, - int width) { - // Neon has no "shift left and accumulate/orr", so use a multiply-add to - // perform the shift instead. 
- int limit = 1023; - asm volatile ( - "dup v5.8h, %w[limit] \n" - "movi v6.8h, #16 \n" // 1 << 4 - "movi v7.8h, #4, lsl #8 \n" // 1 << 10 - "1: \n" - "ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr - "ldr q1, [%1], #16 \n" // xxxxxxGggggggggg - "ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb - "umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr - "umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg - "movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000 - "umin v3.8h, v2.8h, v5.8h \n" // 000000Bbbbbbbbbb - "mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000 - "mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb - "usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg - "subs %w4, %w4, #8 \n" - "st2 {v3.8h, v4.8h}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : [limit] "r"(limit) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -void MergeAR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile ( - - "dup v30.8h, %w7 \n" - "dup v31.8h, %w6 \n" - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "ldr q3, [%3], #16 \n" // A - "umin v2.8h, v2.8h, v30.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "umin v1.8h, v1.8h, v30.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "umin v0.8h, v0.8h, v30.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "umin v3.8h, v3.8h, v30.8h \n" - "prfm pldl1keep, [%3, 448] \n" - "ushl v2.8h, v2.8h, v31.8h \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "ushl v3.8h, v3.8h, v31.8h \n" - "subs %w5, %w5, #8 \n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_ar64), // %4 - "+r"(width) // %5 - : "r"(shift), // %6 - "r"(mask) // %7 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -void MergeXR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile ( - - "movi v3.16b, #0xff \n" // A (0xffff) - "dup v30.8h, %w6 \n" - "dup v31.8h, %w5 \n" - - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "umin v2.8h, v2.8h, v30.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "umin v1.8h, v1.8h, v30.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "umin v0.8h, v0.8h, v30.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "ushl v2.8h, v2.8h, v31.8h \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "subs %w4, %w4, #8 \n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar64), // %3 - "+r"(width) // %4 - : "r"(shift), // %5 - "r"(mask) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -void MergeARGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - // Shift is 8 - depth, +8 so the result is in the top half of each lane. 
- int shift = 16 - depth; - asm volatile ( - "dup v31.8h, %w6 \n" - "1: \n" - "ldr q0, [%0], #16 \n" // B - "ldr q1, [%1], #16 \n" // G - "ldr q2, [%2], #16 \n" // R - "ldr q3, [%3], #16 \n" // A - "uqshl v0.8h, v0.8h, v31.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "uqshl v1.8h, v1.8h, v31.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "uqshl v2.8h, v2.8h, v31.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "uqshl v3.8h, v3.8h, v31.8h \n" - "prfm pldl1keep, [%3, 448] \n" - "trn2 v0.16b, v0.16b, v1.16b \n" - "trn2 v1.16b, v2.16b, v3.16b \n" - "subs %w5, %w5, #8 \n" - "st2 {v0.8h, v1.8h}, [%4], #32 \n" - "b.gt 1b \n" - : "+r"(src_b), // %0 - "+r"(src_g), // %1 - "+r"(src_r), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : "r"(shift) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -void MergeXRGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - // Shift is 8 - depth, +8 so the result is in the top half of each lane. - int shift = 16 - depth; - asm volatile ( - "dup v31.8h, %w5 \n" - "movi v3.16b, #0xff \n" // A (0xff) - "1: \n" - "ldr q0, [%0], #16 \n" // B - "ldr q1, [%1], #16 \n" // G - "ldr q2, [%2], #16 \n" // R - "uqshl v0.8h, v0.8h, v31.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "uqshl v1.8h, v1.8h, v31.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "uqshl v2.8h, v2.8h, v31.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "trn2 v0.16b, v0.16b, v1.16b \n" - "trn2 v1.16b, v2.16b, v3.16b \n" - "subs %w4, %w4, #8 \n" - "st2 {v0.8h, v1.8h}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_b), // %0 - "+r"(src_g), // %1 - "+r"(src_r), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -// Copy multiple of 32. -void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "1: \n" - "ldp q0, q1, [%0], #32 \n" - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #32 \n" // 32 processed per loop - "stp q0, q1, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// SetRow writes 'width' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile ( - "dup v0.16b, %w2 \n" // duplicate 16 bytes - "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v8) // %2 - : "cc", "memory", "v0"); -} - -void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile ( - "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v32) // %2 - : "cc", "memory", "v0"); -} - -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - // Start at end of source row. - "ld1 {v3.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q2, [%0, 16] \n" - "ldr q1, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #32 \n" // 32 pixels per loop. 
- "tbl v0.16b, {v2.16b}, v3.16b \n" - "tbl v1.16b, {v1.16b}, v3.16b \n" - "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(&kShuffleMirror) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Shuffle table for reversing the UV. -static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, - 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; - -void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile ( - // Start at end of source row. - "ld1 {v4.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw #1 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"(&kShuffleMirrorUV) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -void MirrorSplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - // Start at end of source row. - "ld1 {v4.16b}, [%4] \n" // shuffler - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w3, %w3, #16 \n" // 16 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "uzp1 v0.16b, v2.16b, v3.16b \n" // U - "uzp2 v1.16b, v2.16b, v3.16b \n" // V - "st1 {v0.16b}, [%1], #16 \n" // dst += 16 - "st1 {v1.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(&kShuffleMirrorUV) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -// Shuffle table for reversing the ARGB. -static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, - 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; - -void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( - // Start at end of source row. - "ld1 {v4.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #8 \n" // 8 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(&kShuffleMirrorARGB) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - asm volatile ( - "ld1 {v3.16b}, [%4] \n" // shuffler - "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #48 \n" - - "1: \n" - "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
- "tbl v0.16b, {v0.16b}, v3.16b \n" - "tbl v1.16b, {v1.16b}, v3.16b \n" - "tbl v2.16b, {v2.16b}, v3.16b \n" - "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-48), // %3 - "r"(&kShuffleMirror) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile ( - "movi v4.8b, #255 \n" // Alpha - "1: \n" - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of - // RGB24. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile ( - "movi v5.8b, #255 \n" // Alpha - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "mov v3.8b, v1.8b \n" // move g - "prfm pldl1keep, [%0, 448] \n" - "mov v4.8b, v0.8b \n" // move r - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile ( - "movi v0.8b, #255 \n" // Alpha - "1: \n" - "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "mov v2.8b, v4.8b \n" // move g - "prfm pldl1keep, [%0, 448] \n" - "mov v1.8b, v5.8b \n" // move r - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile ( - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "mov v3.8b, v1.8b \n" // move g - "prfm pldl1keep, [%0, 448] \n" - "mov v4.8b, v0.8b \n" // move r - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -#define RGB565TOARGB \ - /* Input: v0/v4.8h: RRRRRGGGGGGBBBBB */ \ - "shrn v1.8b, v0.8h, #3 \n" /* G GGGGGGxx */ \ - "shrn2 v1.16b, v4.8h, #3 \n" /* G GGGGGGxx */ \ - "uzp2 v2.16b, v0.16b, v4.16b \n" /* R RRRRRxxx */ \ - "uzp1 v0.16b, v0.16b, v4.16b \n" /* B xxxBBBBB */ \ - "sri v1.16b, v1.16b, #6 \n" /* G GGGGGGGG, fill 2 */ \ - "shl v0.16b, v0.16b, #3 \n" /* B BBBBB000 */ \ - "sri v2.16b, v2.16b, #5 \n" /* R RRRRRRRR, fill 3 */ \ - "sri v0.16b, v0.16b, #5 \n" /* R BBBBBBBB, fill 3 */ - -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.16b, #255 \n" // Alpha - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB - "add %1, %1, #64 \n" - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); -} - -#define ARGB1555TOARGB \ - /* Input: ARRRRRGGGGGBBBBB */ \ - "shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \ - "uzp1 v29.16b, v0.16b, v4.16b \n" /* xxxBBBBB */ \ - "shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \ - "uzp2 v3.16b, v0.16b, v4.16b \n" /* Axxxxxxx */ \ - "shrn2 v2.16b, v4.8h, #7 \n" /* RRRRRxxx */ \ - "shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \ - "shrn2 v1.16b, v4.8h, #2 \n" /* GGGGGxxx */ \ - "sshr v3.16b, v3.16b, #7 \n" /* AAAAAAAA */ \ - "sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \ - "sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */ \ - "sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */ - -// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 
-#define RGB555TOARGB \ - /* Input: xRRRRRGGGGGBBBBB */ \ - "uzp1 v29.16b, v0.16b, v3.16b \n" /* xxxBBBBB */ \ - "shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \ - "shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \ - "shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \ - "shrn2 v2.16b, v3.8h, #7 \n" /* RRRRRxxx */ \ - "shrn2 v1.16b, v3.8h, #2 \n" /* GGGGGxxx */ \ - \ - "sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */ \ - "sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \ - "sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */ - -void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - ARGB1555TOARGB - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB - "add %1, %1, #64 \n" - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v29" // Clobber List - ); -} - -#define ARGB4444TOARGB \ - /* Input: v1.8h = AAAARRRR_GGGGBBBB */ \ - "shl v0.16b, v1.16b, #4 \n" /* RRRR0000_BBBB0000 */ \ - "sri v1.16b, v1.16b, #4 \n" /* AAAAAAAA_GGGGGGGG */ \ - "sri v0.16b, v0.16b, #4 \n" /* RRRRRRRR_BBBBBBBB */ - -#define ARGB4444TORGB \ - /* Input: v0.8h = xxxxRRRRGGGGBBBB */ \ - "uzp1 v1.16b, v0.16b, v3.16b \n" /* GGGGBBBB */ \ - "shrn v2.8b, v0.8h, #4 \n" /* RRRRxxxx */ \ - "shl v0.16b, v1.16b, #4 \n" /* BBBB0000 */ \ - "shrn2 v2.16b, v3.8h, #4 \n" /* RRRRxxxx */ \ - "sri v1.16b, v1.16b, #4 \n" /* GGGGGGGG */ \ - "sri v2.16b, v2.16b, #4 \n" /* RRRRRRRR */ \ - "sri v0.16b, v0.16b, #4 \n" /* BBBBBBBB */ - -void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - asm volatile ( - "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB - "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 8 ARGB. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -static const int16_t kAR30Row_BoxShifts[] = {0, -6, 0, -6, 0, -6, 0, -6}; - -static const uint8_t kABGRToAR30Row_BoxIndices[] = { - 2, 2, 1, 1, 6, 6, 5, 5, 10, 10, 9, 9, 14, 14, 13, 13, - 0, 0, 3, 3, 4, 4, 7, 7, 8, 8, 11, 11, 12, 12, 15, 15}; -static const uint8_t kARGBToAR30Row_BoxIndices[] = { - 0, 0, 1, 1, 4, 4, 5, 5, 8, 8, 9, 9, 12, 12, 13, 13, - 2, 2, 3, 3, 6, 6, 7, 7, 10, 10, 11, 11, 14, 14, 15, 15}; - -// ARGB or ABGR as input, reordering based on TBL indices parameter. 
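Read as scalar code, the tbl/ushl/bif sequence in ABCDToAR30Row_NEON below widens each 8-bit channel to 10 bits by bit replication and packs A:R:G:B as 2:10:10:10 in a little-endian word, matching the STOREAR30 layout used earlier in this file. A hedged sketch (function name is mine):

#include <stdint.h>

static uint32_t pack_ar30_sketch(uint8_t b, uint8_t g, uint8_t r, uint8_t a) {
  uint32_t b10 = ((uint32_t)b << 2) | (b >> 6);  /* 8 -> 10 bits, replicated */
  uint32_t g10 = ((uint32_t)g << 2) | (g >> 6);
  uint32_t r10 = ((uint32_t)r << 2) | (r >> 6);
  uint32_t a2 = (uint32_t)a >> 6;                /* keep top 2 alpha bits    */
  return (a2 << 30) | (r10 << 20) | (g10 << 10) | b10;  /* B in bits 9:0     */
}

For the ABGR entry point, only which source bytes feed which fields changes; that is what the two index tables above select.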
-static void ABCDToAR30Row_NEON(const uint8_t* src_abcd, - uint8_t* dst_ar30, - int width, - const uint8_t* indices) { - asm volatile ( - "movi v2.4s, #0xf, msl 16 \n" // 0xfffff - "ldr q3, [%[kAR30Row_BoxShifts]] \n" - "ldp q4, q5, [%[indices]] \n" - "1: \n" - "ldp q0, q20, [%[src]], #32 \n" - "subs %w[width], %w[width], #8 \n" - "tbl v1.16b, {v0.16b}, v5.16b \n" - "tbl v21.16b, {v20.16b}, v5.16b \n" - "tbl v0.16b, {v0.16b}, v4.16b \n" - "tbl v20.16b, {v20.16b}, v4.16b \n" - "ushl v0.8h, v0.8h, v3.8h \n" - "ushl v20.8h, v20.8h, v3.8h \n" - "ushl v1.8h, v1.8h, v3.8h \n" - "ushl v21.8h, v21.8h, v3.8h \n" - "ushr v0.4s, v0.4s, #6 \n" - "ushr v20.4s, v20.4s, #6 \n" - "shl v1.4s, v1.4s, #14 \n" - "shl v21.4s, v21.4s, #14 \n" - "bif v0.16b, v1.16b, v2.16b \n" - "bif v20.16b, v21.16b, v2.16b \n" - "stp q0, q20, [%[dst]], #32 \n" - "b.gt 1b \n" - : [src] "+r"(src_abcd), // %[src] - [dst] "+r"(dst_ar30), // %[dst] - [width] "+r"(width) // %[width] - : [kAR30Row_BoxShifts] "r"(kAR30Row_BoxShifts), // %[kAR30Row_BoxShifts] - [indices] "r"(indices) // %[indices] - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20", "v21"); -} - -void ABGRToAR30Row_NEON(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { - ABCDToAR30Row_NEON(src_abgr, dst_ar30, width, kABGRToAR30Row_BoxIndices); -} - -void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { - ABCDToAR30Row_NEON(src_argb, dst_ar30, width, kARGBToAR30Row_BoxIndices); -} - -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width) { - asm volatile ( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "prfm pldl1keep, [%0, 448] \n" - "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" // store 8 RGB24 - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile ( - "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "mov v4.8b, v2.8b \n" // mov g - "prfm pldl1keep, [%0, 448] \n" - "mov v5.8b, v1.8b \n" // mov b - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 
- "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "prfm pldl1keep, [%0, 448] \n" - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(src_yuy2b), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile ( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "prfm pldl1keep, [%0, 448] \n" - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(src_uyvyb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7" // Clobber List - ); -} - -void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_uv, - int width) { - const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row - "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV - "prfm pldl1keep, [%0, 448] \n" - "st1 {v4.16b}, [%2], #16 \n" // store 8 UV. 
- "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(src_yuy2b), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile ( - "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "subs %w2, %w2, #4 \n" // 4 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile ( - "1: \n" - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "subs %w4, %w4, #16 \n" // 16 pixels - "mov v2.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void I422ToUYVYRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile ( - "1: \n" - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "mov v3.8b, v2.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb565, - int width) { - asm volatile ( - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 - // pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 - "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v16", "v17", "v18", "v19"); -} - -void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - asm volatile ( - "dup v1.4s, %w3 \n" // dither4 - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uqadd v16.8b, v16.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uqadd v17.8b, v17.8b, v1.8b \n" - "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 - "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb), // %1 - "+r"(width) // %2 - : "r"(dither4) // %3 - : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); -} - -void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb1555, - int width) { - asm volatile( - "1: \n" - "ld2 {v16.8h,v17.8h}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
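/*
 * All of the YUY2/UYVY kernels above assume the same packed 4:2:2 byte
 * order: YUY2 = Y0 U Y1 V (luma in even bytes), UYVY = U Y0 V Y1 (luma in
 * odd bytes), with one U/V pair shared by two horizontal pixels.  A scalar
 * sketch of the 422 chroma extraction (function name illustrative only;
 * pairs == width / 2):
 */
static void YUY2ToUV422_C_Sketch(const uint8_t* src_yuy2,
                                 uint8_t* dst_u, uint8_t* dst_v, int pairs) {
  int i;
  for (i = 0; i < pairs; ++i) {
    dst_u[i] = src_yuy2[4 * i + 1];  /* U shared by pixels 2i and 2i+1 */
    dst_v[i] = src_yuy2[4 * i + 3];  /* V shared by pixels 2i and 2i+1 */
  }
}
/* For UYVY the offsets are 0 and 2; the two-row variants additionally
 * average the chroma of vertically adjacent lines (urhadd). */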
- "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 - "st1 {v17.16b}, [%1], #16 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v16", "v17"); -} - -void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb4444, - int width) { - asm volatile ( - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 - // pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19"); -} - -#if defined(LIBYUV_USE_ST2) -void ARGBToAR64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile ( - "1: \n" - "ldp q0, q2, [%0], #32 \n" // load 8 pixels - "mov v1.16b, v0.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "mov v3.16b, v2.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels - "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; - -void ARGBToAB64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile ( - "ldr q4, [%3] \n" // shuffler - "1: \n" - "ldp q0, q2, [%0], #32 \n" // load 8 pixels - "tbl v0.16b, {v0.16b}, v4.16b \n" - "tbl v2.16b, {v2.16b}, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "mov v1.16b, v0.16b \n" - "mov v3.16b, v2.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels - "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "r"(&kShuffleARGBToABGR) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} -#else -void ARGBToAR64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile ( - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "zip1 v2.16b, v0.16b, v0.16b \n" - "zip2 v3.16b, v0.16b, v0.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "zip1 v4.16b, v1.16b, v1.16b \n" - "zip2 v5.16b, v1.16b, v1.16b \n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64 - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); -} - -static const uvec8 kShuffleARGBToAB64[2] = { - {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7}, - {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}}; - -void ARGBToAB64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile ( - "ldp q6, q7, [%3] \n" // 2 shufflers - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64 - "tbl v3.16b, {v0.16b}, v7.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "tbl v4.16b, {v1.16b}, v6.16b \n" - "tbl v5.16b, {v1.16b}, v7.16b \n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64 - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "r"(&kShuffleARGBToAB64[0]) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} -#endif // LIBYUV_USE_ST2 - -static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, - 17, 19, 21, 23, 25, 27, 29, 31}; - -void AR64ToARGBRow_NEON(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile ( - "ldr q4, [%3] \n" // shuffler - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 4 pixels - "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "stp q0, q2, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(&kShuffleAR64ToARGB) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, - 21, 19, 17, 23, 29, 27, 25, 31}; - -void AB64ToARGBRow_NEON(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile ( - "ldr q4, [%3] \n" // shuffler - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 4 pixels - "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "stp q0, q2, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(&kShuffleAB64ToARGB) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile ( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -struct RgbUVConstantsU8 { - uint8_t kRGBToU[4]; - uint8_t kRGBToV[4]; -}; - -struct RgbUVConstantsI8 { - int8_t kRGBToU[4]; - int8_t kRGBToV[4]; -}; - -// 8x1 pixels. -static void ARGBToUV444MatrixRow_NEON( - const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct RgbUVConstantsU8* rgbuvconstants) { - asm volatile( - "ldr d0, [%4] \n" // load rgbuvconstants - "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient - "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient - "dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient - "dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient - "dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient - "movi v29.16b, #0x80 \n" // 128.5 - - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "umull v4.8h, v0.8b, v24.8b \n" // B - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "prfm pldl1keep, [%0, 448] \n" - - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - - "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned - "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned - - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(rgbuvconstants) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", - "v27", "v28", "v29"); -} - -static void ARGBToUV444MatrixRow_NEON_I8MM( - const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct RgbUVConstantsI8* rgbuvconstants) { - asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" - "movi v29.16b, #0x80 \n" // 128.5 - "1: \n" - "ldp q0, q1, [%[src]], #32 \n" - "movi v2.4s, #0 \n" - "movi v3.4s, #0 \n" - "movi v4.4s, #0 \n" - "movi v5.4s, #0 \n" - "usdot v2.4s, v0.16b, v16.16b \n" - "usdot v3.4s, v1.16b, v16.16b \n" - "usdot v4.4s, v0.16b, v17.16b \n" - "usdot v5.4s, v1.16b, v17.16b \n" - "prfm pldl1keep, [%[src], 448] \n" - "subs %w[width], %w[width], #8 \n" // 8 processed per loop. - "uzp1 v0.8h, v2.8h, v3.8h \n" - "uzp1 v1.8h, v4.8h, v5.8h \n" - "addhn v0.8b, v0.8h, v29.8h \n" // +128 -> unsigned - "addhn v1.8b, v1.8h, v29.8h \n" // +128 -> unsigned - "str d0, [%[dst_u]], #8 \n" // store 8 pixels U. - "str d1, [%[dst_v]], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : [src] "+r"(src_argb), // %[src] - [dst_u] "+r"(dst_u), // %[dst_u] - [dst_v] "+r"(dst_v), // %[dst_v] - [width] "+r"(width) // %[width] - : [rgbuvconstants] "r"(rgbuvconstants) // %[rgbuvconstants] - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", - "v29"); -} - -// RGB to bt601 coefficients -// UB 0.875 coefficient = 112 -// UG -0.5781 coefficient = 74 -// UR -0.2969 coefficient = 38 -// VB -0.1406 coefficient = 18 -// VG -0.7344 coefficient = 94 -// VR 0.875 coefficient = 112 (ignored) - -static const struct RgbUVConstantsU8 kRgb24I601UVConstantsU8 = { - {112, 74, 38, 0}, - {18, 94, 112, 0}}; -static const struct RgbUVConstantsI8 kRgb24I601UVConstantsI8 = { - {112, -74, -38, 0}, - {-18, -94, 112, 0}}; - -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kRgb24I601UVConstantsU8); -} - -void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width, - &kRgb24I601UVConstantsI8); -} - -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
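/*
 * The full-resolution (444) chroma path above is, per pixel, the BT.601
 * matrix with the 8-bit coefficients listed above:
 *   U = (112*B - 74*G - 38*R + 0x8080) >> 8
 *   V = (112*R - 94*G - 18*B + 0x8080) >> 8
 * (addhn against v29 = 0x8080 performs the +bias and >>8 in one step.)
 * Scalar sketch for one ARGB pixel; the function name is illustrative.
 */
static void ARGBPixelToUV444_C(const uint8_t* argb, uint8_t* u, uint8_t* v) {
  int b = argb[0], g = argb[1], r = argb[2];
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}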
-// clang-format off -#define RGBTOUV(QB, QG, QR) \ - "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ - "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ - "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ - "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ - "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ - "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ - "addhn v0.8b, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ - "addhn v1.8b, v4.8h, v25.8h \n" /* +128 -> unsigned */ -// clang-format on - -// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -// TODO(fbarchard): consider ptrdiff_t for all strides. - -void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb_1 = src_argb + src_stride_argb; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -// TODO(fbarchard): Subsample match Intel code. -void ARGBToUVJRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb_1 = src_argb + src_stride_argb; - asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
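/*
 * The subsampled (16x2 -> 8x1) UV kernels built on this macro all share
 * the same structure: uaddlp/uadalp sum each channel over a 2x2 block,
 * urshr #1 keeps twice the block average, and RGBTOUV applies the halved
 * coefficients from RGBTOUV_SETUP_REG (56/37/19 for U, 9/47/56 for V) so
 * the result matches the full-resolution matrix.  Scalar sketch for one
 * 2x2 ARGB block (two rows, two pixels each); the name is illustrative.
 */
static void ARGBBlock2x2ToUV_C(const uint8_t* row0, const uint8_t* row1,
                               uint8_t* u, uint8_t* v) {
  /* Twice the rounded average of each channel over the block. */
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 1) >> 1;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 1) >> 1;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 1) >> 1;
  /* Halved coefficients compensate for the doubled averages. */
  *u = (uint8_t)((56 * b - 37 * g - 19 * r + 0x8080) >> 8);
  *v = (uint8_t)((56 * r - 47 * g - 9 * b + 0x8080) >> 8);
}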
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void ABGRToUVJRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_uj, - uint8_t* dst_vj, - int width) { - const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; - asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v2.8h, v1.8h, v0.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(src_abgr_1), // %1 - "+r"(dst_uj), // %2 - "+r"(dst_vj), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; - asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
- "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(src_rgb24_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RAWToUVJRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_raw_1 = src_raw + src_stride_raw; - asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v2.8h, v1.8h, v0.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(src_raw_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void BGRAToUVRow_NEON(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v3.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(src_bgra_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void ABGRToUVRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 
- "prfm pldl1keep, [%0, 448] \n" - "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v3.8h, #1 \n" // 2x average - "urshr v2.8h, v2.8h, #1 \n" - "urshr v1.8h, v1.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v2.8h, v1.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(src_abgr_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RGBAToUVRow_NEON(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(src_rgba_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
- "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(src_rgb24_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RAWToUVRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_raw_1 = src_raw + src_stride_raw; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. - "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v2.8h, v2.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v0.8h, v0.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v2.8h, v1.8h, v0.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(src_raw_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; - asm volatile( - RGBTOUV_SETUP_REG - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels. - RGB565TOARGB - "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "ldp q0, q4, [%1], #32 \n" // load 16 RGB565 pixels. - RGB565TOARGB - "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_rgb565_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; - asm volatile( - RGBTOUV_SETUP_REG - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 
- "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "ldp q0, q3, [%1], #32 \n" // load 16 ARGB1555 pixels. - RGB555TOARGB - "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_argb1555_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28", "v29"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; - asm volatile( - RGBTOUV_SETUP_REG // sets v20-v25 - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. - ARGB4444TORGB - "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "ldp q0, q3, [%1], #32 \n" // load 16 ARGB4444 pixels. - ARGB4444TORGB - "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_argb4444_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28" - - ); -} - -void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile( - "movi v24.16b, #25 \n" // B * 0.1016 coefficient - "movi v25.16b, #129 \n" // G * 0.5078 coefficient - "movi v26.16b, #66 \n" // R * 0.2578 coefficient - "movi v27.16b, #16 \n" // Add 16 constant - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umull2 v4.8h, v0.16b, v24.16b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal2 v4.8h, v1.16b, v25.16b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "umlal2 v4.8h, v2.16b, v26.16b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqrshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "uqadd v1.8b, v1.8b, v27.8b \n" - "stp d0, d1, [%1], #16 \n" // store 8 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", - "v27"); -} - -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - asm volatile( - "movi v4.16b, #25 \n" // B * 0.1016 coefficient - "movi v5.16b, #129 \n" // G * 0.5078 coefficient - "movi v6.16b, #66 \n" // R * 0.2578 coefficient - "movi v7.16b, #16 \n" // Add 16 constant - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - RGB555TOARGB - "umull v16.8h, v0.8b, v4.8b \n" // B - "umull2 v17.8h, v0.16b, v4.16b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal2 v17.8h, v1.16b, v5.16b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "umlal2 v17.8h, v2.16b, v6.16b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.16b, v0.16b, v7.16b \n" - "str q0, [%1], #16 \n" // store pixels Y. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v29"); -} - -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - asm volatile( - "movi v24.16b, #25 \n" // B * 0.1016 coefficient - "movi v25.16b, #129 \n" // G * 0.5078 coefficient - "movi v26.16b, #66 \n" // R * 0.2578 coefficient - "movi v27.16b, #16 \n" // Add 16 constant - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - ARGB4444TORGB - "umull v16.8h, v0.8b, v24.8b \n" // B - "umull2 v17.8h, v0.16b, v24.16b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v25.8b \n" // G - "umlal2 v17.8h, v1.16b, v25.16b \n" // G - "umlal v16.8h, v2.8b, v26.8b \n" // R - "umlal2 v17.8h, v2.16b, v26.16b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.16b, v0.16b, v27.16b \n" - "str q0, [%1], #16 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); -} - -struct RgbConstants { - uint8_t kRGBToY[4]; - uint16_t kAddY; -}; - -// ARGB expects first 3 values to contain RGB and 4th value is ignored. -static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - asm volatile ( - "ldr d0, [%3] \n" // load rgbconstants - "dup v6.16b, v0.b[0] \n" - "dup v7.16b, v0.b[1] \n" - "dup v16.16b, v0.b[2] \n" - "dup v17.8h, v0.h[2] \n" - "1: \n" - "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16 - // pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "umull v0.8h, v2.8b, v6.8b \n" // B - "umull2 v1.8h, v2.16b, v6.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v3.8b, v7.8b \n" // G - "umlal2 v1.8h, v3.16b, v7.16b \n" - "umlal v0.8h, v4.8b, v16.8b \n" // R - "umlal2 v1.8h, v4.16b, v16.16b \n" - "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y - "addhn v1.8b, v1.8h, v17.8h \n" - "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(rgbconstants) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17"); -} - -static void ARGBToYMatrixRow_NEON_DotProd( - const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - asm volatile ( - "ldr d0, [%3] \n" // load rgbconstants - "dup v16.4s, v0.s[0] \n" - "dup v17.8h, v0.h[2] \n" - "1: \n" - "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16 - // pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "movi v0.16b, #0 \n" - "movi v1.16b, #0 \n" - "movi v2.16b, #0 \n" - "movi v3.16b, #0 \n" - "udot v0.4s, v4.16b, v16.16b \n" - "udot v1.4s, v5.16b, v16.16b \n" - "udot v2.4s, v6.16b, v16.16b \n" - "udot v3.4s, v7.16b, v16.16b \n" - "uzp1 v0.8h, v0.8h, v1.8h \n" - "uzp1 v1.8h, v2.8h, v3.8h \n" - "addhn v0.8b, v0.8h, v17.8h \n" - "addhn v1.8b, v1.8h, v17.8h \n" - "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(rgbconstants) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17"); -} - -// RGB to JPeg coefficients -// B * 0.1140 coefficient = 29 -// G * 0.5870 coefficient = 150 -// R * 0.2990 coefficient = 77 -// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; -static const struct RgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, - 128}; - -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; - -// RGB to BT.601 coefficients -// B * 0.1016 coefficient = 25 -// G * 0.5078 coefficient = 129 -// R * 0.2578 coefficient = 66 -// Add 16.5 = 0x1080 - -static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080}; -static const struct RgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, - 0x1080}; - -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; -static const struct RgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, - 0x1080}; - -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); -} - -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); -} - -void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); -} - -void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); -} - -void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kRgb24I601Constants); -} - -void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb, - uint8_t* dst_yj, - int width) { - ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kRgb24JPEGConstants); -} - -void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr, - uint8_t* dst_y, - int width) { - ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kRawI601Constants); -} - -void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr, - uint8_t* dst_yj, - int width) { - ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kRawJPEGConstants); -} - -// RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
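/*
 * With the constants above, every *ToY row boils down to one
 * multiply-accumulate per channel followed by a biased narrowing shift:
 *   BT.601 (studio range): Y  = (25*B + 129*G +  66*R + 0x1080) >> 8
 *   JPEG   (full range)  : YJ = (29*B + 150*G +  77*R + 0x0080) >> 8
 * Scalar sketch for one ARGB pixel, reusing struct RgbConstants defined
 * above; the function name is illustrative.  Usage would look like
 * ARGBPixelToY_C(px, &kRgb24I601Constants).
 */
static uint8_t ARGBPixelToY_C(const uint8_t* argb,
                              const struct RgbConstants* k) {
  return (uint8_t)((k->kRGBToY[0] * argb[0] +   /* B */
                    k->kRGBToY[1] * argb[1] +   /* G */
                    k->kRGBToY[2] * argb[2] +   /* R */
                    k->kAddY) >> 8);
}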
-// Same code as ARGB, except the LD4 -static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - asm volatile ( - "ldr d0, [%3] \n" // load rgbconstants - "dup v6.16b, v0.b[0] \n" - "dup v7.16b, v0.b[1] \n" - "dup v16.16b, v0.b[2] \n" - "dup v17.8h, v0.h[2] \n" - "1: \n" - "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16 - // pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "umull v0.8h, v2.8b, v6.8b \n" // B - "umull2 v1.8h, v2.16b, v6.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v3.8b, v7.8b \n" // G - "umlal2 v1.8h, v3.16b, v7.16b \n" - "umlal v0.8h, v4.8b, v16.8b \n" // R - "umlal2 v1.8h, v4.16b, v16.16b \n" - "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y - "addhn v1.8b, v1.8h, v17.8h \n" - "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(rgbconstants) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17"); -} - -void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); -} - -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { - RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); -} - -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); -} - -void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba, - uint8_t* dst_y, - int width) { - // No need for a separate implementation for RGBA inputs, just permute the - // RGB constants. - ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width, - &kRgb24I601DotProdConstants); -} - -void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba, - uint8_t* dst_yj, - int width) { - // No need for a separate implementation for RGBA inputs, just permute the - // RGB constants. - ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width, - &kRgb24JPEGDotProdConstants); -} - -void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra, - uint8_t* dst_y, - int width) { - // No need for a separate implementation for RGBA inputs, just permute the - // RGB constants. - ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width, - &kRawI601DotProdConstants); -} - -static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - asm volatile ( - "ldr d0, [%3] \n" // load rgbconstants - "dup v5.16b, v0.b[0] \n" - "dup v6.16b, v0.b[1] \n" - "dup v7.16b, v0.b[2] \n" - "dup v16.8h, v0.h[2] \n" - "1: \n" - "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "umull v0.8h, v2.8b, v5.8b \n" // B - "umull2 v1.8h, v2.16b, v5.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v3.8b, v6.8b \n" // G - "umlal2 v1.8h, v3.16b, v6.16b \n" - "umlal v0.8h, v4.8b, v7.8b \n" // R - "umlal2 v1.8h, v4.16b, v7.16b \n" - "addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y - "addhn v1.8b, v1.8h, v16.8h \n" - "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(rgbconstants) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); -} - -void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); -} - -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); -} - -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); -} - -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" - // General purpose row blend. - "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction), // %4 - "+r"(y0_fraction) // %5 - : - : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); -} - -// Bilinear filter 8x2 -> 8x1 -void InterpolateRow_16_NEON(uint16_t* dst_ptr, - const uint16_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint16_t* src_ptr1 = src_ptr + src_stride; - - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - - "dup v5.8h, %w4 \n" - "dup v4.8h, %w5 \n" - // General purpose row blend. - "1: \n" - "ld1 {v0.8h}, [%1], #16 \n" - "ld1 {v1.8h}, [%2], #16 \n" - "subs %w3, %w3, #8 \n" - "umull v2.4s, v0.4h, v4.4h \n" - "prfm pldl1keep, [%1, 448] \n" - "umull2 v3.4s, v0.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "umlal v2.4s, v1.4h, v5.4h \n" - "umlal2 v3.4s, v1.8h, v5.8h \n" - "rshrn v0.4h, v2.4s, #8 \n" - "rshrn2 v0.8h, v3.4s, #8 \n" - "st1 {v0.8h}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 50 / 50. 
- "50: \n" - "ld1 {v0.8h}, [%1], #16 \n" - "ld1 {v1.8h}, [%2], #16 \n" - "subs %w3, %w3, #8 \n" - "prfm pldl1keep, [%1, 448] \n" - "urhadd v0.8h, v0.8h, v1.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.8h}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "ld1 {v0.8h}, [%1], #16 \n" - "subs %w3, %w3, #8 \n" - "prfm pldl1keep, [%1, 448] \n" - "st1 {v0.8h}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width) // %3 - : "r"(y1_fraction), // %4 - "r"(y0_fraction) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); -} - -// Bilinear filter 8x2 -> 8x1 -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits -void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, - const uint16_t* src_ptr, - ptrdiff_t src_stride, - int scale, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint16_t* src_ptr1 = src_ptr + src_stride; - int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - - asm volatile ( - "dup v6.8h, %w6 \n" - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - - "dup v5.8h, %w4 \n" - "dup v4.8h, %w5 \n" - // General purpose row blend. - "1: \n" - "ld1 {v0.8h}, [%1], #16 \n" - "ld1 {v1.8h}, [%2], #16 \n" - "subs %w3, %w3, #8 \n" - "umull v2.4s, v0.4h, v4.4h \n" - "prfm pldl1keep, [%1, 448] \n" - "umull2 v3.4s, v0.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "umlal v2.4s, v1.4h, v5.4h \n" - "umlal2 v3.4s, v1.8h, v5.8h \n" - "rshrn v0.4h, v2.4s, #8 \n" - "rshrn2 v0.8h, v3.4s, #8 \n" - "ushl v0.8h, v0.8h, v6.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%0], #8 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "ld1 {v0.8h}, [%1], #16 \n" - "ld1 {v1.8h}, [%2], #16 \n" - "subs %w3, %w3, #8 \n" - "prfm pldl1keep, [%1, 448] \n" - "urhadd v0.8h, v0.8h, v1.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "ushl v0.8h, v0.8h, v6.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%0], #8 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "ldr q0, [%1], #16 \n" - "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative - "prfm pldl1keep, [%1, 448] \n" - "uqxtn v0.8b, v0.8h \n" - "subs %w3, %w3, #8 \n" // 8 src pixels per loop - "str d0, [%0], #8 \n" // store 8 pixels - "b.gt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width) // %3 - : "r"(y1_fraction), // %4 - "r"(y0_fraction), // %5 - "r"(shift) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile ( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" - // Blend 8 pixels. - "8: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
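/*
 * The InterpolateRow variants above blend two source rows with an 8-bit
 * vertical fraction:
 *   out = (row0*(256 - f) + row1*f + 128) >> 8
 * with fast paths for f == 0 (plain copy) and f == 128 (urhadd, a rounded
 * average).  The 16-bit and 16To8 variants apply the same arithmetic to
 * uint16_t data.  Scalar sketch; the name is illustrative.
 */
static void InterpolateRow_C_Sketch(uint8_t* dst, const uint8_t* row0,
                                    const uint8_t* row1, int width, int f) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((row0[i] * (256 - f) + row1[i] * f + 128) >> 8);
  }
}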
- "umull v16.8h, v4.8b, v3.8b \n" // db * a - "prfm pldl1keep, [%0, 448] \n" - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "prfm pldl1keep, [%1, 448] \n" - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - // pixels - "b.ge 8b \n" - - "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" - - // Blend 1 pixels. - "1: \n" - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel - // ARGB0. - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel - // ARGB1. - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "prfm pldl1keep, [%0, 448] \n" - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "prfm pldl1keep, [%1, 448] \n" - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" - - "99: \n" - - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18"); -} - -// Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile ( - "movi v7.8h, #0x00ff \n" // 255 for rounding up - - // Attenuate 8 pixels. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v3.8b \n" // b * a - "prfm pldl1keep, [%0, 448] \n" - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "addhn v0.8b, v4.8h, v7.8h \n" // (b + 255) >> 8 - "addhn v1.8b, v5.8h, v7.8h \n" // (g + 255) >> 8 - "addhn v2.8b, v6.8h, v7.8h \n" // (r + 255) >> 8 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Quantize 8 ARGB pixels (32 bytes). -// dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile ( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add - - // 8 pixel loop. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) - "prfm pldl1keep, [%0, 448] \n" - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// Shade 8 pixels at a time by specified value. -// sqrdmulh is a rounding instruction, so +1 if high bit of low half of -// multiply result is set. -void ARGBShadeRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile ( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.8h \n" // argb * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.8h \n" - "sqrdmulh v6.8h, v6.8h, v0.8h \n" - "sqrdmulh v7.8h, v7.8h, v0.8h \n" - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - "st1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); -} - -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -// Similar to ARGBToYJ but stores ARGB. -// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; -void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( - "movi v24.8b, #29 \n" // B * 0.1140 coefficient - "movi v25.8b, #150 \n" // G * 0.5870 coefficient - "movi v26.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B - "mov v1.8b, v0.8b \n" // G - "mov v2.8b, v0.8b \n" // R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); -} - -static const uvec8 kARGBGrayRowCoeffs = {29, 150, 77, 0}; -static const uvec8 kARGBGrayRowIndices = {0, 0, 0, 19, 2, 2, 2, 23, - 4, 4, 4, 27, 6, 6, 6, 31}; - -void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - "ld1r {v24.4s}, [%[coeffs]] \n" - "ldr q25, [%[indices]] \n" - "1: \n" - "ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB - "movi v0.4s, #0 \n" - "movi v2.4s, #0 \n" - "subs %w[width], %w[width], #8 \n" // 8 processed per loop - "udot v0.4s, v1.16b, v24.16b \n" - "udot v2.4s, v3.16b, v24.16b \n" - "prfm pldl1keep, [%[src], 448] \n" - "uqrshrn v0.8b, v0.8h, #8 \n" - "uqrshrn v2.8b, v2.8h, #8 \n" - "tbl v0.16b, {v0.16b, v1.16b}, v25.16b \n" // merge in alpha - "tbl v1.16b, {v2.16b, v3.16b}, v25.16b \n" - "stp q0, q1, [%[dst]], #32 \n" // store 8 pixels - "b.gt 1b \n" - : [src] "+r"(src_argb), // %[src] - [dst] "+r"(dst_argb), // %[dst] - [width] "+r"(width) // %[width] - : [coeffs] "r"(&kARGBGrayRowCoeffs), // %[coeffs] - [indices] "r"(&kARGBGrayRowIndices) // %[indices] - : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25"); -} - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 - -void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile ( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "prfm pldl1keep, [%0, 448] \n" - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. 
- "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); -} - -static const uvec8 kARGBSepiaRowCoeffs = {17, 68, 35, 0, 22, 88, - 45, 0, 24, 98, 50, 0}; -static const uvec8 kARGBSepiaRowAlphaIndices = {3, 7, 11, 15, 19, 23, 27, 31}; - -void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) { - asm volatile( - "ld3r {v20.4s, v21.4s, v22.4s}, [%[coeffs]] \n" - "ldr d23, [%[indices]] \n" - "1: \n" - "ldp q0, q1, [%[dst]] \n" - "movi v2.4s, #0 \n" - "movi v3.4s, #0 \n" - "movi v4.4s, #0 \n" - "movi v5.4s, #0 \n" - "movi v6.4s, #0 \n" - "movi v7.4s, #0 \n" - "udot v2.4s, v0.16b, v20.16b \n" - "udot v3.4s, v1.16b, v20.16b \n" - "udot v4.4s, v0.16b, v21.16b \n" - "udot v5.4s, v1.16b, v21.16b \n" - "udot v6.4s, v0.16b, v22.16b \n" - "udot v7.4s, v1.16b, v22.16b \n" - "subs %w1, %w1, #8 \n" - "prfm pldl1keep, [%[dst], 448] \n" - "uzp1 v6.8h, v6.8h, v7.8h \n" - "uzp1 v5.8h, v4.8h, v5.8h \n" - "uzp1 v4.8h, v2.8h, v3.8h \n" - "tbl v3.16b, {v0.16b, v1.16b}, v23.16b \n" - "uqshrn v0.8b, v4.8h, #7 \n" - "uqshrn v1.8b, v5.8h, #7 \n" - "uqshrn v2.8b, v6.8h, #7 \n" - "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst]], #32 \n" - "b.gt 1b \n" - : [dst] "+r"(dst_argb), // %[dst] - [width] "+r"(width) // %[width] - : [coeffs] "r"(&kARGBSepiaRowCoeffs), // %[coeffs] - [indices] "r"(&kARGBSepiaRowAlphaIndices) // %[indices] - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); -} - -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// TODO(fbarchard): Was same as Sepia except matrix is provided. This function -// needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile ( - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit - "prfm pldl1keep, [%0, 448] \n" - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v22", "v23", "v24", "v25"); -} - -void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile ( - "ld1 {v31.16b}, [%[matrix_argb]] \n" - - "1: \n" - "ld1 {v0.16b, v1.16b}, [%[src_argb]], #32 \n" - - "movi v16.4s, #0 \n" - "movi v17.4s, #0 \n" - "movi v18.4s, #0 \n" - "movi v19.4s, #0 \n" - "movi v20.4s, #0 \n" - "movi v21.4s, #0 \n" - "movi v22.4s, #0 \n" - "movi v23.4s, #0 \n" - - // 8 processed per loop. 
- "subs %w2, %w2, #8 \n" - "prfm pldl1keep, [%[src_argb], 448] \n" - - "sudot v16.4s, v31.16b, v0.4b[0] \n" - "sudot v17.4s, v31.16b, v0.4b[1] \n" - "sudot v18.4s, v31.16b, v0.4b[2] \n" - "sudot v19.4s, v31.16b, v0.4b[3] \n" - "sudot v20.4s, v31.16b, v1.4b[0] \n" - "sudot v21.4s, v31.16b, v1.4b[1] \n" - "sudot v22.4s, v31.16b, v1.4b[2] \n" - "sudot v23.4s, v31.16b, v1.4b[3] \n" - - "shrn v16.4h, v16.4s, #6 \n" - "shrn v18.4h, v18.4s, #6 \n" - "shrn v20.4h, v20.4s, #6 \n" - "shrn v22.4h, v22.4s, #6 \n" - "shrn2 v16.8h, v17.4s, #6 \n" - "shrn2 v18.8h, v19.4s, #6 \n" - "shrn2 v20.8h, v21.4s, #6 \n" - "shrn2 v22.8h, v23.4s, #6 \n" - - "uqxtn v16.8b, v16.8h \n" - "uqxtn v18.8b, v18.8h \n" - "uqxtn v20.8b, v20.8h \n" - "uqxtn v22.8b, v22.8h \n" - - "stp d16, d18, [%[dst_argb]], #16 \n" - "stp d20, d22, [%[dst_argb]], #16 \n" - "b.gt 1b \n" - : [src_argb] "+r"(src_argb), // %[src_argb] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [matrix_argb] "r"(matrix_argb) // %[matrix_argb] - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v31"); -} - -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "prfm pldl1keep, [%0, 448] \n" - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "prfm pldl1keep, [%1, 448] \n" - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 8 ARGB - "ldp q4, q5, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "uqadd v0.16b, v0.16b, v4.16b \n" - "uqadd v1.16b, v1.16b, v5.16b \n" - "stp q0, q1, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 8 ARGB - "ldp q4, q5, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "uqsub v0.16b, v0.16b, v4.16b \n" - "uqsub v1.16b, v1.16b, v5.16b \n" - "stp q0, q1, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v1.8b \n" // add - "prfm pldl1keep, [%0, 448] \n" - "mov v1.8b, v0.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "mov v2.8b, v0.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "uqadd v0.16b, v0.16b, v1.16b \n" // add - "prfm pldl1keep, [%1, 448] \n" - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1"); -} - -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "prfm pldl1keep, [%0, 448] \n" - "uqadd v1.8b, v0.8b, v2.8b \n" // add - "prfm pldl1keep, [%1, 448] \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.8b}, [%0],%5 \n" // top - "ld1 {v1.8b}, [%0],%6 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - "ld1 {v3.8b}, [%1],%6 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%2],%5 \n" // bottom - "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "prfm pldl1keep, [%2, 448] \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2LL), // %5 - "r"(6LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile ( - "1: \n" - "ld1 {v0.8b}, [%0],%4 \n" // left - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%0],%5 \n" // right - "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "add v0.8h, v0.8h, v1.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1LL), // %4 - "r"(6LL) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Caveat - rounds float to half float whereas scaling version truncates. 
-void HalfFloat1Row_NEON(const uint16_t* src, - uint16_t* dst, - float /*unused*/, - int width) { - asm volatile ( - "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "prfm pldl1keep, [%0, 448] \n" - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fcvtn v1.4h, v2.4s \n" // 8 half floats - "fcvtn2 v1.8h, v3.4s \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3"); -} - -void HalfFloatRow_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - asm volatile ( - "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "prfm pldl1keep, [%0, 448] \n" - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent - "fmul v3.4s, v3.4s, %3.s[0] \n" - "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat - "uqshrn2 v1.8h, v3.4s, #13 \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "v1", "v2", "v3"); -} - -void ByteToFloatRow_NEON(const uint8_t* src, - float* dst, - float scale, - int width) { - asm volatile ( - "1: \n" - "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v1.8h, v1.8b \n" // 8 shorts - "prfm pldl1keep, [%0, 448] \n" - "uxtl v2.4s, v1.4h \n" // 8 ints - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "fmul v3.4s, v3.4s, %3.s[0] \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale) // %3 - : "cc", "memory", "v1", "v2", "v3"); -} - -// Convert FP16 Half Floats to FP32 Floats -void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 - float* dst, - int width) { - asm volatile ( - "1: \n" - "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats - "subs %w2, %w2, #8 \n" // 8 floats per loop - "prfm pldl1keep, [%0, 448] \n" - "fcvtl v2.4s, v1.4h \n" // 8 floats - "fcvtl2 v3.4s, v1.8h \n" - "stp q2, q3, [%1], #32 \n" // store 8 floats - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3"); -} - -// Convert FP16 Half Floats to FP32 Floats -// Read a column and write a row -void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 - int src_stride, // stride in elements - float* dst, - int width) { - asm volatile ( - "cmp %w2, #8 \n" // Is there 8 rows? - "b.lo 2f \n" - "1: \n" - "ld1 {v0.h}[0], [%0], %3 \n" // load 8 halffloats - "ld1 {v0.h}[1], [%0], %3 \n" - "ld1 {v0.h}[2], [%0], %3 \n" - "ld1 {v0.h}[3], [%0], %3 \n" - "ld1 {v1.h}[0], [%0], %3 \n" - "ld1 {v1.h}[1], [%0], %3 \n" - "ld1 {v1.h}[2], [%0], %3 \n" - "ld1 {v1.h}[3], [%0], %3 \n" - "subs %w2, %w2, #8 \n" // 8 rows per loop - "prfm pldl1keep, [%0, 448] \n" - "fcvtl v2.4s, v0.4h \n" // 4 floats - "fcvtl v3.4s, v1.4h \n" // 4 more floats - "stp q2, q3, [%1], #32 \n" // store 8 floats - "b.gt 1b \n" - "cmp %w2, #1 \n" // Is there 1 value? 
- "b.lo 3f \n" - "2: \n" - "ld1 {v1.h}[0], [%0], %3 \n" // load 1 halffloats - "subs %w2, %w2, #1 \n" // 1 floats per loop - "fcvtl v2.4s, v1.4h \n" // 1 floats - "str s2, [%1], #4 \n" // store 1 floats - "b.gt 2b \n" - "3: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(src_stride * 2)) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Convert FP32 Floats to FP16 Half Floats -void ConvertFP32ToFP16Row_NEON(const float* src, - uint16_t* dst, // fp16 - int width) { - asm volatile ( - "1: \n" - "ldp q2, q3, [%0], #32 \n" // load 8 floats - "subs %w2, %w2, #8 \n" // 8 floats per loop - "prfm pldl1keep, [%0, 448] \n" - "fcvtn v1.4h, v2.4s \n" // 8 fp16 halffloats - "fcvtn2 v1.8h, v3.4s \n" - "str q1, [%1], #16 \n" // store 8 fp16 halffloats - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3"); -} - -float ScaleMaxSamples_NEON(const float* src, - float* dst, - float scale, - int width) { - float fmax; - asm volatile ( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "prfm pldl1keep, [%0, 448] \n" - "fmul v4.4s, v2.4s, %4.s[0] \n" // scale - "fmax v5.4s, v5.4s, v1.4s \n" // max - "fmax v6.4s, v6.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "fmax v5.4s, v5.4s, v6.4s \n" // max - "fmaxv %s3, v5.4s \n" // signed max acculator - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width), // %2 - "=w"(fmax) // %3 - : "w"(scale) // %4 - : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); - return fmax; -} - -float ScaleSumSamples_NEON(const float* src, - float* dst, - float scale, - int width) { - float fsum; - asm volatile ( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" // max - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "prfm pldl1keep, [%0, 448] \n" - "fmul v4.4s, v2.4s, %4.s[0] \n" - "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares - "fmla v6.4s, v2.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "faddp v5.4s, v5.4s, v6.4s \n" - "faddp v5.4s, v5.4s, v5.4s \n" - "faddp %3.4s, v5.4s, v5.4s \n" // sum - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width), // %2 - "=w"(fsum) // %3 - : "w"(scale) // %4 - : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); - return fsum; -} - -void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { - asm volatile ( - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v1.4s, v1.4s, %3.s[0] \n" // scale - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale) // %3 - : "cc", "memory", "v1", "v2"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
-void GaussCol_NEON(const uint16_t* src0, - const uint16_t* src1, - const uint16_t* src2, - const uint16_t* src3, - const uint16_t* src4, - uint32_t* dst, - int width) { - asm volatile ( - "movi v6.8h, #4 \n" // constant 4 - "movi v7.8h, #6 \n" // constant 6 - - "1: \n" - "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows - "ld1 {v2.8h}, [%4], #16 \n" - "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 - "prfm pldl1keep, [%0, 448] \n" - "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 - "ld1 {v2.8h}, [%1], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "prfm pldl1keep, [%1, 448] \n" - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "ld1 {v2.8h}, [%2], #16 \n" - "umlal v0.4s, v2.4h, v7.4h \n" // * 6 - "prfm pldl1keep, [%2, 448] \n" - "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 - "ld1 {v2.8h}, [%3], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "prfm pldl1keep, [%3, 448] \n" - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples - "prfm pldl1keep, [%4, 448] \n" - "b.gt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : - : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { - const uint32_t* src1 = src + 1; - const uint32_t* src2 = src + 2; - const uint32_t* src3 = src + 3; - asm volatile ( - "movi v6.4s, #4 \n" // constant 4 - "movi v7.4s, #6 \n" // constant 6 - - "1: \n" - "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples - "add v0.4s, v0.4s, v1.4s \n" // * 1 - "add v1.4s, v1.4s, v2.4s \n" // * 1 - "ld1 {v2.4s,v3.4s}, [%2], #32 \n" - "mla v0.4s, v2.4s, v7.4s \n" // * 6 - "mla v1.4s, v3.4s, v7.4s \n" // * 6 - "ld1 {v2.4s,v3.4s}, [%1], #32 \n" - "ld1 {v4.4s,v5.4s}, [%3], #32 \n" - "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 - "add v3.4s, v3.4s, v5.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "mla v0.4s, v2.4s, v6.4s \n" // * 4 - "mla v1.4s, v3.4s, v6.4s \n" // * 4 - "subs %w5, %w5, #8 \n" // 8 processed per loop - "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack - "uqrshrn2 v0.8h, v1.4s, #8 \n" - "st1 {v0.8h}, [%4], #16 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(dst), // %4 - "+r"(width) // %5 - : "r"(32LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f}; - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
-void GaussCol_F32_NEON(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, - int width) { - asm volatile ( - "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 - - "1: \n" - "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows - "ld1 {v2.4s, v3.4s}, [%1], #32 \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "ld1 {v4.4s, v5.4s}, [%2], #32 \n" - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "fmla v0.4s, v4.4s, v7.4s \n" // * 6 - "ld1 {v2.4s, v3.4s}, [%3], #32 \n" - "fmla v1.4s, v5.4s, v7.4s \n" - "prfm pldl1keep, [%1, 448] \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "ld1 {v4.4s, v5.4s}, [%4], #32 \n" - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%2, 448] \n" - "fadd v0.4s, v0.4s, v4.4s \n" // * 1 - "prfm pldl1keep, [%3, 448] \n" - "fadd v1.4s, v1.4s, v5.4s \n" - "prfm pldl1keep, [%4, 448] \n" - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : "r"(&kGaussCoefficients) // %7 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_F32_NEON(const float* src, float* dst, int width) { - asm volatile ( - "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 - - "1: \n" - "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 - // rows - "fadd v0.4s, v0.4s, v1.4s \n" // * 1 - "ld1 {v4.4s, v5.4s}, [%0], %5 \n" - "fadd v1.4s, v1.4s, v2.4s \n" - "fmla v0.4s, v4.4s, v7.4s \n" // * 6 - "ld1 {v2.4s, v3.4s}, [%0], %4 \n" - "fmla v1.4s, v5.4s, v7.4s \n" - "ld1 {v4.4s, v5.4s}, [%0], %6 \n" - "fadd v2.4s, v2.4s, v4.4s \n" - "fadd v3.4s, v3.4s, v5.4s \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "fmul v0.4s, v0.4s, v8.4s \n" // / 256 - "fmul v1.4s, v1.4s, v8.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(&kGaussCoefficients), // %3 - "r"(8LL), // %4 - "r"(-4LL), // %5 - "r"(20LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); -} - -#if defined(LIBYUV_USE_ST3) -// Convert biplanar NV21 to packed YUV24 -void NV21ToYUV24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile ( - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "prfm pldl1keep, [%0, 448] \n" - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2"); -} -#else -static const uvec8 kYUV24Shuffle[3] = { - {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20}, - {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27}, - {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}}; - -// Convert biplanar NV21 to packed YUV24 -// NV21 has VU in memory for chroma. 
-// YUV24 is VUY in memory -void NV21ToYUV24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile ( - "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values - "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values - "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24 - "prfm pldl1keep, [%0, 448] \n" - "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n" - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : "r"(&kYUV24Shuffle[0]) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} -#endif // LIBYUV_USE_ST3 - -// Note ST2 8b version is faster than zip+ST1 - -// AYUV is VUYA in memory. UV for NV12 is UV order in memory. -void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width) { - const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile ( - - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v2.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_ayuv_1), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width) { - const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile ( - - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_ayuv_1), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Copy row of AYUV Y's into Y -void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Convert UV plane of NV12 to VU of NV21. 
-void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values - "ld1 {v1.16b}, [%0], 16 \n" - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "rev16 v0.16b, v0.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "rev16 v1.16b, v1.16b \n" - "stp q0, q1, [%1], 32 \n" // store 16 VU pixels - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1"); -} - -void HalfMergeUVRow_NEON(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - const uint8_t* src_u_1 = src_u + src_stride_u; - const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values - "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values - "ld1 {v2.16b}, [%1], #16 \n" - "ld1 {v3.16b}, [%3], #16 \n" - "uaddlp v0.8h, v0.16b \n" // half size - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "uadalp v0.8h, v2.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v3.16b \n" - "prfm pldl1keep, [%3, 448] \n" - "uqrshrn v0.8b, v0.8h, #2 \n" - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w5, %w5, #16 \n" // 16 src pixels per loop - "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_u_1), // %1 - "+r"(src_v), // %2 - "+r"(src_v_1), // %3 - "+r"(dst_uv), // %4 - "+r"(width) // %5 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void SplitUVRow_16_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - int shift = depth - 16; // Negative for right shift. - asm volatile ( - "dup v2.8h, %w4 \n" - "1: \n" - "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV - "subs %w3, %w3, #8 \n" // 8 src pixels per loop - "ushl v0.8h, v0.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v2.8h \n" - "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels - "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "v0", "v1", "v2"); -} - -void MultiplyRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile ( - "dup v2.8h, %w3 \n" - "1: \n" - "ldp q0, q1, [%0], #32 \n" - "mul v0.8h, v0.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "mul v1.8h, v1.8h, v2.8h \n" - "stp q0, q1, [%1], #32 \n" // store 16 pixels - "subs %w2, %w2, #16 \n" // 16 src pixels per loop - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "cc", "memory", "v0", "v1", "v2"); -} - -void DivideRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile ( - "dup v4.8h, %w3 \n" - "1: \n" - "ldp q2, q3, [%0], #32 \n" - "umull v0.4s, v2.4h, v4.4h \n" - "umull2 v1.4s, v2.8h, v4.8h \n" - "umull v2.4s, v3.4h, v4.4h \n" - "umull2 v3.4s, v3.8h, v4.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "uzp2 v0.8h, v0.8h, v1.8h \n" - "uzp2 v1.8h, v2.8h, v3.8h \n" - "stp q0, q1, [%1], #32 \n" // store 16 pixels - "subs %w2, %w2, #16 \n" // 16 src pixels per loop - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits = shr 1 -// 16384 = 10 bits = shr 2 -// 
4096 = 12 bits = shr 4 -// 256 = 16 bits = shr 8 -void Convert16To8Row_NEON(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - // 15 - clz(scale), + 8 to shift result into the high half of the lane to - // saturate, then we can just use UZP2 to narrow rather than a pair of - // saturating narrow instructions. - int shift = 23 - __builtin_clz((int32_t)scale); - asm volatile ( - "dup v2.8h, %w3 \n" - "1: \n" - "ldp q0, q1, [%0], #32 \n" - "uqshl v0.8h, v0.8h, v2.8h \n" - "uqshl v1.8h, v1.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "uzp2 v0.16b, v0.16b, v1.16b \n" - "subs %w2, %w2, #16 \n" // 16 src pixels per loop - "str q0, [%1], #16 \n" // store 16 pixels - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(shift) // %3 - : "cc", "memory", "v0", "v1", "v2"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/row_rvv.c b/drivers/media/pci/tbscapture2/row_rvv.c deleted file mode 100644 index 916a108dde19..000000000000 --- a/drivers/media/pci/tbscapture2/row_rvv.c +++ /dev/null @@ -1,2599 +0,0 @@ -/* - * Copyright 2023 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -/* - * Copyright (c) 2023 SiFive, Inc. All rights reserved. - * - * Contributed by Darren Hsieh - * Contributed by Bruce Lai - */ - -#include "row.h" - -// This module is for clang rvv. GCC hasn't supported segment load & store. -#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \ - defined(__clang__) -#include -#include - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#ifdef LIBYUV_RVV_HAS_VXRM_ARG -// Fill YUV -> RGB conversion constants into vectors -#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ - { \ - ub = yuvconst->kUVCoeff[0]; \ - vr = yuvconst->kUVCoeff[1]; \ - ug = yuvconst->kUVCoeff[2]; \ - vg = yuvconst->kUVCoeff[3]; \ - yg = yuvconst->kRGBCoeffBias[0]; \ - bb = yuvconst->kRGBCoeffBias[1] + 32; \ - bg = yuvconst->kRGBCoeffBias[2] - 32; \ - br = yuvconst->kRGBCoeffBias[3] + 32; \ - } -#else -// Fill YUV -> RGB conversion constants into vectors -// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode -// register) is set to round-to-nearest-up mode(0). 
-#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ - { \ - asm volatile ("csrwi vxrm, 0"); \ - ub = yuvconst->kUVCoeff[0]; \ - vr = yuvconst->kUVCoeff[1]; \ - ug = yuvconst->kUVCoeff[2]; \ - vg = yuvconst->kUVCoeff[3]; \ - yg = yuvconst->kRGBCoeffBias[0]; \ - bb = yuvconst->kRGBCoeffBias[1] + 32; \ - bg = yuvconst->kRGBCoeffBias[2] - 32; \ - br = yuvconst->kRGBCoeffBias[3] + 32; \ - } -#endif -// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422 -#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ - { \ - vuint8m1_t v_tmp0, v_tmp1; \ - vuint8m2_t v_y; \ - vuint16m2_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ - v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ - v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ - v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ - } - -// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444 -#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ - { \ - vuint8m2_t v_y; \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_u = __riscv_vle8_v_u8m2(src_u, vl); \ - v_v = __riscv_vle8_v_u8m2(src_v, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ - } - -// Convert from YUV to fixed point RGB -#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \ - v_b_16, v_r_16) \ - { \ - vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ - vuint32m8_t v_tmp5; \ - v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \ - v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \ - v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \ - v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl); \ - v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \ - v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \ - v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \ - v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \ - v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \ - v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \ - v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \ - v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \ - } - -#ifdef LIBYUV_RVV_HAS_VXRM_ARG -// Convert from fixed point RGB To 8 bit RGB -#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ - { \ - v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, __RISCV_VXRM_RNU, vl); \ - v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, __RISCV_VXRM_RNU, vl); \ - v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, __RISCV_VXRM_RNU, vl); \ - } -#else -// Convert from fixed point RGB To 8 bit RGB -#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ - { \ - v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \ - v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \ - v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ - } -#endif - -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv -#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ - { \ - vuint8m1x2_t v_tmp; \ - vuint8m1_t v_tmp0, v_tmp1; \ - vuint8m2_t v_y; \ - vuint16m2_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ - v_tmp = __riscv_vlseg2e8_v_u8m1x2(src_uv, vl); \ - v_tmp0 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 0); \ - 
v_tmp1 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 1); \ - v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ - } - -// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu -#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \ - { \ - vuint8m1x2_t v_tmp; \ - vuint8m1_t v_tmp0, v_tmp1; \ - vuint8m2_t v_y; \ - vuint16m2_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ - v_tmp = __riscv_vlseg2e8_v_u8m1x2(src_vu, vl); \ - v_tmp0 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 0); \ - v_tmp1 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 1); \ - v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ - } -#else -// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv -#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ - { \ - vuint8m1_t v_tmp0, v_tmp1; \ - vuint8m2_t v_y; \ - vuint16m2_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ - __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \ - v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ - } - -// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu -#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \ - { \ - vuint8m1_t v_tmp0, v_tmp1; \ - vuint8m2_t v_y; \ - vuint16m2_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ - __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \ - v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ - } -#endif - -#ifdef HAS_ARGBTOAR64ROW_RVV -void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - size_t avl = (size_t)4 * width; - do { - vuint16m8_t v_ar64; - vuint8m4_t v_argb; - size_t vl = __riscv_vsetvl_e8m4(avl); - v_argb = __riscv_vle8_v_u8m4(src_argb, vl); - v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl); - v_ar64 = __riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl); - __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl); - avl -= vl; - src_argb += vl; - dst_ar64 += vl; - } while (avl > 0); -} -#endif - -#ifdef HAS_ARGBTOAB64ROW_RVV -#ifdef 
LIBYUV_RVV_HAS_TUPLE_TYPE -void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - size_t avl = (size_t)width; - do { - vuint16m2x4_t v_dst_ab64; - vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; - size_t vl = __riscv_vsetvl_e8m1(avl); - vuint8m1x4_t v_src_argb = __riscv_vlseg4e8_v_u8m1x4(src_argb, vl); - vuint8m1_t v_b = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 0); - vuint8m1_t v_g = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 1); - vuint8m1_t v_r = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 2); - vuint8m1_t v_a = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 3); - v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); - v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); - v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); - v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); - v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); - v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); - v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); - v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); - v_dst_ab64 = __riscv_vcreate_v_u16m2x4(v_r_16, v_g_16, v_b_16, v_a_16); - __riscv_vsseg4e16_v_u16m2x4(dst_ab64, v_dst_ab64, vl); - avl -= vl; - src_argb += 4 * vl; - dst_ab64 += 4 * vl; - } while (avl > 0); -} -#else -void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - size_t avl = (size_t)width; - do { - vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; - vuint8m1_t v_b, v_g, v_r, v_a; - size_t vl = __riscv_vsetvl_e8m1(avl); - __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); - v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); - v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); - v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); - v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); - v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); - v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); - v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); - __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl); - avl -= vl; - src_argb += 4 * vl; - dst_ab64 += 4 * vl; - } while (avl > 0); -} -#endif -#endif - -#ifdef HAS_AR64TOARGBROW_RVV -void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - size_t avl = (size_t)4 * width; - do { - vuint16m8_t v_ar64; - vuint8m4_t v_argb; - size_t vl = __riscv_vsetvl_e16m8(avl); - v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl); - v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl); - __riscv_vse8_v_u8m4(dst_argb, v_argb, vl); - avl -= vl; - src_ar64 += vl; - dst_argb += vl; - } while (avl > 0); -} -#endif - -#ifdef HAS_AR64TOAB64ROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void AR64ToAB64Row_RVV(const uint16_t* src_ar64, - uint16_t* dst_ab64, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e16m2(w); - vuint16m2x4_t v_argb16 = __riscv_vlseg4e16_v_u16m2x4(src_ar64, vl); - vuint16m2_t v_b = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 0); - vuint16m2_t v_g = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 1); - vuint16m2_t v_r = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 2); - vuint16m2_t v_a = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 3); - vuint16m2x4_t v_dst_abgr = __riscv_vcreate_v_u16m2x4(v_r, v_g, v_b, v_a); - __riscv_vsseg4e16_v_u16m2x4(dst_ab64, v_dst_abgr, vl); - w -= vl; - src_ar64 += vl * 4; - dst_ab64 += vl * 4; - } while (w > 0); -} -#else -void AR64ToAB64Row_RVV(const uint16_t* src_ar64, - uint16_t* dst_ab64, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e16m2(w); - vuint16m2_t v_b, v_g, v_r, v_a; - __riscv_vlseg4e16_v_u16m2(&v_b, &v_g, &v_r, 
&v_a, src_ar64, vl); - __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r, v_g, v_b, v_a, vl); - w -= vl; - src_ar64 += vl * 4; - dst_ab64 += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_AB64TOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - size_t avl = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e16m2(avl); - vuint16m2x4_t v_abgr16 = __riscv_vlseg4e16_v_u16m2x4(src_ab64, vl); - vuint16m2_t v_r_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 0); - vuint16m2_t v_g_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 1); - vuint16m2_t v_b_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 2); - vuint16m2_t v_a_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 3); - vuint8m1_t v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); - vuint8m1_t v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); - vuint8m1_t v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); - vuint8m1_t v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); - vuint8m1x4_t v_dst_argb = __riscv_vcreate_v_u8m1x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m1x4(dst_argb, v_dst_argb, vl); - avl -= vl; - src_ab64 += 4 * vl; - dst_argb += 4 * vl; - } while (avl > 0); -} -#else -void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - size_t avl = (size_t)width; - do { - vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; - vuint8m1_t v_b, v_g, v_r, v_a; - size_t vl = __riscv_vsetvl_e16m2(avl); - __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl); - v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); - v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); - v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); - v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); - __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); - avl -= vl; - src_ab64 += 4 * vl; - dst_argb += 4 * vl; - } while (avl > 0); -} -#endif -#endif - -#ifdef HAS_RAWTOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl); - vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1); - vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2); - vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_raw += vl * 3; - dst_argb += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#else -void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2_t v_b, v_g, v_r; - __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_raw += vl * 3; - dst_argb += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_RAWTORGBAROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl); - vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1); - vuint8m2_t v_b = 
__riscv_vget_v_u8m2x3_u8m2(v_bgr, 2); - vuint8m2x4_t v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r); - __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl); - w -= vl; - src_raw += vl * 3; - dst_rgba += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#else -void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2_t v_b, v_g, v_r; - __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); - __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); - w -= vl; - src_raw += vl * 3; - dst_rgba += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_RAWTORGB24ROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl); - vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1); - vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2); - vuint8m2x3_t v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); - __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); - w -= vl; - src_raw += vl * 3; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#else -void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - size_t w = (size_t)width; - do { - vuint8m2_t v_b, v_g, v_r; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); - __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl); - w -= vl; - src_raw += vl * 3; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_ARGBTORAWROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2x3_t v_dst_bgr = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b); - __riscv_vsseg3e8_v_u8m2x3(dst_raw, v_dst_bgr, vl); - w -= vl; - src_argb += vl * 4; - dst_raw += vl * 3; - } while (w > 0); -} -#else -void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - size_t w = (size_t)width; - do { - vuint8m2_t v_b, v_g, v_r, v_a; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); - w -= vl; - src_argb += vl * 4; - dst_raw += vl * 3; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_ARGBTORGB24ROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ARGBToRGB24Row_RVV(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2x3_t v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); - __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); - w -= vl; - src_argb += vl * 4; - dst_rgb24 += vl * 3; - } while (w > 
0); -} -#else -void ARGBToRGB24Row_RVV(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width) { - size_t w = (size_t)width; - do { - vuint8m2_t v_b, v_g, v_r, v_a; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); - w -= vl; - src_argb += vl * 4; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_ARGBTOABGRROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); - vuint8m2x4_t v_dst_abgr = __riscv_vcreate_v_u8m2x4(v_r, v_g, v_b, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_abgr, v_dst_abgr, vl); - w -= vl; - src_argb += vl * 4; - dst_abgr += vl * 4; - } while (w > 0); -} -#else -void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a, v_r, v_g, v_b; - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - __riscv_vsseg4e8_v_u8m2(dst_abgr, v_r, v_g, v_b, v_a, vl); - w -= vl; - src_argb += vl * 4; - dst_abgr += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_ARGBTOBGRAROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); - vuint8m2x4_t v_dst_bgra = __riscv_vcreate_v_u8m2x4(v_a, v_r, v_g, v_b); - __riscv_vsseg4e8_v_u8m2x4(dst_bgra, v_dst_bgra, vl); - w -= vl; - src_argb += vl * 4; - dst_bgra += vl * 4; - } while (w > 0); -} -#else -void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a, v_r, v_g, v_b; - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - __riscv_vsseg4e8_v_u8m2(dst_bgra, v_a, v_r, v_g, v_b, vl); - w -= vl; - src_argb += vl * 4; - dst_bgra += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_ARGBTORGBAROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); - vuint8m2x4_t v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r); - __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl); - w -= vl; - src_argb += vl * 4; - dst_rgba += vl * 4; - } while (w > 0); -} -#else -void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, 
int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a, v_r, v_g, v_b; - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); - w -= vl; - src_argb += vl * 4; - dst_rgba += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_RGBATOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3); - vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_rgba += vl * 4; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a, v_r, v_g, v_b; - __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_rgba += vl * 4; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_RGB24TOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2x3_t v_src_rgb = __riscv_vlseg3e8_v_u8m2x3(src_rgb24, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 2); - vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_rgb24 += vl * 3; - dst_argb += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#else -void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2_t v_b, v_g, v_r; - __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_rgb24 += vl * 3; - dst_argb += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_I444TOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void I444ToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - vuint8m2x4_t v_dst_argb; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, 
v_r); - v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_y += vl; - src_u += vl; - src_v += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void I444ToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_y += vl; - src_u += vl; - src_v += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_I444ALPHATOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void I444AlphaToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t vl; - size_t w = (size_t)width; - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - vuint8m2x4_t v_dst_argb; - READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - v_a = __riscv_vle8_v_u8m2(src_a, vl); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_y += vl; - src_a += vl; - src_u += vl; - src_v += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void I444AlphaToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t vl; - size_t w = (size_t)width; - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - v_a = __riscv_vle8_v_u8m2(src_a, vl); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_y += vl; - src_a += vl; - src_u += vl; - src_v += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_I444TORGB24ROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void I444ToRGB24Row_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - size_t vl; - size_t w = (size_t)width; - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, 
vg, yg, bb, bg, br); - do { - vuint8m2x3_t v_dst_rgb; - READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); - __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); - w -= vl; - src_y += vl; - src_u += vl; - src_v += vl; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#else -void I444ToRGB24Row_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - size_t vl; - size_t w = (size_t)width; - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); - w -= vl; - src_y += vl; - src_u += vl; - src_v += vl; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_I422TOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void I422ToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - vuint8m2x4_t v_dst_argb; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_y += vl; - src_u += vl / 2; - src_v += vl / 2; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void I422ToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_y += vl; - src_u += vl / 2; - src_v += vl / 2; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_I422ALPHATOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void I422AlphaToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t vl; - size_t w = (size_t)width; - uint8_t 
ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - vuint8m2x4_t v_dst_argb; - READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - v_a = __riscv_vle8_v_u8m2(src_a, vl); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_y += vl; - src_a += vl; - src_u += vl / 2; - src_v += vl / 2; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void I422AlphaToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t vl; - size_t w = (size_t)width; - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - v_a = __riscv_vle8_v_u8m2(src_a, vl); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_y += vl; - src_a += vl; - src_u += vl / 2; - src_v += vl / 2; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_I422TORGBAROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void I422ToRGBARow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - vuint8m2x4_t v_dst_rgba; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r); - __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl); - w -= vl; - src_y += vl; - src_u += vl / 2; - src_v += vl / 2; - dst_rgba += vl * 4; - } while (w > 0); -} -#else -void I422ToRGBARow_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); - w -= vl; - src_y += vl; - src_u += vl / 2; - src_v += vl / 2; - dst_rgba += vl * 4; - } 
while (w > 0); -} -#endif -#endif - -#ifdef HAS_I422TORGB24ROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void I422ToRGB24Row_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - size_t vl; - size_t w = (size_t)width; - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - vuint8m2x3_t v_dst_rgb; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); - __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); - w -= vl; - src_y += vl; - src_u += vl / 2; - src_v += vl / 2; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#else -void I422ToRGB24Row_RVV(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - size_t vl; - size_t w = (size_t)width; - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); - w -= vl; - src_y += vl; - src_u += vl / 2; - src_v += vl / 2; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_I400TOARGBROW_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void I400ToARGBRow_RVV(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); - vuint8m2x4_t v_dst_argb; - vuint16m4_t v_yb; - if (is_yb_positive) { - v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); - } else { - v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl); - } - do { - vuint8m2_t v_y, v_out; - vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2; - vl = __riscv_vsetvl_e8m2(w); - v_y = __riscv_vle8_v_u8m2(src_y, vl); - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); - v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y - v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl); - if (is_yb_positive) { - v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl); - } else { - v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl); - } - v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, __RISCV_VXRM_RNU, vl); - v_dst_argb = __riscv_vcreate_v_u8m2x4(v_out, v_out, v_out, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_y += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void I400ToARGBRow_RVV(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0); - 
vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - vuint16m4_t v_yb; - vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); - // To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) sets to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); - if (is_yb_positive) { - v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); - } else { - v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl); - } - do { - vuint8m2_t v_y, v_out; - vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2; - vl = __riscv_vsetvl_e8m2(w); - v_y = __riscv_vle8_v_u8m2(src_y, vl); - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); - v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y - v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl); - if (is_yb_positive) { - v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl); - } else { - v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl); - } - v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl); - w -= vl; - src_y += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_J400TOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2_t v_y = __riscv_vle8_v_u8m2(src_y, vl); - vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_y, v_y, v_y, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_y += vl; - dst_argb += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#else -void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2_t v_y; - v_y = __riscv_vle8_v_u8m2(src_y, vl); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl); - w -= vl; - src_y += vl; - dst_argb += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_COPYROW_RVV -void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m8(w); - vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl); - __riscv_vse8_v_u8m8(dst, v_data, vl); - w -= vl; - src += vl; - dst += vl; - } while (w > 0); -} -#endif - -#ifdef HAS_NV12TOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void NV12ToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - vuint8m2x4_t v_dst_argb; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_y += vl; - src_uv += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void NV12ToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* 
dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_y += vl; - src_uv += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_NV12TORGB24ROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void NV12ToRGB24Row_RVV(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r; - vuint8m2x3_t v_dst_rgb; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); - __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); - w -= vl; - src_y += vl; - src_uv += vl; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#else -void NV12ToRGB24Row_RVV(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); - w -= vl; - src_y += vl; - src_uv += vl; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_NV21TOARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void NV21ToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint8m2x4_t v_dst_argb; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_y += vl; - src_vu += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void NV21ToARGBRow_RVV(const uint8_t* src_y, - const uint8_t* src_vu, - 
uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_y += vl; - src_vu += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_NV21TORGB24ROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void NV21ToRGB24Row_RVV(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r; - vuint8m2x3_t v_dst_rgb; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); - __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); - w -= vl; - src_y += vl; - src_vu += vl; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#else -void NV21ToRGB24Row_RVV(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - uint8_t ub, vr, ug, vg; - int16_t yg, bb, bg, br; - vuint8m2_t v_u, v_v; - vuint8m2_t v_b, v_g, v_r; - vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); - do { - READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, - v_b_16, v_r_16); - RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); - w -= vl; - src_y += vl; - src_vu += vl; - dst_rgb24 += vl * 3; - } while (w > 0); -} -#endif -#endif - -// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 -#ifdef HAS_INTERPOLATEROW_RVV -#ifdef LIBYUV_RVV_HAS_VXRM_ARG -void InterpolateRow_RVV(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t* src_ptr1 = src_ptr + src_stride; - size_t dst_w = (size_t)dst_width; - assert(source_y_fraction >= 0); - assert(source_y_fraction < 256); - // Blend 100 / 0 - Copy row unchanged. - if (y1_fraction == 0) { - do { - size_t vl = __riscv_vsetvl_e8m8(dst_w); - __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl); - dst_w -= vl; - src_ptr += vl; - dst_ptr += vl; - } while (dst_w > 0); - return; - } - // Blend 50 / 50. 
- if (y1_fraction == 128) { - do { - size_t vl = __riscv_vsetvl_e8m8(dst_w); - vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); - vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); - vuint8m8_t row_out = - __riscv_vaaddu_vv_u8m8(row0, row1, __RISCV_VXRM_RNU, vl); - __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); - dst_w -= vl; - src_ptr += vl; - src_ptr1 += vl; - dst_ptr += vl; - } while (dst_w > 0); - return; - } - // General purpose row blend. - do { - size_t vl = __riscv_vsetvl_e8m4(dst_w); - vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); - vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); - vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); - acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); - __riscv_vse8_v_u8m4( - dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, __RISCV_VXRM_RNU, vl), vl); - dst_w -= vl; - src_ptr += vl; - src_ptr1 += vl; - dst_ptr += vl; - } while (dst_w > 0); -} -#else -void InterpolateRow_RVV(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t* src_ptr1 = src_ptr + src_stride; - size_t dst_w = (size_t)dst_width; - assert(source_y_fraction >= 0); - assert(source_y_fraction < 256); - // Blend 100 / 0 - Copy row unchanged. - if (y1_fraction == 0) { - do { - size_t vl = __riscv_vsetvl_e8m8(dst_w); - __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl); - dst_w -= vl; - src_ptr += vl; - dst_ptr += vl; - } while (dst_w > 0); - return; - } - // To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up(0). - asm volatile ("csrwi vxrm, 0"); - // Blend 50 / 50. - if (y1_fraction == 128) { - do { - size_t vl = __riscv_vsetvl_e8m8(dst_w); - vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); - vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); - // Use round-to-nearest-up mode for averaging add - vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl); - __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); - dst_w -= vl; - src_ptr += vl; - src_ptr1 += vl; - dst_ptr += vl; - } while (dst_w > 0); - return; - } - // General purpose row blend. 
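// Descriptive note on the general blend below: per byte it computes
//   dst[i] = (src_ptr[i] * y0_fraction + src_ptr1[i] * y1_fraction + 128) >> 8
// with y0_fraction == 256 - y1_fraction, i.e. a fixed-point weighted average
// of the two rows rounded to nearest (the vnclipu narrowing shift picks up
// the round-to-nearest-up vxrm mode configured above).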
- do { - size_t vl = __riscv_vsetvl_e8m4(dst_w); - vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); - vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); - vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); - acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); - // Use round-to-nearest-up mode for vnclip - __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl); - dst_w -= vl; - src_ptr += vl; - src_ptr1 += vl; - dst_ptr += vl; - } while (dst_w > 0); -} -#endif -#endif - -#ifdef HAS_SPLITRGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void SplitRGBRow_RVV(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x3_t v_src = __riscv_vlseg3e8_v_u8m2x3(src_rgb, vl); - vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src, 1); - vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src, 2); - __riscv_vse8_v_u8m2(dst_r, v_r, vl); - __riscv_vse8_v_u8m2(dst_g, v_g, vl); - __riscv_vse8_v_u8m2(dst_b, v_b, vl); - w -= vl; - dst_r += vl; - dst_g += vl; - dst_b += vl; - src_rgb += vl * 3; - } while (w > 0); -} -#else -void SplitRGBRow_RVV(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - size_t w = (size_t)width; - do { - vuint8m2_t v_b, v_g, v_r; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl); - __riscv_vse8_v_u8m2(dst_r, v_r, vl); - __riscv_vse8_v_u8m2(dst_g, v_g, vl); - __riscv_vse8_v_u8m2(dst_b, v_b, vl); - w -= vl; - dst_r += vl; - dst_g += vl; - dst_b += vl; - src_rgb += vl * 3; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_MERGERGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void MergeRGBRow_RVV(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); - vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); - vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); - vuint8m2x3_t v_dst = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b); - __riscv_vsseg3e8_v_u8m2x3(dst_rgb, v_dst, vl); - w -= vl; - src_r += vl; - src_g += vl; - src_b += vl; - dst_rgb += vl * 3; - } while (w > 0); -} -#else -void MergeRGBRow_RVV(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); - vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); - vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); - __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl); - w -= vl; - src_r += vl; - src_g += vl; - src_b += vl; - dst_rgb += vl * 3; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SPLITARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void SplitARGBRow_RVV(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src, 2); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src, 3); - __riscv_vse8_v_u8m2(dst_a, v_a, vl); - __riscv_vse8_v_u8m2(dst_r, v_r, vl); - 
__riscv_vse8_v_u8m2(dst_g, v_g, vl); - __riscv_vse8_v_u8m2(dst_b, v_b, vl); - w -= vl; - dst_a += vl; - dst_r += vl; - dst_g += vl; - dst_b += vl; - src_argb += vl * 4; - } while (w > 0); -} -#else -void SplitARGBRow_RVV(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - size_t w = (size_t)width; - do { - vuint8m2_t v_b, v_g, v_r, v_a; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - __riscv_vse8_v_u8m2(dst_a, v_a, vl); - __riscv_vse8_v_u8m2(dst_r, v_r, vl); - __riscv_vse8_v_u8m2(dst_g, v_g, vl); - __riscv_vse8_v_u8m2(dst_b, v_b, vl); - w -= vl; - dst_a += vl; - dst_r += vl; - dst_g += vl; - dst_b += vl; - src_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_MERGEARGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void MergeARGBRow_RVV(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); - vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); - vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); - vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); - vuint8m2x4_t v_dst = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst, vl); - w -= vl; - src_r += vl; - src_g += vl; - src_b += vl; - src_a += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void MergeARGBRow_RVV(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); - vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); - vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); - vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_r += vl; - src_g += vl; - src_b += vl; - src_a += vl; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SPLITXRGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void SplitXRGBRow_RVV(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src, 2); - __riscv_vse8_v_u8m2(dst_r, v_r, vl); - __riscv_vse8_v_u8m2(dst_g, v_g, vl); - __riscv_vse8_v_u8m2(dst_b, v_b, vl); - w -= vl; - dst_r += vl; - dst_g += vl; - dst_b += vl; - src_argb += vl * 4; - } while (w > 0); -} -#else -void SplitXRGBRow_RVV(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - size_t w = (size_t)width; - do { - vuint8m2_t v_b, v_g, v_r, v_a; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - __riscv_vse8_v_u8m2(dst_r, v_r, vl); - __riscv_vse8_v_u8m2(dst_g, v_g, vl); - __riscv_vse8_v_u8m2(dst_b, v_b, vl); - w -= vl; - dst_r += vl; - dst_g += vl; - dst_b += vl; - src_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_MERGEXRGBROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void MergeXRGBRow_RVV(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* 
dst_argb, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); - vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); - vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); - vuint8m2x4_t v_dst = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst, vl); - w -= vl; - src_r += vl; - src_g += vl; - src_b += vl; - dst_argb += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#else -void MergeXRGBRow_RVV(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - do { - vuint8m2_t v_r, v_g, v_b; - v_r = __riscv_vle8_v_u8m2(src_r, vl); - v_g = __riscv_vle8_v_u8m2(src_g, vl); - v_b = __riscv_vle8_v_u8m2(src_b, vl); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_r += vl; - src_g += vl; - src_b += vl; - dst_argb += vl * 4; - vl = __riscv_vsetvl_e8m2(w); - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SPLITUVROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void SplitUVRow_RVV(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m4(w); - vuint8m4x2_t v_src = __riscv_vlseg2e8_v_u8m4x2(src_uv, vl); - vuint8m4_t v_u = __riscv_vget_v_u8m4x2_u8m4(v_src, 0); - vuint8m4_t v_v = __riscv_vget_v_u8m4x2_u8m4(v_src, 1); - __riscv_vse8_v_u8m4(dst_u, v_u, vl); - __riscv_vse8_v_u8m4(dst_v, v_v, vl); - w -= vl; - dst_u += vl; - dst_v += vl; - src_uv += 2 * vl; - } while (w > 0); -} -#else -void SplitUVRow_RVV(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m4(w); - vuint8m4_t v_u, v_v; - __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl); - __riscv_vse8_v_u8m4(dst_u, v_u, vl); - __riscv_vse8_v_u8m4(dst_v, v_v, vl); - w -= vl; - dst_u += vl; - dst_v += vl; - src_uv += 2 * vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_MERGEUVROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void MergeUVRow_RVV(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m4(w); - vuint8m4_t v_u = __riscv_vle8_v_u8m4(src_u, vl); - vuint8m4_t v_v = __riscv_vle8_v_u8m4(src_v, vl); - vuint8m4x2_t v_dst = __riscv_vcreate_v_u8m4x2(v_u, v_v); - __riscv_vsseg2e8_v_u8m4x2(dst_uv, v_dst, vl); - w -= vl; - src_u += vl; - src_v += vl; - dst_uv += 2 * vl; - } while (w > 0); -} -#else -void MergeUVRow_RVV(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - size_t w = (size_t)width; - do { - vuint8m4_t v_u, v_v; - size_t vl = __riscv_vsetvl_e8m4(w); - v_u = __riscv_vle8_v_u8m4(src_u, vl); - v_v = __riscv_vle8_v_u8m4(src_v, vl); - __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl); - w -= vl; - src_u += vl; - src_v += vl; - dst_uv += 2 * vl; - } while (w > 0); -} -#endif -#endif - -struct RgbConstants { - uint8_t kRGBToY[4]; - uint16_t kAddY; - uint16_t pad; -}; - -// RGB to JPeg coefficients -// B * 0.1140 coefficient = 29 -// G * 0.5870 coefficient = 150 -// R * 0.2990 coefficient = 77 -// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; - -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; - -// RGB to BT.601 
coefficients -// B * 0.1016 coefficient = 25 -// G * 0.5078 coefficient = 129 -// R * 0.2578 coefficient = 66 -// Add 16.5 = 0x1080 - -static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; - -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; - -// ARGB expects first 3 values to contain RGB and 4th value is ignored -#ifdef HAS_ARGBTOYMATRIXROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -static void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - assert(width != 0); - size_t w = (size_t)width; - vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant - vuint16m4_t v_addy; // vector is to store kAddY - size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); - do { - vuint8m2_t v_y; - vuint16m4_t v_y_u16; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); - v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); - v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); - __riscv_vse8_v_u8m2(dst_y, v_y, vl); - w -= vl; - src_argb += 4 * vl; - dst_y += vl; - } while (w > 0); -} -#else -static void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - assert(width != 0); - size_t w = (size_t)width; - vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant - vuint16m4_t v_addy; // vector is to store kAddY - size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); - do { - vuint8m2_t v_b, v_g, v_r, v_a, v_y; - vuint16m4_t v_y_u16; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); - v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); - v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); - __riscv_vse8_v_u8m2(dst_y, v_y, vl); - w -= vl; - src_argb += 4 * vl; - dst_y += vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_ARGBTOYROW_RVV -void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants); -} -#endif - -#ifdef HAS_ARGBTOYJROW_RVV -void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants); -} -#endif - -#ifdef HAS_ABGRTOYROW_RVV -void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants); -} -#endif - -#ifdef HAS_ABGRTOYJROW_RVV -void ABGRToYJRow_RVV(const 
uint8_t* src_abgr, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants); -} -#endif - -// RGBA expects first value to be A and ignored, then 3 values to contain RGB. -#ifdef HAS_RGBATOYMATRIXROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -static void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - assert(width != 0); - size_t w = (size_t)width; - vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant - vuint16m4_t v_addy; // vector is to store kAddY - size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); - do { - vuint8m2_t v_y; - vuint16m4_t v_y_u16; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3); - v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); - v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); - v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); - __riscv_vse8_v_u8m2(dst_y, v_y, vl); - w -= vl; - src_rgba += 4 * vl; - dst_y += vl; - } while (w > 0); -} -#else -static void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - assert(width != 0); - size_t w = (size_t)width; - vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant - vuint16m4_t v_addy; // vector is to store kAddY - size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); - do { - vuint8m2_t v_b, v_g, v_r, v_a, v_y; - vuint16m4_t v_y_u16; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); - v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); - v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); - v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); - __riscv_vse8_v_u8m2(dst_y, v_y, vl); - w -= vl; - src_rgba += 4 * vl; - dst_y += vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_RGBATOYROW_RVV -void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants); -} -#endif - -#ifdef HAS_RGBATOYJROW_RVV -void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { - RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants); -} -#endif - -#ifdef HAS_BGRATOYROW_RVV -void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants); -} -#endif - -#ifdef HAS_RGBTOYMATRIXROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - assert(width != 0); - size_t w = 
(size_t)width; - vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant - vuint16m4_t v_addy; // vector is to store kAddY - size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); - do { - vuint8m2_t v_y; - vuint16m4_t v_y_u16; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x3_t v_src_rgb = __riscv_vlseg3e8_v_u8m2x3(src_rgb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 2); - v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); - v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); - v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); - __riscv_vse8_v_u8m2(dst_y, v_y, vl); - w -= vl; - src_rgb += 3 * vl; - dst_y += vl; - } while (w > 0); -} -#else -static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - assert(width != 0); - size_t w = (size_t)width; - vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant - vuint16m4_t v_addy; // vector is to store kAddY - size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); - do { - vuint8m2_t v_b, v_g, v_r, v_y; - vuint16m4_t v_y_u16; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl); - v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); - v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); - v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); - v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); - __riscv_vse8_v_u8m2(dst_y, v_y, vl); - w -= vl; - src_rgb += 3 * vl; - dst_y += vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_RGB24TOYJROW_RVV -void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); -} -#endif - -#ifdef HAS_RAWTOYJROW_RVV -void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); -} -#endif - -#ifdef HAS_RGB24TOYROW_RVV -void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); -} -#endif - -#ifdef HAS_RAWTOYROW_RVV -void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { - RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); -} -#endif - -// Blend src_argb over src_argb1 and store to dst_argb. -// dst_argb may be src_argb or src_argb1. -// src_argb: RGB values have already been pre-multiplied by the a. 
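// A minimal scalar sketch of the premultiplied-alpha blend described above,
// assuming 8-bit channels; blend_over_channel() is a hypothetical name used
// here only to illustrate clamp255((((256 - a) * b) >> 8) + f), the
// per-channel formula the vector kernels below implement (with the
// destination alpha forced to 255).
static inline uint8_t blend_over_channel(uint8_t f, uint8_t b, uint8_t a) {
  // f (foreground) is already premultiplied by a, so the background keeps
  // roughly (256 - a) / 256 of its value.
  unsigned out = (unsigned)f + (((256u - (unsigned)a) * b) >> 8);
  return out > 255u ? 255u : (uint8_t)out;  // clamp255
}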
-#ifdef HAS_ARGBBLENDROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ARGBBlendRow_RVV(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvlmax_e8m2(); - // clamp255((((256 - a) * b) >> 8) + f) - // = b * (256 - a) / 256 + f - // = b - (b * a / 256) + f - vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl); - do { - vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r; - vuint8m2_t v_dst_b, v_dst_g, v_dst_r; - vuint8m2x4_t v_dst_argb; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src0_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_src0_b = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 0); - vuint8m2_t v_src0_g = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 1); - vuint8m2_t v_src0_r = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 2); - vuint8m2_t v_src0_a = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 3); - vuint8m2x4_t v_src1_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb1, vl); - vuint8m2_t v_src1_b = __riscv_vget_v_u8m2x4_u8m2(v_src1_argb, 0); - vuint8m2_t v_src1_g = __riscv_vget_v_u8m2x4_u8m2(v_src1_argb, 1); - vuint8m2_t v_src1_r = __riscv_vget_v_u8m2x4_u8m2(v_src1_argb, 2); - - v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl); - v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl); - v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl); - - v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl); - v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl); - v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl); - - v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl); - v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl); - v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl); - - v_dst_argb = __riscv_vcreate_v_u8m2x4(v_dst_b, v_dst_g, v_dst_r, v_255); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - - w -= vl; - src_argb += 4 * vl; - src_argb1 += 4 * vl; - dst_argb += 4 * vl; - } while (w > 0); -} -#else -void ARGBBlendRow_RVV(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - size_t w = (size_t)width; - size_t vl = __riscv_vsetvlmax_e8m2(); - // clamp255((((256 - a) * b) >> 8) + f) - // = b * (256 - a) / 256 + f - // = b - (b * a / 256) + f - vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl); - do { - vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a; - vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a; - vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r; - vuint8m2_t v_dst_b, v_dst_g, v_dst_r; - vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a, - src_argb, vl); - __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a, - src_argb1, vl); - - v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl); - v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl); - v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl); - - v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl); - v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl); - v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl); - - v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl); - v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl); - v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl); - - w -= vl; - src_argb += 4 * vl; - src_argb1 += 4 * vl; - dst_argb += 4 * vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_BLENDPLANEROW_RVV -void BlendPlaneRow_RVV(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) 
{ - size_t w = (size_t)width; - do { - vuint16m8_t v_dst_u16; - vuint8m4_t v_dst; - size_t vl = __riscv_vsetvl_e8m4(w); - vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl); - vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl); - vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl); - vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl); - - // (a * foreground) + (1-a) * background - v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl); - v_dst_u16 = - __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl); - v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl); - v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl); - - __riscv_vse8_v_u8m4(dst, v_dst, vl); - w -= vl; - src0 += vl; - src1 += vl; - alpha += vl; - dst += vl; - } while (w > 0); -} -#endif - -// Attenuate: (f * a + 255) >> 8 -#ifdef HAS_ARGBATTENUATEROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ARGBAttenuateRow_RVV(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - size_t w = (size_t)width; - do { - vuint16m4_t v_ba_16, v_ga_16, v_ra_16; - vuint8m2x4_t v_dst_argb; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); - vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); - vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); - // f * a - v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); - v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); - v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); - // f * a + 255 - v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl); - v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl); - v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl); - // (f * a + 255) >> 8 - v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl); - v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl); - v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl); - - v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); - __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); - w -= vl; - src_argb += vl * 4; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void ARGBAttenuateRow_RVV(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - size_t w = (size_t)width; - do { - vuint8m2_t v_b, v_g, v_r, v_a; - vuint16m4_t v_ba_16, v_ga_16, v_ra_16; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - // f * a - v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); - v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); - v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); - // f * a + 255 - v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl); - v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl); - v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl); - // (f * a + 255) >> 8 - v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl); - v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl); - v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl); - __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - w -= vl; - src_argb += vl * 4; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_ARGBEXTRACTALPHAROW_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); - __riscv_vse8_v_u8m2(dst_a, v_a, vl); - w -= vl; - src_argb += vl * 4; - 
dst_a += vl; - } while (w > 0); -} -#else -void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - size_t w = (size_t)width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_b, v_g, v_r, v_a; - __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); - __riscv_vse8_v_u8m2(dst_a, v_a, vl); - w -= vl; - src_argb += vl * 4; - dst_a += vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV -void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) { - size_t w = (size_t)width; - const ptrdiff_t dst_stride = 4; - dst += 3; - do { - size_t vl = __riscv_vsetvl_e8m8(w); - vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl); - __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl); - w -= vl; - src += vl; - dst += vl * dst_stride; - } while (w > 0); -} -#endif - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && - // defined(__clang__) diff --git a/drivers/media/pci/tbscapture2/row_sve.c b/drivers/media/pci/tbscapture2/row_sve.c deleted file mode 100644 index 0290949952df..000000000000 --- a/drivers/media/pci/tbscapture2/row_sve.c +++ /dev/null @@ -1,1409 +0,0 @@ -/* - * Copyright 2024 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) - -#define READYUV444_SVE \ - "ld1b {z0.h}, p1/z, [%[src_y]] \n" \ - "ld1b {z1.h}, p1/z, [%[src_u]] \n" \ - "ld1b {z2.h}, p1/z, [%[src_v]] \n" \ - "add %[src_y], %[src_y], %[vl] \n" \ - "add %[src_u], %[src_u], %[vl] \n" \ - "add %[src_v], %[src_v], %[vl] \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "prfm pldl1keep, [%[src_u], 448] \n" \ - "trn1 z0.b, z0.b, z0.b \n" \ - "prfm pldl1keep, [%[src_v], 448] \n" - -#define READYUV422_SVE \ - "ld1b {z0.h}, p1/z, [%[src_y]] \n" \ - "ld1b {z1.s}, p1/z, [%[src_u]] \n" \ - "ld1b {z2.s}, p1/z, [%[src_v]] \n" \ - "inch %[src_y] \n" \ - "incw %[src_u] \n" \ - "incw %[src_v] \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "prfm pldl1keep, [%[src_u], 128] \n" \ - "prfm pldl1keep, [%[src_v], 128] \n" \ - "trn1 z0.b, z0.b, z0.b \n" \ - "trn1 z1.h, z1.h, z1.h \n" \ - "trn1 z2.h, z2.h, z2.h \n" - -// Read twice as much data from YUV, putting the even elements from the Y data -// in z0.h and odd elements in z1.h. U/V data is not duplicated, stored in -// z2.h/z3.h. -#define READYUV422_SVE_2X \ - "ld1b {z0.b}, p1/z, [%[src_y]] \n" \ - "ld1b {z2.h}, p1/z, [%[src_u]] \n" \ - "ld1b {z3.h}, p1/z, [%[src_v]] \n" \ - "incb %[src_y] \n" \ - "inch %[src_u] \n" \ - "inch %[src_v] \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "prfm pldl1keep, [%[src_u], 128] \n" \ - "prfm pldl1keep, [%[src_v], 128] \n" \ - "trn2 z1.b, z0.b, z0.b \n" \ - "trn1 z0.b, z0.b, z0.b \n" - -#define READYUV400_SVE \ - "ld1b {z0.h}, p1/z, [%[src_y]] \n" \ - "inch %[src_y] \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "trn1 z0.b, z0.b, z0.b \n" - -// We need a different predicate for the UV component to handle the tail. -// If there is a single element remaining then we want to load one Y element -// but two UV elements. 
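In other words, the Y tail predicate covers the remaining pixel count, while the interleaved-UV tail predicate must be rounded up to the next even lane count so a lone trailing Y pixel still gets a complete U/V byte pair. A minimal scalar sketch of that rounding (hypothetical helper name, mirroring the width_last_uv computation used further below):

static inline int uv_tail_lanes(int width_last_y) {
  /* Round up to an even count so the last pixel keeps its full U/V pair. */
  return width_last_y + (width_last_y & 1);
}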
-#define READNV_SVE \ - "ld1b {z0.h}, p1/z, [%[src_y]] \n" /* Y0Y0 */ \ - "ld1b {z1.h}, p2/z, [%[src_uv]] \n" /* U0V0 or V0U0 */ \ - "inch %[src_y] \n" \ - "inch %[src_uv] \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "prfm pldl1keep, [%[src_uv], 256] \n" \ - "trn1 z0.b, z0.b, z0.b \n" /* YYYY */ \ - "tbl z1.b, {z1.b}, z22.b \n" /* UVUV */ - -#define READYUY2_SVE \ - "ld1w {z0.s}, p2/z, [%[src_yuy2]] \n" /* YUYV */ \ - "incb %[src_yuy2] \n" \ - "prfm pldl1keep, [%[src_yuy2], 448] \n" \ - "tbl z1.b, {z0.b}, z22.b \n" /* UVUV */ \ - "trn1 z0.b, z0.b, z0.b \n" /* YYYY */ - -#define READUYVY_SVE \ - "ld1w {z0.s}, p2/z, [%[src_uyvy]] \n" /* UYVY */ \ - "incb %[src_uyvy] \n" \ - "prfm pldl1keep, [%[src_uyvy], 448] \n" \ - "tbl z1.b, {z0.b}, z22.b \n" /* UVUV */ \ - "trn2 z0.b, z0.b, z0.b \n" /* YYYY */ - -#define YUVTORGB_SVE_SETUP \ - "ld1rb {z28.b}, p0/z, [%[kUVCoeff], #0] \n" \ - "ld1rb {z29.b}, p0/z, [%[kUVCoeff], #1] \n" \ - "ld1rb {z30.b}, p0/z, [%[kUVCoeff], #2] \n" \ - "ld1rb {z31.b}, p0/z, [%[kUVCoeff], #3] \n" \ - "ld1rh {z24.h}, p0/z, [%[kRGBCoeffBias], #0] \n" \ - "ld1rh {z25.h}, p0/z, [%[kRGBCoeffBias], #2] \n" \ - "ld1rh {z26.h}, p0/z, [%[kRGBCoeffBias], #4] \n" \ - "ld1rh {z27.h}, p0/z, [%[kRGBCoeffBias], #6] \n" - -// Like I4XXTORGB_SVE but U/V components are stored in even/odd .b lanes of z1 -// rather than widened .h elements of z1/z2. -#define NVTORGB_SVE \ - "umulh z0.h, z24.h, z0.h \n" /* Y */ \ - "umullb z6.h, z30.b, z1.b \n" \ - "umullb z4.h, z28.b, z1.b \n" /* DB */ \ - "umullt z5.h, z29.b, z1.b \n" /* DR */ \ - "umlalt z6.h, z31.b, z1.b \n" /* DG */ \ - "add z17.h, z0.h, z26.h \n" /* G */ \ - "add z16.h, z0.h, z4.h \n" /* B */ \ - "add z18.h, z0.h, z5.h \n" /* R */ \ - "uqsub z17.h, z17.h, z6.h \n" /* G */ \ - "uqsub z16.h, z16.h, z25.h \n" /* B */ \ - "uqsub z18.h, z18.h, z27.h \n" /* R */ - -// Like NVTORGB_SVE but U/V components are stored in widened .h elements of -// z1/z2 rather than even/odd .b lanes of z1. -#define I4XXTORGB_SVE \ - "umulh z0.h, z24.h, z0.h \n" /* Y */ \ - "umullb z6.h, z30.b, z1.b \n" \ - "umullb z4.h, z28.b, z1.b \n" /* DB */ \ - "umullb z5.h, z29.b, z2.b \n" /* DR */ \ - "umlalb z6.h, z31.b, z2.b \n" /* DG */ \ - "add z17.h, z0.h, z26.h \n" /* G */ \ - "add z16.h, z0.h, z4.h \n" /* B */ \ - "add z18.h, z0.h, z5.h \n" /* R */ \ - "uqsub z17.h, z17.h, z6.h \n" /* G */ \ - "uqsub z16.h, z16.h, z25.h \n" /* B */ \ - "uqsub z18.h, z18.h, z27.h \n" /* R */ - -// The U/V component multiplies do not need to be duplicated in I422, we just -// need to combine them with Y0/Y1 correctly. 
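For reference, a rough scalar equivalent of the 4:2:2 handling, assuming generic BT.601 limited-range integer coefficients (298/409/100/208/516) rather than the kernels' own 2.14 fixed-point tables from yuvconstants; the point is that the chroma terms are computed once per U/V pair and reused for both Y samples, just as Y0/Y1 share DB/DG/DR in the macro that follows:

#include <stdint.h>

static uint8_t clamp255(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

/* Hypothetical scalar reference, not the kernel's exact fixed-point math. */
static void i422_to_argb_scalar(const uint8_t *y, const uint8_t *u,
                                const uint8_t *v, uint8_t *dst_argb,
                                int width) {
  for (int x = 0; x < width; x += 2) {
    int d = u[x / 2] - 128, e = v[x / 2] - 128;   /* shared chroma terms */
    int rb = 409 * e;
    int gb = -100 * d - 208 * e;
    int bb = 516 * d;
    for (int i = 0; i < 2 && x + i < width; i++) {
      int c = 298 * (y[x + i] - 16) + 128;        /* luma term plus rounding */
      uint8_t *p = dst_argb + 4 * (x + i);        /* B,G,R,A byte order */
      p[0] = clamp255((c + bb) >> 8);
      p[1] = clamp255((c + gb) >> 8);
      p[2] = clamp255((c + rb) >> 8);
      p[3] = 255;
    }
  }
}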
-#define I422TORGB_SVE_2X \ - "umulh z0.h, z24.h, z0.h \n" /* Y0 */ \ - "umulh z1.h, z24.h, z1.h \n" /* Y1 */ \ - "umullb z6.h, z30.b, z2.b \n" \ - "umullb z4.h, z28.b, z2.b \n" /* DB */ \ - "umullb z5.h, z29.b, z3.b \n" /* DR */ \ - "umlalb z6.h, z31.b, z3.b \n" /* DG */ \ - \ - "add z17.h, z0.h, z26.h \n" /* G0 */ \ - "add z21.h, z1.h, z26.h \n" /* G1 */ \ - "add z16.h, z0.h, z4.h \n" /* B0 */ \ - "add z20.h, z1.h, z4.h \n" /* B1 */ \ - "add z18.h, z0.h, z5.h \n" /* R0 */ \ - "add z22.h, z1.h, z5.h \n" /* R1 */ \ - "uqsub z17.h, z17.h, z6.h \n" /* G0 */ \ - "uqsub z21.h, z21.h, z6.h \n" /* G1 */ \ - "uqsub z16.h, z16.h, z25.h \n" /* B0 */ \ - "uqsub z20.h, z20.h, z25.h \n" /* B1 */ \ - "uqsub z18.h, z18.h, z27.h \n" /* R0 */ \ - "uqsub z22.h, z22.h, z27.h \n" /* R1 */ - -#define I400TORGB_SVE \ - "umulh z18.h, z24.h, z0.h \n" /* Y */ \ - "movprfx z16, z18 \n" \ - "usqadd z16.h, p0/m, z16.h, z4.h \n" /* B */ \ - "movprfx z17, z18 \n" \ - "usqadd z17.h, p0/m, z17.h, z6.h \n" /* G */ \ - "usqadd z18.h, p0/m, z18.h, z5.h \n" /* R */ - -// Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA -// pairs to allow us to use ST2 for storing rather than ST4. -#define RGBTOARGB8_SVE \ - /* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \ - "uqshrnb z16.b, z16.h, #6 \n" /* B0 */ \ - "uqshrnb z18.b, z18.h, #6 \n" /* R0 */ \ - "uqshrnt z16.b, z17.h, #6 \n" /* BG */ \ - "trn1 z17.b, z18.b, z19.b \n" /* RA */ - -#define RGBTOARGB8_SVE_2X \ - /* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \ - "uqshrnb z16.b, z16.h, #6 \n" /* B0 */ \ - "uqshrnb z17.b, z17.h, #6 \n" /* G0 */ \ - "uqshrnb z18.b, z18.h, #6 \n" /* R0 */ \ - "uqshrnt z16.b, z20.h, #6 \n" /* B1 */ \ - "uqshrnt z17.b, z21.h, #6 \n" /* G1 */ \ - "uqshrnt z18.b, z22.h, #6 \n" /* R1 */ - -// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as AB and GR -// pairs to allow us to use ST2 for storing rather than ST4. -#define RGBTORGBA8_SVE \ - /* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \ - "uqshrnt z19.b, z16.h, #6 \n" /* AB */ \ - "uqshrnb z20.b, z17.h, #6 \n" /* G0 */ \ - "uqshrnt z20.b, z18.h, #6 \n" /* GR */ - -#define YUVTORGB_SVE_REGS \ - "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", \ - "z20", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", \ - "z31", "p0", "p1", "p2", "p3" - -void I444ToARGBRow_SVE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t vl; - asm volatile ( - "cnth %[vl] \n" - "ptrue p0.b \n" YUVTORGB_SVE_SETUP - "dup z19.b, #255 \n" /* A */ - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - // Run bulk of computation with an all-true predicate to avoid predicate - // generation overhead. - "ptrue p1.h \n" - "1: \n" READYUV444_SVE - I4XXTORGB_SVE RGBTOARGB8_SVE - "subs %w[width], %w[width], %w[vl] \n" - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - // Calculate a predicate for the final iteration to deal with the tail. 
- "whilelt p1.h, wzr, %w[width] \n" READYUV444_SVE - I4XXTORGB_SVE RGBTOARGB8_SVE - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - - "99: \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_SVE_REGS); -} - -void I400ToARGBRow_SVE2(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t vl; - asm volatile ( - "cnth %[vl] \n" - "ptrue p0.b \n" - "dup z19.b, #255 \n" // A - YUVTORGB_SVE_SETUP - "cmp %w[width], %w[vl] \n" - "mov z1.h, #128 \n" // U/V - "umullb z6.h, z30.b, z1.b \n" - "umullb z4.h, z28.b, z1.b \n" // DB - "umullb z5.h, z29.b, z1.b \n" // DR - "mla z6.h, p0/m, z31.h, z1.h \n" // DG - "sub z4.h, z4.h, z25.h \n" - "sub z5.h, z5.h, z27.h \n" - "sub z6.h, z26.h, z6.h \n" - "b.le 2f \n" - - // Run bulk of computation with an all-true predicate to avoid predicate - // generation overhead. - "ptrue p1.h \n" - "sub %w[width], %w[width], %w[vl] \n" - "1: \n" // - READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE - "subs %w[width], %w[width], %w[vl] \n" - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" - "b.gt 1b \n" - "add %w[width], %w[width], %w[vl] \n" - - // Calculate a predicate for the final iteration to deal with the tail. - "2: \n" - "whilelt p1.h, wzr, %w[width] \n" // - READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - : [src_y] "+r"(src_y), // %[src_y] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_SVE_REGS); -} - -void I422ToARGBRow_SVE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t vl; - asm volatile( - "cntb %[vl] \n" - "ptrue p0.b \n" YUVTORGB_SVE_SETUP - "dup z19.b, #255 \n" /* A0 */ - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - // Run bulk of computation with an all-true predicate to avoid predicate - // generation overhead. - "ptrue p1.b \n" - "1: \n" READYUV422_SVE_2X - I422TORGB_SVE_2X RGBTOARGB8_SVE_2X - "subs %w[width], %w[width], %w[vl] \n" - "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n" - "incb %[dst_argb], all, mul #4 \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - // Calculate a predicate for the final iteration to deal with the tail. 
- "cnth %[vl] \n" - "whilelt p1.b, wzr, %w[width] \n" READYUV422_SVE_2X - I422TORGB_SVE_2X RGBTOARGB8_SVE_2X - "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n" - - "99: \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_SVE_REGS); -} - -void I422ToRGBARow_SVE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t vl; - asm volatile ( - "cnth %[vl] \n" - "ptrue p0.b \n" YUVTORGB_SVE_SETUP - "dup z19.b, #255 \n" // A - "subs %w[width], %w[width], %w[vl] \n" - "b.le 2f \n" - - // Run bulk of computation with an all-true predicate to avoid predicate - // generation overhead. - "ptrue p1.h \n" - "1: \n" // - READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE - "subs %w[width], %w[width], %w[vl] \n" - "st2h {z19.h, z20.h}, p1, [%[dst_argb]] \n" - "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" - "b.gt 1b \n" - - // Calculate a predicate for the final iteration to deal with the tail. - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - "whilelt p1.h, wzr, %w[width] \n" // - READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE - "st2h {z19.h, z20.h}, p1, [%[dst_argb]] \n" - - "99: \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_SVE_REGS); -} - -void I444AlphaToARGBRow_SVE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t vl; - asm volatile ( - "cnth %[vl] \n" - "ptrue p0.b \n" YUVTORGB_SVE_SETUP - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - // Run bulk of computation with an all-true predicate to avoid predicate - // generation overhead. - "ptrue p1.h \n" - "1: \n" READYUV444_SVE - "ld1b {z19.h}, p1/z, [%[src_a]] \n" - "add %[src_a], %[src_a], %[vl] \n" // A - I4XXTORGB_SVE RGBTOARGB8_SVE - "subs %w[width], %w[width], %w[vl] \n" - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - // Calculate a predicate for the final iteration to deal with the tail. 
- "whilelt p1.h, wzr, %w[width] \n" READYUV444_SVE - "ld1b {z19.h}, p1/z, [%[src_a]] \n" // A - I4XXTORGB_SVE RGBTOARGB8_SVE - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - - "99: \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_SVE_REGS); -} - -void I422AlphaToARGBRow_SVE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t vl; - asm volatile( - "cntb %[vl] \n" - "ptrue p0.b \n" YUVTORGB_SVE_SETUP - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - // Run bulk of computation with an all-true predicate to avoid predicate - // generation overhead. - "ptrue p1.b \n" - "1: \n" READYUV422_SVE_2X - "ld1b {z19.b}, p1/z, [%[src_a]] \n" - "add %[src_a], %[src_a], %[vl] \n" // A - I422TORGB_SVE_2X RGBTOARGB8_SVE_2X - "subs %w[width], %w[width], %w[vl] \n" - "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n" - "incb %[dst_argb], all, mul #4 \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - // Calculate a predicate for the final iteration to deal with the tail. - "cnth %[vl] \n" - "whilelt p1.b, wzr, %w[width] \n" READYUV422_SVE_2X - "ld1b {z19.b}, p1/z, [%[src_a]] \n" // A - I422TORGB_SVE_2X RGBTOARGB8_SVE_2X - "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n" - - "99: \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_SVE_REGS); -} - -static inline void NVToARGBRow_SVE2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width, - uint32_t nv_uv_start, - uint32_t nv_uv_step) { - uint64_t vl; - asm("cnth %0" : "=r"(vl)); - int width_last_y = width & (vl - 1); - int width_last_uv = width_last_y + (width_last_y & 1); - asm volatile( - "ptrue p0.b \n" // - YUVTORGB_SVE_SETUP - "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n" - "dup z19.b, #255 \n" // A - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - // Run bulk of computation with an all-true predicate to avoid predicate - // generation overhead. - "ptrue p1.h \n" - "ptrue p2.h \n" - "1: \n" // - READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE - "subs %w[width], %w[width], %w[vl] \n" - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - // Calculate a predicate for the final iteration to deal with the tail. 
- "3: \n" - "whilelt p1.h, wzr, %w[width_last_y] \n" - "whilelt p2.h, wzr, %w[width_last_uv] \n" // - READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - - "99: \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [vl] "r"(vl), // %[vl] - [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [nv_uv_start] "r"(nv_uv_start), // %[nv_uv_start] - [nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step] - [width_last_y] "r"(width_last_y), // %[width_last_y] - [width_last_uv] "r"(width_last_uv) // %[width_last_uv] - : "cc", "memory", YUVTORGB_SVE_REGS, "p2"); -} - -void NV12ToARGBRow_SVE2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint32_t nv_uv_start = 0x02000200U; - uint32_t nv_uv_step = 0x04040404U; - NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start, - nv_uv_step); -} - -void NV21ToARGBRow_SVE2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint32_t nv_uv_start = 0x00020002U; - uint32_t nv_uv_step = 0x04040404U; - NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start, - nv_uv_step); -} - -// Dot-product constants are stored as four-tuples with the two innermost -// elements flipped to account for the interleaving nature of the widening -// addition instructions. - -static const int16_t kARGBToUVCoefficients[] = { - // UB, -UR, -UG, 0, -VB, VR, -VG, 0 - 56, -19, -37, 0, -9, 56, -47, 0, -}; - -static const int16_t kRGBAToUVCoefficients[] = { - // 0, -UG, UB, -UR, 0, -VG, -VB, VR - 0, -37, 56, -19, 0, -47, -9, 56, -}; - -static const int16_t kBGRAToUVCoefficients[] = { - // 0, -UG, -UR, UB, 0, -VG, VR, -VB - 0, -37, -19, 56, 0, -47, 56, -9, -}; - -static const int16_t kABGRToUVCoefficients[] = { - // -UR, UB, -UG, 0, VR, -VB, -VG, 0 - -19, 56, -37, 0, 56, -9, -47, 0, -}; - -static const int16_t kARGBToUVJCoefficients[] = { - // UB, -UR, -UG, 0, -VB, VR, -VG, 0 - 63, -21, -42, 0, -10, 63, -53, 0, -}; - -static const int16_t kABGRToUVJCoefficients[] = { - // -UR, UB, -UG, 0, VR, -VB, -VG, 0 - -21, 63, -42, 0, 63, -10, -53, 0, -}; - -static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const int16_t* uvconstants) { - const uint8_t* src_argb_1 = src_argb + src_stride_argb; - uint64_t vl; - asm volatile ( - "ptrue p0.b \n" - "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n" - "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n" - "mov z26.b, #0x80 \n" - - "cntb %[vl] \n" - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - // Process 4x vectors from each input row per iteration. - // Cannot use predication here due to unrolling. - "1: \n" // e.g. 
- "ld1b {z0.b}, p0/z, [%[src0], #0, mul vl] \n" // bgrabgra - "ld1b {z4.b}, p0/z, [%[src1], #0, mul vl] \n" // bgrabgra - "ld1b {z1.b}, p0/z, [%[src0], #1, mul vl] \n" // bgrabgra - "ld1b {z5.b}, p0/z, [%[src1], #1, mul vl] \n" // bgrabgra - "ld1b {z2.b}, p0/z, [%[src0], #2, mul vl] \n" // bgrabgra - "ld1b {z6.b}, p0/z, [%[src1], #2, mul vl] \n" // bgrabgra - "ld1b {z3.b}, p0/z, [%[src0], #3, mul vl] \n" // bgrabgra - "ld1b {z7.b}, p0/z, [%[src1], #3, mul vl] \n" // bgrabgra - "incb %[src0], all, mul #4 \n" - "incb %[src1], all, mul #4 \n" - - "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr - "uaddlt z17.h, z0.b, z4.b \n" // gagagaga - "uaddlb z18.h, z1.b, z5.b \n" // brbrbrbr - "uaddlt z19.h, z1.b, z5.b \n" // gagagaga - "uaddlb z20.h, z2.b, z6.b \n" // brbrbrbr - "uaddlt z21.h, z2.b, z6.b \n" // gagagaga - "uaddlb z22.h, z3.b, z7.b \n" // brbrbrbr - "uaddlt z23.h, z3.b, z7.b \n" // gagagaga - - "trn1 z0.s, z16.s, z17.s \n" // brgabgra - "trn2 z1.s, z16.s, z17.s \n" // brgabgra - "trn1 z2.s, z18.s, z19.s \n" // brgabgra - "trn2 z3.s, z18.s, z19.s \n" // brgabgra - "trn1 z4.s, z20.s, z21.s \n" // brgabgra - "trn2 z5.s, z20.s, z21.s \n" // brgabgra - "trn1 z6.s, z22.s, z23.s \n" // brgabgra - "trn2 z7.s, z22.s, z23.s \n" // brgabgra - - "subs %w[width], %w[width], %w[vl] \n" // 4*VL per loop - - "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga - "urhadd z2.h, p0/m, z2.h, z3.h \n" // brgabrga - "urhadd z4.h, p0/m, z4.h, z5.h \n" // brgabrga - "urhadd z6.h, p0/m, z6.h, z7.h \n" // brgabrga - - "movi v16.8h, #0 \n" - "movi v17.8h, #0 \n" - "movi v18.8h, #0 \n" - "movi v19.8h, #0 \n" - - "movi v20.8h, #0 \n" - "movi v21.8h, #0 \n" - "movi v22.8h, #0 \n" - "movi v23.8h, #0 \n" - - "sdot z16.d, z0.h, z24.h \n" // UUxxxxxx - "sdot z17.d, z2.h, z24.h \n" // UUxxxxxx - "sdot z18.d, z4.h, z24.h \n" // UUxxxxxx - "sdot z19.d, z6.h, z24.h \n" // UUxxxxxx - - "sdot z20.d, z0.h, z25.h \n" // VVxxxxxx - "sdot z21.d, z2.h, z25.h \n" // VVxxxxxx - "sdot z22.d, z4.h, z25.h \n" // VVxxxxxx - "sdot z23.d, z6.h, z25.h \n" // VVxxxxxx - - "uzp1 z16.s, z16.s, z17.s \n" // UUxx - "uzp1 z18.s, z18.s, z19.s \n" // UUxx - "uzp1 z20.s, z20.s, z21.s \n" // VVxx - "uzp1 z22.s, z22.s, z23.s \n" // VVxx - - "uzp1 z16.h, z16.h, z18.h \n" // UU - "uzp1 z20.h, z20.h, z22.h \n" // VV - - "addhnb z16.b, z16.h, z26.h \n" // U - "addhnb z20.b, z20.h, z26.h \n" // V - - "st1b {z16.h}, p0, [%[dst_u]] \n" // U - "st1b {z20.h}, p0, [%[dst_v]] \n" // V - "inch %[dst_u] \n" - "inch %[dst_v] \n" - - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" // VL per loop - "b.le 99f \n" - - // Process remaining pixels from each input row. - // Use predication to do one vector from each input array, so may loop up - // to three iterations. 
- "cntw %x[vl] \n" - - "3: \n" - "whilelt p1.s, wzr, %w[width] \n" - "ld1d {z0.d}, p1/z, [%[src0]] \n" // bgrabgra - "ld1d {z4.d}, p1/z, [%[src1]] \n" // bgrabgra - "incb %[src0] \n" - "incb %[src1] \n" - - "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr - "uaddlt z17.h, z0.b, z4.b \n" // gagagaga - - "trn1 z0.s, z16.s, z17.s \n" // brgabgra - "trn2 z1.s, z16.s, z17.s \n" // brgabgra - - "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga - - "subs %w[width], %w[width], %w[vl] \n" // VL per loop - - "movi v16.8h, #0 \n" - "movi v20.8h, #0 \n" - - "sdot z16.d, z0.h, z24.h \n" - "sdot z20.d, z0.h, z25.h \n" - - "addhnb z16.b, z16.h, z26.h \n" // U - "addhnb z20.b, z20.h, z26.h \n" // V - - "st1b {z16.d}, p0, [%[dst_u]] \n" // U - "st1b {z20.d}, p0, [%[dst_v]] \n" // V - "incd %[dst_u] \n" - "incd %[dst_v] \n" - "b.gt 3b \n" - - "99: \n" - : [src0] "+r"(src_argb), // %[src0] - [src1] "+r"(src_argb_1), // %[src1] - [dst_u] "+r"(dst_u), // %[dst_u] - [dst_v] "+r"(dst_v), // %[dst_v] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [uvconstants] "r"(uvconstants) - : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", - "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", - "p0"); -} - -void ARGBToUVRow_SVE2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width, - kARGBToUVCoefficients); -} - -void ARGBToUVJRow_SVE2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width, - kARGBToUVJCoefficients); -} - -void ABGRToUVJRow_SVE2(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_uj, - uint8_t* dst_vj, - int width) { - ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_uj, dst_vj, width, - kABGRToUVJCoefficients); -} - -void BGRAToUVRow_SVE2(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SVE2(src_bgra, src_stride_bgra, dst_u, dst_v, width, - kBGRAToUVCoefficients); -} - -void ABGRToUVRow_SVE2(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_u, dst_v, width, - kABGRToUVCoefficients); -} - -void RGBAToUVRow_SVE2(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SVE2(src_rgba, src_stride_rgba, dst_u, dst_v, width, - kRGBAToUVCoefficients); -} - -#define ARGBTORGB565_SVE \ - /* Inputs: \ - * z0: rrrrrxxxbbbbbxxx \ - * z1: xxxxxxxxggggggxx \ - * z3: 0000000000000011 (3, 0, 3, 0, ...) 
\ - * z4: 0000011111100000 \ - */ \ - "lsr z0.b, p0/m, z0.b, z3.b \n" \ - "lsl z1.h, z1.h, #3 \n" \ - "bsl z1.d, z1.d, z0.d, z4.d \n" - -void ARGBToRGB565Row_SVE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - unsigned bsl_mask = 0x7e0; - uint64_t vl; - width *= 2; - asm volatile ( - "mov z3.h, #3 \n" - "dup z4.h, %w[bsl_mask] \n" - - "cntb %[vl] \n" - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - "ptrue p0.b \n" - "1: \n" - "ld2b {z0.b, z1.b}, p0/z, [%[src]] \n" // BR, GA - "incb %[src], all, mul #2 \n" - "subs %w[width], %w[width], %w[vl] \n" ARGBTORGB565_SVE - "st1b {z1.b}, p0, [%[dst]] \n" - "incb %[dst] \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - "whilelt p0.b, wzr, %w[width] \n" - "ld2b {z0.b, z1.b}, p0/z, [%[src]] \n" // BR, GA - ARGBTORGB565_SVE - "st1b {z1.b}, p0, [%[dst]] \n" - - "99: \n" - : [src] "+r"(src_argb), // %[src] - [dst] "+r"(dst_rgb), // %[dst] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [bsl_mask] "r"(bsl_mask) // %[bsl_mask] - : "cc", "memory", "z0", "z1", "z3", "z4", "p0"); -} - -void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - unsigned bsl_mask = 0x7e0; - uint64_t vl; - width *= 2; - asm volatile ( - "mov z3.h, #3 \n" - "dup z4.h, %w[bsl_mask] \n" - "dup z2.s, %w[dither4] \n" - "zip1 z2.b, z2.b, z2.b \n" - - "cntb %[vl] \n" - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - "ptrue p0.b \n" - "1: \n" - "ld2b {z0.b, z1.b}, p0/z, [%[src]] \n" // BR, GA - "incb %[src], all, mul #2 \n" - "uqadd z0.b, z0.b, z2.b \n" - "uqadd z1.b, z1.b, z2.b \n" - "subs %w[width], %w[width], %w[vl] \n" ARGBTORGB565_SVE - "st1b {z1.b}, p0, [%[dst]] \n" - "incb %[dst] \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - "whilelt p0.b, wzr, %w[width] \n" - "ld2b {z0.b, z1.b}, p0/z, [%[src]] \n" // BR, GA - "uqadd z0.b, z0.b, z2.b \n" - "uqadd z1.b, z1.b, z2.b \n" ARGBTORGB565_SVE - "st1b {z1.b}, p0, [%[dst]] \n" - - "99: \n" - : [src] "+r"(src_argb), // %[src] - [dst] "+r"(dst_rgb), // %[dst] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [bsl_mask] "r"(bsl_mask), // %[bsl_mask] - [dither4] "r"(dither4) // %[dither4] - : "cc", "memory", "z0", "z1", "z3", "z4", "p0"); -} - -#define ARGB1555TOARGB \ - /* Input: z1/z3.h = arrrrrgggggbbbbb */ \ - "lsl z0.h, z1.h, #3 \n" /* rrrgggggbbbbb000 */ \ - "lsl z2.h, z3.h, #3 \n" /* rrrgggggbbbbb000 */ \ - "asr z1.h, z1.h, #7 \n" /* aaaaaaaarrrrrggg */ \ - "asr z3.h, z3.h, #7 \n" /* aaaaaaaarrrrrggg */ \ - "lsl z0.b, p0/m, z0.b, z4.b \n" /* ggggg000bbbbb000 */ \ - "lsl z2.b, p0/m, z2.b, z4.b \n" /* ggggg000bbbbb000 */ \ - "sri z1.b, z1.b, #5 \n" /* aaaaaaaarrrrrrrr */ \ - "sri z3.b, z3.b, #5 \n" /* aaaaaaaarrrrrrrr */ \ - "sri z0.b, z0.b, #5 \n" /* ggggggggbbbbbbbb */ \ - "sri z2.b, z2.b, #5 \n" /* ggggggggbbbbbbbb */ - -void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - uint64_t vl; - asm volatile ( - "mov z4.h, #0x0300 \n" - "ptrue p0.b \n" - - "cnth %x[vl] \n" - "subs %w[width], %w[width], %w[vl], lsl #1 \n" - "b.lt 2f \n" - - "1: \n" - "ld1h {z1.h}, p0/z, [%[src]] \n" - "ld1h {z3.h}, p0/z, [%[src], #1, mul vl] \n" - "incb %[src], all, mul #2 \n" ARGB1555TOARGB - "subs %w[width], %w[width], %w[vl], lsl #1 \n" - "st2h {z0.h, z1.h}, p0, [%[dst]] \n" - "st2h {z2.h, z3.h}, p0, [%[dst], #2, mul vl] \n" - "incb %[dst], all, mul #4 \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], 
%w[vl], lsl #1 \n" - "b.eq 99f \n" - - "whilelt p1.h, wzr, %w[width] \n" - "whilelt p2.h, %w[vl], %w[width] \n" - "ld1h {z1.h}, p1/z, [%[src]] \n" - "ld1h {z3.h}, p2/z, [%[src], #1, mul vl] \n" ARGB1555TOARGB - "st2h {z0.h, z1.h}, p1, [%[dst]] \n" - "st2h {z2.h, z3.h}, p2, [%[dst], #2, mul vl] \n" - - "99: \n" - : [src] "+r"(src_argb1555), // %[src] - [dst] "+r"(dst_argb), // %[dst] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : - : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0", "p1", "p2"); -} - -// clang-format off -#define AYUVTOUV_SVE(zU0, zV0, zU1, zV1) /* e.g. */ \ - "ld2h {z0.h, z1.h}, p0/z, [%[src0]] \n" /* VUVU.. YAYA.. */ \ - "ld2h {z1.h, z2.h}, p1/z, [%[src0], #2, mul vl] \n" /* VUVU.. YAYA.. */ \ - "ld2h {z2.h, z3.h}, p0/z, [%[src1]] \n" /* VUVU.. YAYA.. */ \ - "ld2h {z3.h, z4.h}, p1/z, [%[src1], #2, mul vl] \n" /* VUVU.. YAYA.. */ \ - "incb %[src0], all, mul #4 \n" \ - "incb %[src1], all, mul #4 \n" \ - "uaddlb z4.h, z0.b, z2.b \n" /* V */ \ - "uaddlt z5.h, z0.b, z2.b \n" /* U */ \ - "uaddlb z6.h, z1.b, z3.b \n" /* V */ \ - "uaddlt z7.h, z1.b, z3.b \n" /* U */ \ - "addp " #zU0 ".h, p0/m, " #zU0 ".h, " #zV0 ".h \n" /* UV */ \ - "addp " #zU1 ".h, p1/m, " #zU1 ".h, " #zV1 ".h \n" /* UV */ \ - "subs %w[width], %w[width], %w[vl] \n" \ - "urshr " #zU0 ".h, p0/m, " #zU0 ".h, #2 \n" /* U0V0 */ \ - "urshr " #zU1 ".h, p1/m, " #zU1 ".h, #2 \n" /* U0V0 */ \ - "st1b {" #zU0 ".h}, p0, [%[dst]] \n" \ - "st1b {" #zU1 ".h}, p1, [%[dst], #1, mul vl] \n" \ - "incb %[dst] \n" -// clang-format on - -// Filter 2 rows of AYUV UV's (444) into UV (420). -// AYUV is VUYA in memory. UV for NV12 is UV order in memory. -void AYUVToUVRow_SVE2(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width) { - // Output a row of UV values, filtering 2x2 rows of AYUV. - const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; - int vl; - asm volatile ( - "cntb %x[vl] \n" - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - "ptrue p0.h \n" - "ptrue p1.h \n" - "1: \n" - AYUVTOUV_SVE(z5, z4, z7, z6) - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - "cnth %x[vl] \n" - "whilelt p0.h, wzr, %w[width] \n" // first row - "whilelt p1.h, %w[vl], %w[width] \n" // second row - AYUVTOUV_SVE(z5, z4, z7, z6) - - "99: \n" - : [src0]"+r"(src_ayuv), // %[src0] - [src1]"+r"(src_ayuv1), // %[src1] - [dst]"+r"(dst_uv), // %[dst] - [width]"+r"(width), // %[width] - [vl]"=&r"(vl) // %[vl] - : - : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0", - "p1"); -} - -// Filter 2 rows of AYUV UV's (444) into VU (420). -void AYUVToVURow_SVE2(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width) { - // Output a row of VU values, filtering 2x2 rows of AYUV. 
- const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; - int vl; - asm volatile ( - "cntb %x[vl] \n" - "cmp %w[width], %w[vl] \n" - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - "ptrue p0.h \n" - "ptrue p1.h \n" - "1: \n" - AYUVTOUV_SVE(z4, z5, z6, z7) - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - "cnth %x[vl] \n" - "whilelt p0.h, wzr, %w[width] \n" // first row - "whilelt p1.h, %w[vl], %w[width] \n" // second row - AYUVTOUV_SVE(z4, z5, z6, z7) - - "99: \n" - : [src0]"+r"(src_ayuv), // %[src0] - [src1]"+r"(src_ayuv1), // %[src1] - [dst]"+r"(dst_vu), // %[dst] - [width]"+r"(width), // %[width] - [vl]"=&r"(vl) // %[vl] - : - : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0", - "p1"); -} - -void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint32_t nv_uv_start = 0x03010301U; - uint32_t nv_uv_step = 0x04040404U; - uint64_t vl; - asm("cnth %0" : "=r"(vl)); - int width_last_y = width & (vl - 1); - int width_last_uv = width_last_y + (width_last_y & 1); - asm volatile( - "ptrue p0.b \n" - "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n" - "dup z19.b, #255 \n" // A - YUVTORGB_SVE_SETUP - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - // Run bulk of computation with an all-true predicate to avoid predicate - // generation overhead. - "ptrue p1.h \n" - "ptrue p2.h \n" - "1: \n" // - READYUY2_SVE NVTORGB_SVE RGBTOARGB8_SVE - "subs %w[width], %w[width], %w[vl] \n" - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - // Calculate a predicate for the final iteration to deal with the tail. - "whilelt p1.h, wzr, %w[width_last_y] \n" - "whilelt p2.h, wzr, %w[width_last_uv] \n" // - READYUY2_SVE NVTORGB_SVE RGBTOARGB8_SVE - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - - "99: \n" - : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [vl] "r"(vl), // %[vl] - [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [nv_uv_start] "r"(nv_uv_start), // %[nv_uv_start] - [nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step] - [width_last_y] "r"(width_last_y), // %[width_last_y] - [width_last_uv] "r"(width_last_uv) // %[width_last_uv] - : "cc", "memory", YUVTORGB_SVE_REGS, "p2"); -} - -void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - uint32_t nv_uv_start = 0x02000200U; - uint32_t nv_uv_step = 0x04040404U; - uint64_t vl; - asm("cnth %0" : "=r"(vl)); - int width_last_y = width & (vl - 1); - int width_last_uv = width_last_y + (width_last_y & 1); - asm volatile( - "ptrue p0.b \n" - "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n" - "dup z19.b, #255 \n" // A - YUVTORGB_SVE_SETUP - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - // Run bulk of computation with an all-true predicate to avoid predicate - // generation overhead. 
- "ptrue p1.h \n" - "ptrue p2.h \n" - "1: \n" // - READUYVY_SVE NVTORGB_SVE RGBTOARGB8_SVE - "subs %w[width], %w[width], %w[vl] \n" - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - // Calculate a predicate for the final iteration to deal with the tail. - "2: \n" - "whilelt p1.h, wzr, %w[width_last_y] \n" - "whilelt p2.h, wzr, %w[width_last_uv] \n" // - READUYVY_SVE NVTORGB_SVE RGBTOARGB8_SVE - "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" - - "99: \n" - : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [vl] "r"(vl), // %[vl] - [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [nv_uv_start] "r"(nv_uv_start), // %[nv_uv_start] - [nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step] - [width_last_y] "r"(width_last_y), // %[width_last_y] - [width_last_uv] "r"(width_last_uv) // %[width_last_uv] - : "cc", "memory", YUVTORGB_SVE_REGS, "p2"); -} - -static inline void RAWToWXYZRow_SVE2(const uint8_t* src_raw, - uint8_t* dst_wxyz, - int width, - uint32_t idx_start, - uint32_t idx_step, - uint32_t alpha) { - uint32_t vl; - asm("cntw %x0" : "=r"(vl)); - uint32_t vl_mul3 = vl * 3; - uint32_t rem_mul3; - asm volatile( - "index z31.s, %w[idx_start], %w[idx_step] \n" - "dup z30.s, %w[alpha] \n" - "subs %w[width], %w[width], %w[vl], lsl #1 \n" - "b.lt 2f \n" - - // Run bulk of computation with the same predicates to avoid predicate - // generation overhead. We set up p1 to only load 3/4 of a vector. - "ptrue p0.s \n" - "whilelt p1.b, wzr, %w[vl_mul3] \n" - "1: \n" - "ld1b {z0.b}, p1/z, [%[src]] \n" - "add %[src], %[src], %x[vl_mul3] \n" - "ld1b {z1.b}, p1/z, [%[src]] \n" - "add %[src], %[src], %x[vl_mul3] \n" - "tbl z0.b, {z0.b}, z31.b \n" - "tbl z1.b, {z1.b}, z31.b \n" - "subs %w[width], %w[width], %w[vl], lsl #1 \n" - "orr z0.d, z0.d, z30.d \n" - "orr z1.d, z1.d, z30.d \n" - "st1w {z0.s}, p0, [%[dst]] \n" - "st1w {z1.s}, p0, [%[dst], #1, mul vl] \n" - "incb %[dst], all, mul #2 \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl], lsl #1 \n" - "b.eq 99f \n" - - // Calculate a pair of predicates for the final iteration to deal with - // the tail. 
- "3: \n" - "add %w[rem_mul3], %w[width], %w[width], lsl #1 \n" - "whilelt p0.s, wzr, %w[width] \n" - "whilelt p1.b, wzr, %w[rem_mul3] \n" - "ld1b {z0.b}, p1/z, [%[src]] \n" - "add %[src], %[src], %x[vl_mul3] \n" - "tbl z0.b, {z0.b}, z31.b \n" - "subs %w[width], %w[width], %w[vl] \n" - "orr z0.d, z0.d, z30.d \n" - "st1w {z0.s}, p0, [%[dst]] \n" - "incb %[dst] \n" - "b.gt 3b \n" - - "99: \n" - : [src] "+r"(src_raw), // %[src] - [dst] "+r"(dst_wxyz), // %[dst] - [width] "+r"(width), // %[width] - [vl_mul3] "+r"(vl_mul3), // %[vl_mul3] - [rem_mul3] "=&r"(rem_mul3) // %[rem_mul3] - : [idx_start] "r"(idx_start), // %[idx_start] - [idx_step] "r"(idx_step), // %[idx_step] - [alpha] "r"(alpha), // %[alpha] - [vl] "r"(vl) // %[vl] - : "cc", "memory", "z0", "z1", "z30", "z31", "p0", "p1"); -} - -void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - RAWToWXYZRow_SVE2(src_raw, dst_argb, width, 0xff000102U, 0x00030303U, - 0xff000000U); -} - -void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - RAWToWXYZRow_SVE2(src_raw, dst_rgba, width, 0x000102ffU, 0x03030300U, - 0x000000ffU); -} - -void RGB24ToARGBRow_SVE2(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - RAWToWXYZRow_SVE2(src_rgb24, dst_argb, width, 0xff020100U, 0x00030303U, - 0xff000000U); -} - -static const uint8_t kRAWToRGB24Indices[] = { - 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, - 17, 16, 15, 20, 19, 18, 23, 22, 21, 26, 25, 24, 29, 28, 27, - 32, 31, 30, 35, 34, 33, 38, 37, 36, 41, 40, 39, 44, 43, 42, - 47, 46, 45, 50, 49, 48, 53, 52, 51, 56, 55, 54, 59, 58, 57, - 62, 61, 60, 65, 64, 63, 68, 67, 66, 71, 70, 69, 74, 73, 72, - 77, 76, 75, 80, 79, 78, 83, 82, 81, 86, 85, 84, 89, 88, 87, - 92, 91, 90, 95, 94, 93, 98, 97, 96, 101, 100, 99, 104, 103, 102, - 107, 106, 105, 110, 109, 108, 113, 112, 111, 116, 115, 114, 119, 118, 117, - 122, 121, 120, 125, 124, 123, 128, 127, 126, 131, 130, 129, 134, 133, 132, - 137, 136, 135, 140, 139, 138, 143, 142, 141, 146, 145, 144, 149, 148, 147, - 152, 151, 150, 155, 154, 153, 158, 157, 156, 161, 160, 159, 164, 163, 162, - 167, 166, 165, 170, 169, 168, 173, 172, 171, 176, 175, 174, 179, 178, 177, - 182, 181, 180, 185, 184, 183, 188, 187, 186, 191, 190, 189, 194, 193, 192, - 197, 196, 195, 200, 199, 198, 203, 202, 201, 206, 205, 204, 209, 208, 207, - 212, 211, 210, 215, 214, 213, 218, 217, 216, 221, 220, 219, 224, 223, 222, - 227, 226, 225, 230, 229, 228, 233, 232, 231, 236, 235, 234, 239, 238, 237, - 242, 241, 240, 245, 244, 243, 248, 247, 246, 251, 250, 249, 254, 253, 252}; - -void RAWToRGB24Row_SVE2(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - // width is in elements, convert to bytes. - width *= 3; - // we use the mul3 predicate pattern throughout to use the largest multiple - // of three number of lanes, for instance with a vector length of 16 bytes - // only the first 15 bytes will be used for load/store instructions. - uint32_t vl; - asm volatile( - "cntb %x[vl], mul3 \n" - "ptrue p0.b, mul3 \n" - "ld1b {z31.b}, p0/z, [%[kIndices]] \n" - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" - - // Run bulk of computation with the same predicate to avoid predicate - // generation overhead. 
- "1: \n" - "ld1b {z0.b}, p0/z, [%[src]] \n" - "add %[src], %[src], %x[vl] \n" - "tbl z0.b, {z0.b}, z31.b \n" - "subs %w[width], %w[width], %w[vl] \n" - "st1b {z0.b}, p0, [%[dst]] \n" - "add %[dst], %[dst], %x[vl] \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" - "b.eq 99f \n" - - // Calculate a predicate for the final iteration to deal with the tail. - "whilelt p0.b, wzr, %w[width] \n" - "ld1b {z0.b}, p0/z, [%[src]] \n" - "tbl z0.b, {z0.b}, z31.b \n" - "st1b {z0.b}, p0, [%[dst]] \n" - - "99: \n" - : [src] "+r"(src_raw), // %[src] - [dst] "+r"(dst_rgb24), // %[dst] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [kIndices] "r"(kRAWToRGB24Indices) // %[kIndices] - : "cc", "memory", "z0", "z31", "p0"); -} - -static inline void ARGBToXYZRow_SVE2(const uint8_t* src_argb, - uint8_t* dst_xyz, - int width, - const uint8_t* indices) { - uint32_t vl; - asm("cntw %x0" : "=r"(vl)); - uint32_t vl_mul3 = vl * 3; - uint32_t rem_mul3; - asm volatile( - "whilelt p1.b, wzr, %w[vl_mul3] \n" - "ld1b {z31.b}, p1/z, [%[indices]] \n" - "subs %w[width], %w[width], %w[vl], lsl #1 \n" - "b.lt 2f \n" - - // Run bulk of computation with the same predicates to avoid predicate - // generation overhead. We set up p1 to only store 3/4 of a vector. - "ptrue p0.s \n" - "1: \n" - "ld1w {z0.s}, p0/z, [%[src]] \n" - "ld1w {z1.s}, p0/z, [%[src], #1, mul vl] \n" - "incb %[src], all, mul #2 \n" - "tbl z0.b, {z0.b}, z31.b \n" - "tbl z1.b, {z1.b}, z31.b \n" - "subs %w[width], %w[width], %w[vl], lsl #1 \n" - "st1b {z0.b}, p1, [%[dst]] \n" - "add %[dst], %[dst], %x[vl_mul3] \n" - "st1b {z1.b}, p1, [%[dst]] \n" - "add %[dst], %[dst], %x[vl_mul3] \n" - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl], lsl #1 \n" - "b.eq 99f \n" - - // Calculate predicates for the final iteration to deal with the tail. 
- "add %w[rem_mul3], %w[width], %w[width], lsl #1 \n" - "whilelt p0.s, wzr, %w[width] \n" - "whilelt p1.b, wzr, %w[rem_mul3] \n" - "whilelt p2.s, %w[vl], %w[width] \n" - "whilelt p3.b, %w[vl_mul3], %w[rem_mul3] \n" - "ld1w {z0.s}, p0/z, [%[src]] \n" - "ld1w {z1.s}, p2/z, [%[src], #1, mul vl] \n" - "tbl z0.b, {z0.b}, z31.b \n" - "tbl z1.b, {z1.b}, z31.b \n" - "st1b {z0.b}, p1, [%[dst]] \n" - "add %[dst], %[dst], %x[vl_mul3] \n" - "st1b {z1.b}, p3, [%[dst]] \n" - - "99: \n" - : [src] "+r"(src_argb), // %[src] - [dst] "+r"(dst_xyz), // %[dst] - [width] "+r"(width), // %[width] - [rem_mul3] "=&r"(rem_mul3) // %[rem_mul3] - : [indices] "r"(indices), // %[indices] - [vl_mul3] "r"(vl_mul3), // %[vl_mul3] - [vl] "r"(vl) // %[vl] - : "cc", "memory", "z0", "z1", "z31", "p0", "p1", "p2", "p3"); -} - -static const uint8_t kARGBToRGB24RowIndices[] = { - 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, - 20, 21, 22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 38, - 40, 41, 42, 44, 45, 46, 48, 49, 50, 52, 53, 54, 56, 57, 58, - 60, 61, 62, 64, 65, 66, 68, 69, 70, 72, 73, 74, 76, 77, 78, - 80, 81, 82, 84, 85, 86, 88, 89, 90, 92, 93, 94, 96, 97, 98, - 100, 101, 102, 104, 105, 106, 108, 109, 110, 112, 113, 114, 116, 117, 118, - 120, 121, 122, 124, 125, 126, 128, 129, 130, 132, 133, 134, 136, 137, 138, - 140, 141, 142, 144, 145, 146, 148, 149, 150, 152, 153, 154, 156, 157, 158, - 160, 161, 162, 164, 165, 166, 168, 169, 170, 172, 173, 174, 176, 177, 178, - 180, 181, 182, 184, 185, 186, 188, 189, 190, 192, 193, 194, 196, 197, 198, - 200, 201, 202, 204, 205, 206, 208, 209, 210, 212, 213, 214, 216, 217, 218, - 220, 221, 222, 224, 225, 226, 228, 229, 230, 232, 233, 234, 236, 237, 238, - 240, 241, 242, 244, 245, 246, 248, 249, 250, 252, 253, 254, -}; - -static const uint8_t kARGBToRAWRowIndices[] = { - 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, - 22, 21, 20, 26, 25, 24, 30, 29, 28, 34, 33, 32, 38, 37, 36, - 42, 41, 40, 46, 45, 44, 50, 49, 48, 54, 53, 52, 58, 57, 56, - 62, 61, 60, 66, 65, 64, 70, 69, 68, 74, 73, 72, 78, 77, 76, - 82, 81, 80, 86, 85, 84, 90, 89, 88, 94, 93, 92, 98, 97, 96, - 102, 101, 100, 106, 105, 104, 110, 109, 108, 114, 113, 112, 118, 117, 116, - 122, 121, 120, 126, 125, 124, 130, 129, 128, 134, 133, 132, 138, 137, 136, - 142, 141, 140, 146, 145, 144, 150, 149, 148, 154, 153, 152, 158, 157, 156, - 162, 161, 160, 166, 165, 164, 170, 169, 168, 174, 173, 172, 178, 177, 176, - 182, 181, 180, 186, 185, 184, 190, 189, 188, 194, 193, 192, 198, 197, 196, - 202, 201, 200, 206, 205, 204, 210, 209, 208, 214, 213, 212, 218, 217, 216, - 222, 221, 220, 226, 225, 224, 230, 229, 228, 234, 233, 232, 238, 237, 236, - 242, 241, 240, 246, 245, 244, 250, 249, 248, 254, 253, 252, -}; - -void ARGBToRGB24Row_SVE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - ARGBToXYZRow_SVE2(src_argb, dst_rgb, width, kARGBToRGB24RowIndices); -} - -void ARGBToRAWRow_SVE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - ARGBToXYZRow_SVE2(src_argb, dst_rgb, width, kARGBToRAWRowIndices); -} - -#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/row_win.c b/drivers/media/pci/tbscapture2/row_win.c deleted file mode 100644 index a078dd8b5e6b..000000000000 --- a/drivers/media/pci/tbscapture2/row_win.c +++ /dev/null @@ -1,6440 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -// This module is for Visual C 32/64 bit -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) - -#if defined(_M_ARM64EC) -#include -#elif defined(_M_X64) -#include -#include // For _mm_maddubs_epi16 -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// 64 bit -#if defined(_M_X64) - -// Read 8 UV from 444 -#define READYUV444 \ - xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ - xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - u_buf += 8; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; - -// Read 8 UV from 444, With 8 Alpha. -#define READYUVA444 \ - xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ - xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - u_buf += 8; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; - -// Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; - -// Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(yuvconstants) \ - xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80)); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \ - xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \ - xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \ - xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \ - xmm0 = _mm_adds_epi16(xmm4, xmm0); \ - xmm1 = _mm_subs_epi16(xmm4, xmm1); \ - xmm2 = _mm_adds_epi16(xmm4, xmm2); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ - xmm2 = _mm_packus_epi16(xmm2, xmm2); - -// Store 8 ARGB values. 
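That is, the four channel vectors are interleaved into packed 4-byte pixels, with ARGB written as B,G,R,A bytes in memory. A scalar sketch of the store step (illustrative only, hypothetical helper name):

#include <stdint.h>

static void store_argb8(const uint8_t *b, const uint8_t *g, const uint8_t *r,
                        const uint8_t *a, uint8_t *dst_argb, int width) {
  for (int x = 0; x < width; x++) {
    dst_argb[4 * x + 0] = b[x];  /* blue  */
    dst_argb[4 * x + 1] = g[x];  /* green */
    dst_argb[4 * x + 2] = r[x];  /* red   */
    dst_argb[4 * x + 3] = a[x];  /* alpha */
  }
}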
-#define STOREARGB \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ - _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ - _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ - dst_argb += 32; - -#if defined(HAS_I422TOARGBROW_SSSE3) -void I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUV422 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) -void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I444TOARGBROW_SSSE3) -void I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUV444 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) -void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUVA444 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -// 32 bit -#else // defined(_M_X64) -#ifdef HAS_ARGBTOYROW_SSSE3 - -// Constants for ARGB. -static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, - 13, 65, 33, 0, 13, 65, 33, 0}; - -// JPeg full range. -static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, - 15, 75, 38, 0, 15, 75, 38, 0}; - -static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; - -static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, - 127, -84, -43, 0, 127, -84, -43, 0}; - -static const vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -}; - -static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, - -20, -107, 127, 0, -20, -107, 127, 0}; - -// vpshufb for vphaddw + vpackuswb packed to shorts. -static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; - -// Constants for BGRA. -static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, - 0, 33, 65, 13, 0, 33, 65, 13}; - -static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, - 0, -38, -74, 112, 0, -38, -74, 112}; - -static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, - 0, 112, -94, -18, 0, 112, -94, -18}; - -// Constants for ABGR. 
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, - 33, 65, 13, 0, 33, 65, 13, 0}; - -static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; - -static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; - -// Constants for RGBA. -static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, - 0, 13, 65, 33, 0, 13, 65, 33}; - -static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, - 0, 112, -74, -38, 0, 112, -74, -38}; - -static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, - 0, -18, -94, 112, 0, -18, -94, 112}; - -static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; - -// 7 bit fixed point 0.5. -static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; - -// 8 bit fixed point 0.5, for bias of UV. -static const ulvec8 kBiasUV128 = { - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Shuffle table for converting RGB24 to ARGB. -static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; - -// Shuffle table for converting RAW to ARGB. -static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, - 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; - -// Shuffle table for converting RAW to RGB24. First 8. -static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Middle 8. -static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Last 8. -static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RGB24. -static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RAW. -static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; - -// YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, - 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, - 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - -// YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, - 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, - 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; - -// UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, - 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, - 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; - -// UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, - 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, - 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; - -// NV21 shuf 8 VU to 16 UV. 
-static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, -}; - -// Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - - convertloop: - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - por xmm0, xmm5 - por xmm1, xmm5 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} - -#ifdef HAS_J400TOARGBROW_AVX2 -// Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 - vpslld ymm5, ymm5, 24 - - convertloop: - vmovdqu xmm0, [eax] - lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 - vpunpcklbw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - vpunpckhwd ymm1, ymm0, ymm0 - vpunpcklwd ymm0, ymm0, ymm0 - vpor ymm0, ymm0, ymm5 - vpor ymm1, ymm1, ymm5 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_J400TOARGBROW_AVX2 - -__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_rgb24 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm4 - por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm4 - movdqu [edx + 32], xmm2 - por xmm0, xmm5 - pshufb xmm1, xmm4 - movdqu [edx], xmm0 - por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm4 - movdqu [edx + 16], xmm1 - por xmm3, xmm5 - movdqu [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm4 - por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm4 - movdqu [edx + 32], xmm2 - por xmm0, xmm5 - pshufb xmm1, xmm4 - movdqu [edx], xmm0 - por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm4 - movdqu [edx + 16], xmm1 - por xmm3, xmm5 - movdqu [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, - uint8_t* dst_rgb24, - int 
width) { - __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_rgb24 - mov ecx, [esp + 12] // width - movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 - movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 - movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 4] - movdqu xmm2, [eax + 8] - lea eax, [eax + 24] - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 8 - jg convertloop - ret - } -} - -// pmul method to replicate bits. -// Math to replicate bits: -// (v << 8) | (v << 3) -// v * 256 + v * 8 -// v * (256 + 8) -// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -// 20 instructions. -__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - movd xmm5, eax - pshufd xmm5, xmm5, 0 - mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - movd xmm6, eax - pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red - psllw xmm3, 11 - pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green - psllw xmm4, 10 - psrlw xmm4, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha - psllw xmm7, 8 - - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgr565 - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - pand xmm1, xmm3 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pmulhuw xmm1, xmm5 // * (256 + 8) - pmulhuw xmm2, xmm5 // * (256 + 8) - psllw xmm1, 8 - por xmm1, xmm2 // RB - pand xmm0, xmm4 // G in middle 6 bits - pmulhuw xmm0, xmm6 // << 5 * (256 + 4) - por xmm0, xmm7 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -#ifdef HAS_RGB565TOARGBROW_AVX2 -// pmul method to replicate bits. 
-// Math to replicate bits: -// (v << 8) | (v << 3) -// v * 256 + v * 8 -// v * (256 + 8) -// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - vmovd xmm5, eax - vbroadcastss ymm5, xmm5 - mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red - vpsllw ymm3, ymm3, 11 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green - vpsllw ymm4, ymm4, 10 - vpsrlw ymm4, ymm4, 5 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha - vpsllw ymm7, ymm7, 8 - - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 - vpand ymm1, ymm0, ymm3 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpand ymm0, ymm0, ymm4 // G in middle 6 bits - vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) - vpor ymm0, ymm0, ymm7 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm1, ymm1, 0xd8 - vpunpckhbw ymm2, ymm1, ymm0 - vpunpcklbw ymm1, ymm1, ymm0 - vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_RGB565TOARGBROW_AVX2 - -#ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - vmovd xmm5, eax - vbroadcastss ymm5, xmm5 - mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red - vpsllw ymm3, ymm3, 11 - vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha - vpsllw ymm7, ymm7, 8 - - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 - vpsllw ymm1, ymm0, 1 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpand ymm1, ymm1, ymm3 - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpsraw ymm2, ymm0, 8 // A - vpand ymm0, ymm0, ymm4 // G in middle 5 bits - vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) - vpand ymm2, ymm2, ymm7 - vpor ymm0, ymm0, ymm2 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm1, ymm1, 0xd8 - vpunpckhbw ymm2, ymm1, ymm0 - vpunpcklbw ymm1, ymm1, ymm0 - vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGB1555TOARGBROW_AVX2 - -#ifdef HAS_ARGB4444TOARGBROW_AVX2 -__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f - vmovd xmm4, 
eax - vbroadcastss ymm4, xmm4 - vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 - vpand ymm2, ymm0, ymm5 // mask high nibbles - vpand ymm0, ymm0, ymm4 // mask low nibbles - vpsrlw ymm3, ymm2, 4 - vpsllw ymm1, ymm0, 4 - vpor ymm2, ymm2, ymm3 - vpor ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm2, ymm2, 0xd8 - vpunpckhbw ymm1, ymm0, ymm2 - vpunpcklbw ymm0, ymm0, ymm2 - vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGB4444TOARGBROW_AVX2 - -// 24 instructions -__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - movd xmm5, eax - pshufd xmm5, xmm5, 0 - mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - movd xmm6, eax - pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red - psllw xmm3, 11 - movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green - psrlw xmm4, 6 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha - psllw xmm7, 8 - - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of 1555 - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - psllw xmm1, 1 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pand xmm1, xmm3 - pmulhuw xmm2, xmm5 // * (256 + 8) - pmulhuw xmm1, xmm5 // * (256 + 8) - psllw xmm1, 8 - por xmm1, xmm2 // RB - movdqa xmm2, xmm0 - pand xmm0, xmm4 // G in middle 5 bits - psraw xmm2, 8 // A - pmulhuw xmm0, xmm6 // << 6 * (256 + 8) - pand xmm2, xmm7 - por xmm0, xmm2 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -// 18 instructions. 
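/* Editor's note: a minimal scalar sketch of what the ARGB4444 rows above
 * (AVX2) and below (SSE2) compute, assuming the little-endian bgra4444
 * layout the code comments describe. Each 4-bit channel is widened by
 * nibble replication, v8 = (v4 << 4) | v4, which is exactly the
 * mask/shift/por sequence done 8 (SSE2) or 16 (AVX2) pixels at a time.
 * The function name is illustrative only and not part of this file;
 * uint8_t comes from the headers this file already includes. */
static void ARGB4444ToARGBRow_Ref(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t gb = src[0];                     /* G|B nibbles of the pixel */
    uint8_t ar = src[1];                     /* A|R nibbles of the pixel */
    dst[0] = (uint8_t)((gb & 0x0f) * 0x11);  /* B: (b << 4) | b */
    dst[1] = (uint8_t)((gb >> 4) * 0x11);    /* G */
    dst[2] = (uint8_t)((ar & 0x0f) * 0x11);  /* R */
    dst[3] = (uint8_t)((ar >> 4) * 0x11);    /* A */
    src += 2;
    dst += 4;
  }
}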
-__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f - movd xmm4, eax - pshufd xmm4, xmm4, 0 - movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles - pslld xmm5, 4 - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 - movdqa xmm2, xmm0 - pand xmm0, xmm4 // mask low nibbles - pand xmm2, xmm5 // mask high nibbles - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - psllw xmm1, 4 - psrlw xmm3, 4 - por xmm0, xmm1 - por xmm2, xmm3 - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB - pshufb xmm1, xmm6 - pshufb xmm2, xmm6 - pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 - lea edx, [edx + 48] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW - - convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB - pshufb xmm1, xmm6 - pshufb xmm2, xmm6 - pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 - lea edx, [edx + 48] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f - psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 - psrld xmm4, 26 - pslld xmm4, 
5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 - pslld xmm5, 11 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - __asm { - - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - movd xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - punpcklbw xmm6, xmm6 // make dither 16 bytes - movdqa xmm7, xmm6 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f - psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 - psrld xmm4, 26 - pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 - pslld xmm5, 11 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - paddusb xmm0, xmm6 // add dither - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - vbroadcastss xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes - vpermq ymm6, ymm6, 0xd8 - vpunpcklwd ymm6, ymm6, ymm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f - vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 - vpsrld ymm4, ymm4, 26 - vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpaddusb ymm0, ymm0, ymm6 // add dither - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR - vpackusdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTORGB565DITHERROW_AVX2 - -// TODO(fbarchard): Improve sign extension/packing. 
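/* Editor's note: a minimal scalar sketch of the packing performed by the
 * ARGBToRGB565 rows here (plain and dithered). Each channel keeps its top
 * bits and the fields are OR-ed into B[4:0], G[10:5], R[15:11], matching
 * the 0x1f / 0x7e0 / 0xf800 field masks built above; the dither variants
 * saturating-add the replicated 4-byte dither pattern before truncation.
 * The helper name is illustrative only, not part of this file. */
static uint16_t PackRGB565_Ref(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((b >> 3) | ((uint16_t)(g >> 2) << 5) |
                    ((uint16_t)(r >> 3) << 11));
}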
-__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0x0000001f - psrld xmm4, 27 - movdqa xmm5, xmm4 // generate mask 0x000003e0 - pslld xmm5, 5 - movdqa xmm6, xmm4 // generate mask 0x00007c00 - pslld xmm6, 10 - pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 - pslld xmm7, 15 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - movdqa xmm3, xmm0 // R - psrad xmm0, 16 // A - psrld xmm1, 3 // B - psrld xmm2, 6 // G - psrld xmm3, 9 // R - pand xmm0, xmm7 // A - pand xmm1, xmm4 // B - pand xmm2, xmm5 // G - pand xmm3, xmm6 // R - por xmm0, xmm1 // BA - por xmm2, xmm3 // GR - por xmm0, xmm2 // BGRA - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 - psllw xmm4, 12 - movdqa xmm3, xmm4 // generate mask 0x00f000f0 - psrlw xmm3, 8 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 - pand xmm0, xmm3 // low nibble - pand xmm1, xmm4 // high nibble - psrld xmm0, 4 - psrld xmm1, 8 - por xmm0, xmm1 - packuswb xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f - vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 - vpsrld ymm4, ymm4, 26 - vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR - vpackusdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTORGB565ROW_AVX2 - -#ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 - vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f - vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 - vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 - vpslld ymm7, ymm7, 15 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm3, ymm0, 9 // R - vpsrld ymm2, ymm0, 6 // G - vpsrld ymm1, ymm0, 3 // B - vpsrad ymm0, ymm0, 16 // A - vpand ymm3, ymm3, ymm6 // R - vpand ymm2, ymm2, ymm5 // G - vpand ymm1, ymm1, ymm4 // B - vpand ymm0, 
ymm0, ymm7 // A - vpor ymm0, ymm0, ymm1 // BA - vpor ymm2, ymm2, ymm3 // GR - vpor ymm0, ymm0, ymm2 // BGRA - vpackssdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOARGB1555ROW_AVX2 - -#ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 - vpsllw ymm4, ymm4, 12 - vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpand ymm1, ymm0, ymm4 // high nibble - vpand ymm0, ymm0, ymm3 // low nibble - vpsrld ymm1, ymm1, 8 - vpsrld ymm0, ymm0, 4 - vpor ymm0, ymm0, ymm1 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOARGB4444ROW_AVX2 - -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToYJ - movdqa xmm5, xmmword ptr kAddYJ64 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - paddw xmm0, xmm5 // Add .5 for rounding. - paddw xmm2, xmm5 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTOYROW_AVX2 -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; - -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
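/* Editor's note: the SSSE3 ARGBToY row above and the AVX2 row below
 * evaluate the same 7-bit fixed-point luma transform encoded by kARGBToY
 * and kAddY16, i.e. per BGRA pixel Y = ((13*B + 65*G + 33*R) >> 7) + 16
 * (the YJ variants use the kARGBToYJ coefficients with kAddYJ64 rounding
 * and no +16). A scalar sketch with an illustrative name that is not part
 * of this file: */
static uint8_t ARGBPixelToY_Ref(const uint8_t* bgra) {
  return (uint8_t)(((13 * bgra[0] + 65 * bgra[1] + 33 * bgra[2]) >> 7) + 16);
}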
-__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - vbroadcastf128 ymm4, xmmword ptr kARGBToY - vbroadcastf128 ymm5, xmmword ptr kAddY16 - vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpmaddubsw ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - lea eax, [eax + 128] - vphaddw ymm0, ymm0, ymm1 // mutates. - vphaddw ymm2, ymm2, ymm3 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm2, ymm2, 7 - vpackuswb ymm0, ymm0, ymm2 // mutates. - vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. - vpaddb ymm0, ymm0, ymm5 // add 16 for Y - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOYROW_AVX2 - -#ifdef HAS_ARGBTOYJROW_AVX2 -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - vbroadcastf128 ymm4, xmmword ptr kARGBToYJ - vbroadcastf128 ymm5, xmmword ptr kAddYJ64 - vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpmaddubsw ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - lea eax, [eax + 128] - vphaddw ymm0, ymm0, ymm1 // mutates. - vphaddw ymm2, ymm2, ymm3 - vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. - vpaddw ymm2, ymm2, ymm5 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm2, ymm2, 7 - vpackuswb ymm0, ymm0, ymm2 // mutates. - vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
- vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBTOYJROW_AVX2 - -__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kBGRAToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kABGRToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kRGBAToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToV - movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 
- pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToVJ - movdqa xmm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - paddw xmm0, xmm5 // +.5 rounding -> unsigned - paddw xmm1, xmm5 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kBiasUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToV - vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - vpavgb ymm2, ymm2, [eax + esi + 64] - vpavgb ymm3, ymm3, [eax + esi + 96] - lea eax, [eax + 128] - vshufps ymm4, ymm0, ymm1, 0x88 - vshufps ymm0, ymm0, ymm1, 0xdd - vpavgb ymm0, ymm0, ymm4 // mutated by vshufps - vshufps ymm4, ymm2, ymm3, 0x88 - vshufps ymm2, ymm2, ymm3, 0xdd - vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V - vpmaddubsw ymm1, ymm0, ymm7 // U - vpmaddubsw ymm3, ymm2, ymm7 - vpmaddubsw ymm0, 
ymm0, ymm6 // V - vpmaddubsw ymm2, ymm2, ymm6 - vphaddw ymm1, ymm1, ymm3 // mutates - vphaddw ymm0, ymm0, ymm2 - vpsraw ymm1, ymm1, 8 - vpsraw ymm0, ymm0, 8 - vpacksswb ymm0, ymm1, ymm0 // mutates - vpermq ymm0, ymm0, 0xd8 // For vpacksswb - vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - vpaddb ymm0, ymm0, ymm5 // -> unsigned - - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBTOUVROW_AVX2 - -#ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kBiasUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToVJ - vbroadcastf128 ymm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - vpavgb ymm2, ymm2, [eax + esi + 64] - vpavgb ymm3, ymm3, [eax + esi + 96] - lea eax, [eax + 128] - vshufps ymm4, ymm0, ymm1, 0x88 - vshufps ymm0, ymm0, ymm1, 0xdd - vpavgb ymm0, ymm0, ymm4 // mutated by vshufps - vshufps ymm4, ymm2, ymm3, 0x88 - vshufps ymm2, ymm2, ymm3, 0xdd - vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V - vpmaddubsw ymm1, ymm0, ymm7 // U - vpmaddubsw ymm3, ymm2, ymm7 - vpmaddubsw ymm0, ymm0, ymm6 // V - vpmaddubsw ymm2, ymm2, ymm6 - vphaddw ymm1, ymm1, ymm3 // mutates - vphaddw ymm0, ymm0, ymm2 - vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned - vpaddw ymm0, ymm0, ymm5 - vpsraw ymm1, ymm1, 8 - vpsraw ymm0, ymm0, 8 - vpacksswb ymm0, ymm1, ymm0 // mutates - vpermq ymm0, ymm0, 0xd8 // For vpacksswb - vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBTOUVJROW_AVX2 - -__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToV - movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* convert to U and V */ - movdqu xmm0, [eax] // U - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - - movdqu xmm0, [eax] // V - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw 
xmm0, xmm6 - pmaddubsw xmm1, xmm6 - pmaddubsw xmm2, xmm6 - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - lea eax, [eax + 64] - movdqu [edx + edi], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kBGRAToV - movdqa xmm7, xmmword ptr kBGRAToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kABGRToV - movdqa xmm7, xmmword ptr kABGRToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - 
store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kRGBAToV - movdqa xmm7, xmmword ptr kRGBAToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBTOYROW_SSSE3 - -// Read 16 UV from 444 -#define READYUV444_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 16 UV from 444. With 16 Alpha. -#define READYUVA444_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ - __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16]} - -// Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - __asm { \ - __asm vmovq xmm3, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
-#define READYUVA422_AVX2 \ - __asm { \ - __asm vmovq xmm3, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ - __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16]} - -// Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* UV */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 UV from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* UV */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ - __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm3, [eax] /* UV */ \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 32]} - -// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ - __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm3, [eax] /* UV */ \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 32]} - -// Convert 16 pixels: 16 UV and 16 Y. -#define YUVTORGB_AVX2(YuvConstants) \ - __asm { \ - __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \ - __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \ - __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \ - __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \ - __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \ - __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \ - __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \ - __asm vpaddw ymm4, ymm3, ymm4 \ - __asm vpaddsw ymm0, ymm0, ymm4 \ - __asm vpsubsw ymm1, ymm4, ymm1 \ - __asm vpaddsw ymm2, ymm2, ymm4 \ - __asm vpsraw ymm0, ymm0, 6 \ - __asm vpsraw ymm1, ymm1, 6 \ - __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 \ - __asm vpackuswb ymm1, ymm1, ymm1 \ - __asm vpackuswb ymm2, ymm2, ymm2} - -// Store 16 ARGB values. -#define STOREARGB_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ - __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ - __asm vmovdqu 0[edx], ymm1 \ - __asm vmovdqu 32[edx], ymm0 \ - __asm lea edx, [edx + 64]} - -// Store 16 RGBA values. 
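/* Editor's note: a hedged scalar outline of the YUVTORGB_AVX2 macro above
 * (and of the x64 YUVTORGB macro earlier in this file). U/V are re-centered
 * by subtracting 128 (kBiasUV128), Y is scaled through kYToRgb and offset
 * by kYBiasToRgb, and each channel is (y_term + uv_term) >> 6 for B and R,
 * or (y_term - uv_term) >> 6 for G, clamped to 0..255. The coefficient
 * values live in struct YuvConstants and differ per colorspace, so only
 * the shape is shown; the helper name is illustrative only. */
static uint8_t YuvToChannel_Ref(int y_term, int uv_term, int is_g) {
  int v = (is_g ? (y_term - uv_term) : (y_term + uv_term)) >> 6;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}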
-#define STORERGBA_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ - __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ - __asm vmovdqu [edx], ymm0 \ - __asm vmovdqu [edx + 32], ymm1 \ - __asm lea edx, [edx + 64]} - -#ifdef HAS_I422TOARGBROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void I422ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422TOARGBROW_AVX2 - -#ifdef HAS_I422ALPHATOARGBROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -__declspec(naked) void I422AlphaToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422ALPHATOARGBROW_AVX2 - -#ifdef HAS_I444TOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void I444ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - convertloop: - READYUV444_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I444TOARGBROW_AVX2 - -#ifdef HAS_I444ALPHATOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
-__declspec(naked) void I444AlphaToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - convertloop: - READYUVA444_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I444AlphaTOARGBROW_AVX2 - -#ifdef HAS_NV12TOARGBROW_AVX2 -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void NV12ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READNV12_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop esi - vzeroupper - ret - } -} -#endif // HAS_NV12TOARGBROW_AVX2 - -#ifdef HAS_NV21TOARGBROW_AVX2 -// 16 pixels. -// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void NV21ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READNV21_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop esi - vzeroupper - ret - } -} -#endif // HAS_NV21TOARGBROW_AVX2 - -#ifdef HAS_YUY2TOARGBROW_AVX2 -// 16 pixels. -// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) void YUY2ToARGBRow_AVX2( - const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUY2_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - vzeroupper - ret - } -} -#endif // HAS_YUY2TOARGBROW_AVX2 - -#ifdef HAS_UYVYTOARGBROW_AVX2 -// 16 pixels. -// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 
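/* Editor's note: the YUY2 row above and the UYVY row below differ only in
 * the byte order of the packed 4:2:2 macropixel (YUY2 = Y0 U Y1 V,
 * UYVY = U Y0 V Y1), which is what the kShuffleYUY2Y/UV and kShuffleUYVYY/UV
 * tables select. A scalar unpack of one macropixel for reference; the
 * helper name is illustrative only, not part of this file. */
static void UnpackYUY2orUYVY_Ref(const uint8_t* s, int is_uyvy,
                                 uint8_t* y0, uint8_t* y1,
                                 uint8_t* u, uint8_t* v) {
  if (is_uyvy) {              /* U Y0 V Y1 */
    *u = s[0]; *y0 = s[1]; *v = s[2]; *y1 = s[3];
  } else {                    /* Y0 U Y1 V */
    *y0 = s[0]; *u = s[1]; *y1 = s[2]; *v = s[3];
  }
}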
-__declspec(naked) void UYVYToARGBRow_AVX2( - const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READUYVY_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - vzeroupper - ret - } -} -#endif // HAS_UYVYTOARGBROW_AVX2 - -#ifdef HAS_I422TORGBAROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). -__declspec(naked) void I422ToRGBARow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // abgr - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(ebx) - STORERGBA_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422TORGBAROW_AVX2 - -#if defined(HAS_I422TOARGBROW_SSSE3) -// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. -// Allows a conversion with half size scaling. - -// Read 8 UV from 444. -#define READYUV444 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* U */ \ - __asm movq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 UV from 444. With 8 Alpha. -#define READYUVA444 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* U */ \ - __asm movq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8]} - -// Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - __asm { \ - __asm movd xmm3, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - __asm { \ - __asm movd xmm3, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8]} - -// Read 4 UV from NV12, upsample to 8 UV. -#define READNV12 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* UV */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 VU from NV21, upsample to 8 UV. 
-#define READNV21 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* UV */ \ - __asm lea esi, [esi + 8] \ - __asm pshufb xmm3, xmmword ptr kShuffleNV21 \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. -#define READYUY2 \ - __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ - __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm3, [eax] /* UV */ \ - __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 16]} - -// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. -#define READUYVY \ - __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ - __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm3, [eax] /* UV */ \ - __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 16]} - -// Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(YuvConstants) \ - __asm { \ - __asm psubb xmm3, xmmword ptr kBiasUV128 \ - __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \ - __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \ - __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \ - __asm pmaddubsw xmm0, xmm3 \ - __asm pmaddubsw xmm1, xmm3 \ - __asm pmaddubsw xmm2, xmm3 \ - __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \ - __asm paddw xmm4, xmm3 \ - __asm paddsw xmm0, xmm4 \ - __asm paddsw xmm2, xmm4 \ - __asm psubsw xmm4, xmm1 \ - __asm movdqa xmm1, xmm4 \ - __asm psraw xmm0, 6 \ - __asm psraw xmm1, 6 \ - __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ - } - -// Store 8 ARGB values. -#define STOREARGB \ - __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ - __asm movdqu 0[edx], xmm0 \ - __asm movdqu 16[edx], xmm1 \ - __asm lea edx, [edx + 32]} - -// Store 8 BGRA values. -#define STOREBGRA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ - __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ - __asm movdqu 0[edx], xmm5 \ - __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32]} - -// Store 8 RGBA values. -#define STORERGBA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ - __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ - __asm movdqu 0[edx], xmm5 \ - __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32]} - -// Store 8 RGB24 values. -#define STORERGB24 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. 
*/ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ - __asm lea edx, [edx + 24]} - -// Store 8 RGB565 values. -#define STORERGB565 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ - __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ - __asm lea edx, [edx + 16]} - -// 8 pixels. -// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void I444ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV444 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes). -__declspec(naked) void I444AlphaToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA444 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
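STORERGB565 above truncates 8-bit B/G/R to 5/6/5 bits and packs them little-endian, which is what the 0x001f / 0x07e0 / 0xf800 masks express; the RGB24 and RGB565 row converters that use these store macros follow below. A scalar sketch of the packing:

#include <stdint.h>

/* B occupies bits 4..0, G bits 10..5, R bits 15..11. */
static inline uint16_t PackRGB565(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((b >> 3) | ((uint16_t)(g >> 2) << 5) |
                    ((uint16_t)(r >> 3) << 11));
}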
-__declspec(naked) void I422ToRGB24Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGB24 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes). -__declspec(naked) void I444ToRGB24Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - READYUV444 - YUVTORGB(ebx) - STORERGB24 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) void I422ToRGB565Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* rgb565_buf, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate mask 0x0000001f - psrld xmm5, 27 - pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 - psrld xmm6, 26 - pslld xmm6, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 - pslld xmm7, 11 - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGB565 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void I422ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV422 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. 
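All of the converters above funnel through YUVTORGB(ebx): chroma is re-centred around 128, combined with a scaled Y term, shifted down, and saturated to 8 bits, with the actual coefficients taken from the struct YuvConstants argument. A scalar sketch, using the classic BT.601 limited-range constants purely for illustration (the real tables differ per colour matrix and range); the alpha-aware I422 variant follows below:

#include <stdint.h>

static inline uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Illustrative fixed-point YUV -> RGB conversion for one pixel. */
static void YuvToRgbPixel(uint8_t y, uint8_t u, uint8_t v,
                          uint8_t* r, uint8_t* g, uint8_t* b) {
  int c = (int)y - 16;
  int d = (int)u - 128;
  int e = (int)v - 128;
  *r = Clamp255((298 * c + 409 * e + 128) >> 8);
  *g = Clamp255((298 * c - 100 * d - 208 * e + 128) >> 8);
  *b = Clamp255((298 * c + 516 * d + 128) >> 8);
}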
-__declspec(naked) void I422AlphaToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void NV12ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READNV12 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void NV21ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READNV21 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop esi - ret - } -} - -// 8 pixels. -// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) void YUY2ToARGBRow_SSSE3( - const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUY2 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - ret - } -} - -// 8 pixels. -// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). 
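NV12ToARGBRow_SSSE3 and NV21ToARGBRow_SSSE3 above read semi-planar chroma: both formats interleave U and V in one plane, one pair per two pixels, with NV12 ordered U,V and NV21 ordered V,U (hence the kShuffleNV21 swap in READNV21). A scalar sketch with illustrative names; the UYVY converter follows below:

#include <stdint.h>

static void ReadChromaNV12(const uint8_t* uv_buf, int x, uint8_t* u, uint8_t* v) {
  *u = uv_buf[(x / 2) * 2 + 0];
  *v = uv_buf[(x / 2) * 2 + 1];
}

static void ReadChromaNV21(const uint8_t* vu_buf, int x, uint8_t* u, uint8_t* v) {
  *v = vu_buf[(x / 2) * 2 + 0];
  *u = vu_buf[(x / 2) * 2 + 1];
}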
-__declspec(naked) void UYVYToARGBRow_SSSE3( - const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READUYVY - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - ret - } -} - -__declspec(naked) void I422ToRGBARow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGBA - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} -#endif // HAS_I422TOARGBROW_SSSE3 - -// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter -#ifdef HAS_I400TOARGBROW_SSE2 -// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). -__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* rgb_buf, - const struct YuvConstants*, - int width) { - __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) - movd xmm2, eax - pshufd xmm2, xmm2,0 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) - movd xmm3, eax - pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width - - convertloop: - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 // Y.Y - pmulhuw xmm0, xmm2 - psubusw xmm0, xmm3 - psrlw xmm0, 6 - packuswb xmm0, xmm0 // G - - // Step 2: Weave into ARGB - punpcklbw xmm0, xmm0 // GG - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 // BGRA first 4 pixels - punpckhwd xmm1, xmm1 // BGRA next 4 pixels - por xmm0, xmm4 - por xmm1, xmm4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_I400TOARGBROW_SSE2 - -#ifdef HAS_I400TOARGBROW_AVX2 -// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). -// note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, - uint8_t* rgb_buf, - const struct YuvConstants*, - int width) { - __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) - vmovd xmm2, eax - vbroadcastss ymm2, xmm2 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) - vmovd xmm3, eax - vbroadcastss ymm3, xmm3 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 - vpslld ymm4, ymm4, 24 - - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width - - convertloop: - // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 - vmovdqu xmm0, [eax] - lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates - vpunpcklbw ymm0, ymm0, ymm0 // Y.Y - vpmulhuw ymm0, ymm0, ymm2 - vpsubusw ymm0, ymm0, ymm3 - vpsrlw ymm0, ymm0, 6 - vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 - - // TODO(fbarchard): Weave alpha with unpack. 
- // Step 2: Weave into ARGB - vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates - vpermq ymm1, ymm1, 0xd8 - vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels - vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels - vpor ymm0, ymm0, ymm4 - vpor ymm1, ymm1, ymm4 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_I400TOARGBROW_AVX2 - -#ifdef HAS_MIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -// TODO(fbarchard): Replace lea with -16 offset. -__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - movdqa xmm5, xmmword ptr kShuffleMirror - - convertloop: - movdqu xmm0, [eax - 16 + ecx] - pshufb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} -#endif // HAS_MIRRORROW_SSSE3 - -#ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vbroadcastf128 ymm5, xmmword ptr kShuffleMirror - - convertloop: - vmovdqu ymm0, [eax - 32 + ecx] - vpshufb ymm0, ymm0, ymm5 - vpermq ymm0, ymm0, 0x4e // swap high and low halfs - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_MIRRORROW_AVX2 - -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 -// Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; - -__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - movdqa xmm1, xmmword ptr kShuffleMirrorUV - lea eax, [eax + ecx * 2 - 16] - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - lea eax, [eax - 16] - pshufb xmm0, xmm1 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [edx + edi], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_MIRRORSPLITUVROW_SSSE3 - -#ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - lea eax, [eax - 16 + ecx * 4] // last 4 pixels. - - convertloop: - movdqu xmm0, [eax] - lea eax, [eax - 16] - pshufd xmm0, xmm0, 0x1b - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - ret - } -} -#endif // HAS_ARGBMIRRORROW_SSE2 - -#ifdef HAS_ARGBMIRRORROW_AVX2 -// Shuffle table for reversing the bytes. 
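The mirror rows above walk the source from its tail ([eax - 16 + ecx]) and reverse each block with a byte shuffle; the dword shuffle table for the AVX2 ARGB variant follows below. A scalar sketch of the byte mirror:

#include <stdint.h>

static void MirrorRow_C_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];  /* reverse byte order within the row */
  }
}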
-static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 - - convertloop: - vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBMIRRORROW_AVX2 - -#ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqu [edx], xmm0 - movdqu [edx + edi], xmm2 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -#endif // HAS_SPLITUVROW_SSE2 - -#ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm2, ymm0, 8 // odd bytes - vpsrlw ymm3, ymm1, 8 - vpand ymm0, ymm0, ymm5 // even bytes - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpackuswb ymm2, ymm2, ymm3 - vpermq ymm0, ymm0, 0xd8 - vpermq ymm2, ymm2, 0xd8 - vmovdqu [edx], ymm0 - vmovdqu [edx + edi], ymm2 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_SPLITUVROW_AVX2 - -#ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // read 16 U's - movdqu xmm1, [eax + edx] // and 16 V's - lea eax, [eax + 16] - movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs - movdqu [edi], xmm0 - movdqu [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_MERGEUVROW_SSE2 - -#ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - convertloop: - vpmovzxbw ymm0, [eax] - vpmovzxbw ymm1, [eax + edx] - lea eax, [eax + 16] - vpsllw ymm1, ymm1, 8 - vpor ymm2, ymm1, ymm0 - vmovdqu [edi], ymm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_MERGEUVROW_AVX2 - 
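Scalar sketches of the plane (de)interleaving performed by SplitUVRow_* and MergeUVRow_* above: SplitUV separates even and odd bytes of an interleaved UV plane, MergeUV weaves two planes back together. Function names are illustrative only.

#include <stdint.h>

static void SplitUVRow_C_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                                uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  /* even bytes are U */
    dst_v[x] = src_uv[2 * x + 1];  /* odd bytes are V */
  }
}

static void MergeUVRow_C_Sketch(const uint8_t* src_u, const uint8_t* src_v,
                                uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}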
-#ifdef HAS_COPYROW_SSE2 -// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time. -__declspec(naked) void CopyRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - test eax, 15 - jne convertloopu - test edx, 15 - jne convertloopu - - convertloopa: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloopa - ret - - convertloopu: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloopu - ret - } -} -#endif // HAS_COPYROW_SSE2 - -#ifdef HAS_COPYROW_AVX -// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) void CopyRow_AVX(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 64 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_COPYROW_AVX - -// Multiple of 1. -__declspec(naked) void CopyRow_ERMS(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, esi - mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // width - rep movsb - mov edi, edx - mov esi, eax - ret - } -} - -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 - pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff - psrld xmm1, 8 - - convertloop: - movdqu xmm2, [eax] - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - movdqu xmm4, [edx] - movdqu xmm5, [edx + 16] - pand xmm2, xmm0 - pand xmm3, xmm0 - pand xmm4, xmm1 - pand xmm5, xmm1 - por xmm2, xmm4 - por xmm3, xmm5 - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - ret - } -} -#endif // HAS_ARGBCOPYALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - - convertloop: - vmovdqu ymm1, [eax] - vmovdqu ymm2, [eax + 32] - lea eax, [eax + 64] - vpblendvb ymm1, ymm1, [edx], ymm0 - vpblendvb ymm2, ymm2, [edx + 32], ymm0 - vmovdqu [edx], ymm1 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBCOPYALPHAROW_AVX2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a - mov ecx, [esp + 12] // width - - extractloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrld xmm0, 24 - psrld xmm1, 24 - packssdw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg extractloop - - 
ret - } -} -#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a - mov ecx, [esp + 12] // width - vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX - - extractloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpsrld ymm0, ymm0, 24 - vpsrld ymm1, ymm1, 24 - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - lea eax, [eax + 128] - vpackssdw ymm0, ymm0, ymm1 // mutates - vpsrld ymm2, ymm2, 24 - vpsrld ymm3, ymm3, 24 - vpackssdw ymm2, ymm2, ymm3 // mutates - vpackuswb ymm0, ymm0, ymm2 // mutates - vpermd ymm0, ymm4, ymm0 // unmutate - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg extractloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 - pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff - psrld xmm1, 8 - - convertloop: - movq xmm2, qword ptr [eax] // 8 Y's - lea eax, [eax + 8] - punpcklbw xmm2, xmm2 - punpckhwd xmm3, xmm2 - punpcklwd xmm2, xmm2 - movdqu xmm4, [edx] - movdqu xmm5, [edx + 16] - pand xmm2, xmm0 - pand xmm3, xmm0 - pand xmm4, xmm1 - pand xmm5, xmm1 - por xmm2, xmm4 - por xmm3, xmm5 - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - ret - } -} -#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - - convertloop: - vpmovzxbd ymm1, qword ptr [eax] - vpmovzxbd ymm2, qword ptr [eax + 8] - lea eax, [eax + 16] - vpslld ymm1, ymm1, 24 - vpslld ymm2, ymm2, 24 - vpblendvb ymm1, ymm1, [edx], ymm0 - vpblendvb ymm2, ymm2, [edx + 32], ymm0 - vmovdqu [edx], ymm1 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 - -#ifdef HAS_SETROW_X86 -// Write 'width' bytes using an 8 bit value repeated. -// width should be multiple of 4. -__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { - __asm { - movzx eax, byte ptr [esp + 8] // v8 - mov edx, 0x01010101 // Duplicate byte to all bytes. - mul edx // overwrites edx with upper part of result. - mov edx, edi - mov edi, [esp + 4] // dst - mov ecx, [esp + 12] // width - shr ecx, 2 - rep stosd - mov edi, edx - ret - } -} - -// Write 'width' bytes using an 8 bit value repeated. -__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { - __asm { - mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v8 - mov ecx, [esp + 12] // width - rep stosb - mov edi, edx - ret - } -} - -// Write 'width' 32 bit values. 
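SetRow_X86 above splats the 8-bit fill value across a dword by multiplying with 0x01010101 and then stores it with rep stosd; ARGBSetRow_X86, which follows, stores a caller-supplied 32-bit value the same way. A scalar sketch of the splat-and-fill, assuming (as the original comment notes) a width that is a multiple of 4:

#include <stdint.h>
#include <string.h>

static void SetRow_C_Sketch(uint8_t* dst, uint8_t v8, int width) {
  uint32_t v32 = (uint32_t)v8 * 0x01010101u;  /* duplicate byte to all 4 lanes */
  for (int x = 0; x < width; x += 4) {
    memcpy(dst + x, &v32, 4);
  }
}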
-__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, - uint32_t v32, - int width) { - __asm { - mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 - mov ecx, [esp + 12] // width - rep stosd - mov edi, edx - ret - } -} -#endif // HAS_SETROW_X86 - -#ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // even bytes are Y - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} - -__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. - vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} - -__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. - vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // odd bytes are Y - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. 
- vpermq ymm0, ymm0, 0xd8 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. - vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. 
- vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_YUY2TOYROW_AVX2 - -#ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, 
xmm2 - pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_YUY2TOYROW_SSE2 - -#ifdef HAS_BLENDPLANEROW_SSSE3 -// Blend 8 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - __asm { - push esi - push edi - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - mov eax, 0x80808080 // 128 for biasing image to signed. - movd xmm6, eax - pshufd xmm6, xmm6, 0x00 - - mov eax, 0x807f807f // 32768 + 127 for unbias and round. - movd xmm7, eax - pshufd xmm7, xmm7, 0x00 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 - mov esi, [esp + 8 + 12] // alpha - mov edi, [esp + 8 + 16] // dst - mov ecx, [esp + 8 + 20] // width - sub eax, esi - sub edx, esi - sub edi, esi - - // 8 pixel loop. - convertloop8: - movq xmm0, qword ptr [esi] // alpha - punpcklbw xmm0, xmm0 - pxor xmm0, xmm5 // a, 255-a - movq xmm1, qword ptr [eax + esi] // src0 - movq xmm2, qword ptr [edx + esi] // src1 - punpcklbw xmm1, xmm2 - psubb xmm1, xmm6 // bias src0/1 - 128 - pmaddubsw xmm0, xmm1 - paddw xmm0, xmm7 // unbias result - 32768 and round. - psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edi + esi], xmm0 - lea esi, [esi + 8] - sub ecx, 8 - jg convertloop8 - - pop edi - pop esi - ret - } -} -#endif // HAS_BLENDPLANEROW_SSSE3 - -#ifdef HAS_BLENDPLANEROW_AVX2 -// Blend 32 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - __asm { - push esi - push edi - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 - vpsllw ymm5, ymm5, 8 - mov eax, 0x80808080 // 128 for biasing image to signed. - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - mov eax, 0x807f807f // 32768 + 127 for unbias and round. - vmovd xmm7, eax - vbroadcastss ymm7, xmm7 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 - mov esi, [esp + 8 + 12] // alpha - mov edi, [esp + 8 + 16] // dst - mov ecx, [esp + 8 + 20] // width - sub eax, esi - sub edx, esi - sub edi, esi - - // 32 pixel loop. 
- convertloop32: - vmovdqu ymm0, [esi] // alpha - vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 - vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 - vpxor ymm3, ymm3, ymm5 // a, 255-a - vpxor ymm0, ymm0, ymm5 // a, 255-a - vmovdqu ymm1, [eax + esi] // src0 - vmovdqu ymm2, [edx + esi] // src1 - vpunpckhbw ymm4, ymm1, ymm2 - vpunpcklbw ymm1, ymm1, ymm2 - vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 - vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 - vpmaddubsw ymm3, ymm3, ymm4 - vpmaddubsw ymm0, ymm0, ymm1 - vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. - vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. - vpsrlw ymm3, ymm3, 8 - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm3 - vmovdqu [edi + esi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg convertloop32 - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_BLENDPLANEROW_AVX2 - -#ifdef HAS_ARGBBLENDROW_SSSE3 -// Shuffle table for isolating alpha. -static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; - -// Blend 8 pixels at a time. -__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 0x0001 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - sub ecx, 4 - jl convertloop4b // less than 4 pixels? - - // 4 pixel loop. - convertloop4: - movdqu xmm3, [eax] // src argb - lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g - lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop4b: - add ecx, 4 - 1 - jl convertloop1b - - // 1 pixel loop. - convertloop1: - movd xmm3, [eax] // src argb - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g - lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 - - convertloop1b: - pop esi - ret - } -} -#endif // HAS_ARGBBLENDROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -// Shuffle table duplicating alpha. 
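The plane and ARGB blends above use the fixed-point formula spelled out in their comments, dst = (src0 * a + src1 * (255 - a) + 255) >> 8. A scalar sketch of BlendPlaneRow; the alpha-duplication tables for the attenuate rows follow below.

#include <stdint.h>

static void BlendPlaneRow_C_Sketch(const uint8_t* src0, const uint8_t* src1,
                                   const uint8_t* alpha, uint8_t* dst,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    int a = alpha[x];
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}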
-static const uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, -}; -static const uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, -}; -__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0xff000000 - pslld xmm3, 24 - movdqa xmm4, xmmword ptr kShuffleAlpha0 - movdqa xmm5, xmmword ptr kShuffleAlpha1 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - pshufb xmm0, xmm4 // isolate first 2 alphas - movdqu xmm1, [eax] // read 4 pixels - punpcklbw xmm1, xmm1 // first 2 pixel rgbs - pmulhuw xmm0, xmm1 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - pshufb xmm1, xmm5 // isolate next 2 alphas - movdqu xmm2, [eax] // read 4 pixels - punpckhbw xmm2, xmm2 // next 2 pixel rgbs - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // mask original alpha - lea eax, [eax + 16] - pand xmm2, xmm3 - psrlw xmm0, 8 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - por xmm0, xmm2 // copy original alpha - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBATTENUATEROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; -__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 - vpslld ymm5, ymm5, 24 - - convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpshufb ymm2, ymm0, ymm4 // low 4 alphas - vpshufb ymm3, ymm1, ymm4 // high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * a - vpmulhuw ymm1, ymm1, ymm3 // rgb * a - vpand ymm6, ymm6, ymm5 // isolate alpha - vpsrlw ymm0, ymm0, 8 - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutated. - vpor ymm0, ymm0, ymm6 // copy original alpha - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -// Unattenuate 4 pixels at a time. -__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb - mov ecx, [esp + 12 + 12] // width - lea ebx, fixed_invtbl8 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - movzx esi, byte ptr [eax + 3] // first alpha - movzx edi, byte ptr [eax + 7] // second alpha - punpcklbw xmm0, xmm0 // first 2 - movd xmm2, dword ptr [ebx + esi * 4] - movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 
1, a, a, a - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words - movlhps xmm2, xmm3 - pmulhuw xmm0, xmm2 // rgb * a - - movdqu xmm1, [eax] // read 4 pixels - movzx esi, byte ptr [eax + 11] // third alpha - movzx edi, byte ptr [eax + 15] // forth alpha - punpckhbw xmm1, xmm1 // next 2 - movd xmm2, dword ptr [ebx + esi * 4] - movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words - movlhps xmm2, xmm3 - pmulhuw xmm1, xmm2 // rgb * a - lea eax, [eax + 16] - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop edi - pop esi - pop ebx - ret - } -} -#endif // HAS_ARGBUNATTENUATEROW_SSE2 - -#ifdef HAS_ARGBUNATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; -// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. -// USE_GATHER is not on by default, due to being a slow instruction. -#ifdef USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 - - convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. - vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a - vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a - vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. - vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a - vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * ia - vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. 
- vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - vzeroupper - ret - } -} -#else // USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb - mov ecx, [esp + 12 + 12] // width - sub edx, eax - lea ebx, fixed_invtbl8 - vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 - - convertloop: - // replace VPGATHER - movzx esi, byte ptr [eax + 3] // alpha0 - movzx edi, byte ptr [eax + 7] // alpha1 - vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] - vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] - movzx esi, byte ptr [eax + 11] // alpha2 - movzx edi, byte ptr [eax + 15] // alpha3 - vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] - vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] - vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] - movzx esi, byte ptr [eax + 19] // alpha4 - movzx edi, byte ptr [eax + 23] // alpha5 - vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] - vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] - vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] - movzx esi, byte ptr [eax + 27] // alpha6 - movzx edi, byte ptr [eax + 31] // alpha7 - vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] - vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] - vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] - vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] - vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] - vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] - vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] - // end of VPGATHER - - vmovdqu ymm6, [eax] // read 8 pixels. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a - vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. - vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a - vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * ia - vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - pop ebx - vzeroupper - ret - } -} -#endif // USE_GATHER -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBGRAYROW_SSSE3 -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToYJ - movdqa xmm5, xmmword ptr kAddYJ64 - - convertloop: - movdqu xmm0, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // Add .5 for rounding. 
- psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 G bytes - movdqu xmm2, [eax] // A - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - psrld xmm2, 24 - psrld xmm3, 24 - packuswb xmm2, xmm3 - packuswb xmm2, xmm2 // 8 A bytes - movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA - punpcklbw xmm0, xmm0 // 8 GG words - punpcklbw xmm3, xmm2 // 8 GA words - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm3 // GGGA first 4 - punpckhwd xmm1, xmm3 // GGGA next 4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBGRAYROW_SSSE3 - -#ifdef HAS_ARGBSEPIAROW_SSSE3 -// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 -// Constant for ARGB color to sepia tone. -static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, - 17, 68, 35, 0, 17, 68, 35, 0}; - -static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, - 22, 88, 45, 0, 22, 88, 45, 0}; - -static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, - 24, 98, 50, 0, 24, 98, 50, 0}; - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - mov ecx, [esp + 8] /* width */ - movdqa xmm2, xmmword ptr kARGBToSepiaB - movdqa xmm3, xmmword ptr kARGBToSepiaG - movdqa xmm4, xmmword ptr kARGBToSepiaR - - convertloop: - movdqu xmm0, [eax] // B - movdqu xmm6, [eax + 16] - pmaddubsw xmm0, xmm2 - pmaddubsw xmm6, xmm2 - phaddw xmm0, xmm6 - psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values - movdqu xmm5, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm5, xmm3 - pmaddubsw xmm1, xmm3 - phaddw xmm5, xmm1 - psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values - movdqu xmm5, [eax] // R - movdqu xmm1, [eax + 16] - pmaddubsw xmm5, xmm4 - pmaddubsw xmm1, xmm4 - phaddw xmm5, xmm1 - psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values - movdqu xmm6, [eax] // A - movdqu xmm1, [eax + 16] - psrld xmm6, 24 - psrld xmm1, 24 - packuswb xmm6, xmm1 - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm5, xmm6 // 8 RA values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 - movdqu [eax], xmm0 - movdqu [eax + 16], xmm1 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBSEPIAROW_SSSE3 - -#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R -// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
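A scalar sketch of the sepia mapping used by ARGBSepiaRow_SSSE3 above, with the same coefficients and the saturation that packuswb provides; ARGBColorMatrixRow_SSSE3, which follows, applies the same pattern with a caller-supplied matrix.

#include <stdint.h>

static void SepiaPixel(uint8_t* b, uint8_t* g, uint8_t* r) {
  int tb = *b, tg = *g, tr = *r;
  int nb = (tr * 35 + tg * 68 + tb * 17) >> 7;
  int ng = (tr * 45 + tg * 88 + tb * 22) >> 7;
  int nr = (tr * 50 + tg * 98 + tb * 24) >> 7;
  *b = (uint8_t)(nb > 255 ? 255 : nb);
  *g = (uint8_t)(ng > 255 ? 255 : ng);
  *r = (uint8_t)(nr > 255 ? 255 : nr);
}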
-__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* matrix_argb */ - movdqu xmm5, [ecx] - pshufd xmm2, xmm5, 0x00 - pshufd xmm3, xmm5, 0x55 - pshufd xmm4, xmm5, 0xaa - pshufd xmm5, xmm5, 0xff - mov ecx, [esp + 16] /* width */ - - convertloop: - movdqu xmm0, [eax] // B - movdqu xmm7, [eax + 16] - pmaddubsw xmm0, xmm2 - pmaddubsw xmm7, xmm2 - movdqu xmm6, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm6, xmm3 - pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values - movdqu xmm1, [eax] // R - movdqu xmm7, [eax + 16] - pmaddubsw xmm1, xmm4 - pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R - movdqu xmm6, [eax] // A - movdqu xmm7, [eax + 16] - pmaddubsw xmm6, xmm5 - pmaddubsw xmm7, xmm5 - phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A - packuswb xmm1, xmm1 // 8 R values - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm1, xmm6 // 8 RA values - movdqa xmm6, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm1 // BGRA first 4 - punpckhwd xmm6, xmm1 // BGRA next 4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm6 - lea eax, [eax + 32] - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 - -#ifdef HAS_ARGBQUANTIZEROW_SSE2 -// Quantize 4 ARGB pixels (16 bytes). -__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - movd xmm2, [esp + 8] /* scale */ - movd xmm3, [esp + 12] /* interval_size */ - movd xmm4, [esp + 16] /* interval_offset */ - mov ecx, [esp + 20] /* width */ - pshuflw xmm2, xmm2, 040h - pshufd xmm2, xmm2, 044h - pshuflw xmm3, xmm3, 040h - pshufd xmm3, xmm3, 044h - pshuflw xmm4, xmm4, 040h - pshufd xmm4, xmm4, 044h - pxor xmm5, xmm5 // constant 0 - pcmpeqb xmm6, xmm6 // generate mask 0xff000000 - pslld xmm6, 24 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 - movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels - pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size - movdqu xmm7, [eax] // read 4 pixels - pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 - paddw xmm1, xmm4 - packuswb xmm0, xmm1 - por xmm0, xmm7 - movdqu [eax], xmm0 - lea eax, [eax + 16] - sub ecx, 4 - jg convertloop - ret - } -} -#endif // HAS_ARGBQUANTIZEROW_SSE2 - -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. 
-__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - movd xmm2, [esp + 16] // value - punpcklbw xmm2, xmm2 - punpcklqdq xmm2, xmm2 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value - psrlw xmm0, 8 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBSHADEROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pxor xmm5, xmm5 // constant 0 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb - movdqu xmm2, [esi] // read 4 pixels from src_argb1 - movdqu xmm1, xmm0 - movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 - lea eax, [eax + 16] - lea esi, [esi + 16] - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_ARGBMULTIPLYROW_SSE2 - -#ifdef HAS_ARGBADDROW_SSE2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -// TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - sub ecx, 4 - jl convertloop49 - - convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb - lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 - lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb + src_argb1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop49: - add ecx, 4 - 1 - jl convertloop19 - - convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb - lea eax, [eax + 4] - movd xmm1, [esi] // read 1 pixels from src_argb1 - lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb + src_argb1 - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 - - convertloop19: - pop esi - ret - } -} -#endif // HAS_ARGBADDROW_SSE2 - -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb - lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 - lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb - src_argb1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_ARGBSUBTRACTROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_AVX2 -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 - - convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 - lea esi, [esi + 32] - vpunpcklbw ymm0, ymm1, ymm1 // low 4 - vpunpckhbw ymm1, ymm1, ymm1 // high 4 - vpunpcklbw ymm2, ymm3, ymm5 // low 4 - vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 - vpackuswb ymm0, ymm0, ymm1 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBMULTIPLYROW_AVX2 - -#ifdef HAS_ARGBADDROW_AVX2 -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 - lea esi, [esi + 32] - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBADDROW_AVX2 - -#ifdef HAS_ARGBSUBTRACTROW_AVX2 -// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1 - lea esi, [esi + 32] - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBSUBTRACTROW_AVX2 - -#ifdef HAS_SOBELXROW_SSE2 -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 - mov edi, [esp + 8 + 12] // src_y2 - mov edx, [esp + 8 + 16] // dst_sobelx - mov ecx, [esp + 8 + 20] // width - sub esi, eax - sub edi, eax - sub edx, eax - pxor xmm5, xmm5 // constant 0 - - convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] - movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] - movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - psubw xmm2, xmm3 - paddw xmm0, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw - psubw xmm1, xmm0 - pmaxsw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [eax + edx], xmm0 - lea eax, [eax + 8] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_SOBELXROW_SSE2 - -#ifdef HAS_SOBELYROW_SSE2 -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 - mov edx, [esp + 4 + 12] // dst_sobely - mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - pxor xmm5, xmm5 // constant 0 - - convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] - movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] - movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - psubw xmm2, xmm3 - paddw xmm0, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw - psubw xmm1, xmm0 - pmaxsw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [eax + edx], xmm0 - lea eax, [eax + 8] - sub ecx, 8 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELYROW_SSE2 - -#ifdef HAS_SOBELROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - pslld xmm5, 24 // 0xff000000 - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqa xmm2, xmm0 // GG - punpcklbw xmm2, xmm0 // First 8 - punpckhbw xmm0, xmm0 // Next 8 - movdqa xmm1, xmm2 // GGGG - punpcklwd xmm1, xmm2 // First 4 - punpckhwd xmm2, xmm2 // Next 4 - por xmm1, xmm5 // GGGA - por xmm2, xmm5 - movdqa xmm3, xmm0 // GGGG - punpcklwd xmm3, xmm0 // Next 4 - punpckhwd xmm0, xmm0 // Last 4 - por xmm3, xmm5 // GGGA - por xmm0, xmm5 - movdqu [edx], xmm1 - movdqu [edx + 16], xmm2 - movdqu [edx + 32], xmm3 - movdqu [edx + 48], xmm0 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELROW_SSE2 - -#ifdef HAS_SOBELTOPLANEROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into a plane. -__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELTOPLANEROW_SSE2 - -#ifdef HAS_SOBELXYROW_SSE2 -// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
-// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - movdqa xmm2, xmm0 - paddusb xmm2, xmm1 // sobel = sobelx + sobely - movdqa xmm3, xmm0 // XA - punpcklbw xmm3, xmm5 - punpckhbw xmm0, xmm5 - movdqa xmm4, xmm1 // YS - punpcklbw xmm4, xmm2 - punpckhbw xmm1, xmm2 - movdqa xmm6, xmm4 // YSXA - punpcklwd xmm6, xmm3 // First 4 - punpckhwd xmm4, xmm3 // Next 4 - movdqa xmm7, xmm1 // YSXA - punpcklwd xmm7, xmm0 // Next 4 - punpckhwd xmm1, xmm0 // Last 4 - movdqu [edx], xmm6 - movdqu [edx + 16], xmm4 - movdqu [edx + 32], xmm7 - movdqu [edx + 48], xmm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELXYROW_SSE2 - -#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -// Consider float CumulativeSum. -// Consider calling CumulativeSum one row at time as needed. -// Consider circular CumulativeSum buffer of radius * 2 + 1 height. -// Convert cumulative sum for an area to an average for 1 pixel. -// topleft is pointer to top left of CumulativeSum buffer for area. -// botleft is pointer to bottom left of CumulativeSum buffer. -// width is offset from left to right of area in CumulativeSum buffer measured -// in number of ints. -// area is the number of pixels in the area being averaged. -// dst points to pixel to store result to. -// count is number of averaged pixels to produce. -// Does 4 pixels at a time. -// This function requires alignment on accumulation buffer pointers. -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count) { - __asm { - mov eax, topleft // eax topleft - mov esi, botleft // esi botleft - mov edx, width - movd xmm5, area - mov edi, dst - mov ecx, count - cvtdq2ps xmm5, xmm5 - rcpss xmm4, xmm5 // 1.0f / area - pshufd xmm4, xmm4, 0 - sub ecx, 4 - jl l4b - - cmp area, 128 // 128 pixels will not overflow 15 bits. - ja l4 - - pshufd xmm5, xmm5, 0 // area - pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 - psrld xmm6, 16 - cvtdq2ps xmm6, xmm6 - addps xmm5, xmm6 // (65536.0 + area - 1) - mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area - cvtps2dq xmm5, xmm5 // 0.16 fixed point - packssdw xmm5, xmm5 // 16 bit shorts - - // 4 pixel loop small blocks. 
- s4: - // top left - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - - // - top right - psubd xmm0, [eax + edx * 4] - psubd xmm1, [eax + edx * 4 + 16] - psubd xmm2, [eax + edx * 4 + 32] - psubd xmm3, [eax + edx * 4 + 48] - lea eax, [eax + 64] - - // - bottom left - psubd xmm0, [esi] - psubd xmm1, [esi + 16] - psubd xmm2, [esi + 32] - psubd xmm3, [esi + 48] - - // + bottom right - paddd xmm0, [esi + edx * 4] - paddd xmm1, [esi + edx * 4 + 16] - paddd xmm2, [esi + edx * 4 + 32] - paddd xmm3, [esi + edx * 4 + 48] - lea esi, [esi + 64] - - packssdw xmm0, xmm1 // pack 4 pixels into 2 registers - packssdw xmm2, xmm3 - - pmulhuw xmm0, xmm5 - pmulhuw xmm2, xmm5 - - packuswb xmm0, xmm2 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 - jge s4 - - jmp l4b - - // 4 pixel loop - l4: - // top left - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - - // - top right - psubd xmm0, [eax + edx * 4] - psubd xmm1, [eax + edx * 4 + 16] - psubd xmm2, [eax + edx * 4 + 32] - psubd xmm3, [eax + edx * 4 + 48] - lea eax, [eax + 64] - - // - bottom left - psubd xmm0, [esi] - psubd xmm1, [esi + 16] - psubd xmm2, [esi + 32] - psubd xmm3, [esi + 48] - - // + bottom right - paddd xmm0, [esi + edx * 4] - paddd xmm1, [esi + edx * 4 + 16] - paddd xmm2, [esi + edx * 4 + 32] - paddd xmm3, [esi + edx * 4 + 48] - lea esi, [esi + 64] - - cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area - cvtdq2ps xmm1, xmm1 - mulps xmm0, xmm4 - mulps xmm1, xmm4 - cvtdq2ps xmm2, xmm2 - cvtdq2ps xmm3, xmm3 - mulps xmm2, xmm4 - mulps xmm3, xmm4 - cvtps2dq xmm0, xmm0 - cvtps2dq xmm1, xmm1 - cvtps2dq xmm2, xmm2 - cvtps2dq xmm3, xmm3 - packssdw xmm0, xmm1 - packssdw xmm2, xmm3 - packuswb xmm0, xmm2 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - movdqu xmm0, [eax] - psubd xmm0, [eax + edx * 4] - lea eax, [eax + 16] - psubd xmm0, [esi] - paddd xmm0, [esi + edx * 4] - lea esi, [esi + 16] - cvtdq2ps xmm0, xmm0 - mulps xmm0, xmm4 - cvtps2dq xmm0, xmm0 - packssdw xmm0, xmm0 - packuswb xmm0, xmm0 - movd dword ptr [edi], xmm0 - lea edi, [edi + 4] - sub ecx, 1 - jge l1 - l1b: - } -} -#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 - -#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 -// Creates a table of cumulative sums where each value is a sum of all values -// above and to the left of the value. -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - __asm { - mov eax, row - mov edx, cumsum - mov esi, previous_cumsum - mov ecx, width - pxor xmm0, xmm0 - pxor xmm1, xmm1 - - sub ecx, 4 - jl l4b - test edx, 15 - jne l4b - - // 4 pixel loop - l4: - movdqu xmm2, [eax] // 4 argb pixels 16 bytes. - lea eax, [eax + 16] - movdqa xmm4, xmm2 - - punpcklbw xmm2, xmm1 - movdqa xmm3, xmm2 - punpcklwd xmm2, xmm1 - punpckhwd xmm3, xmm1 - - punpckhbw xmm4, xmm1 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm1 - punpckhwd xmm5, xmm1 - - paddd xmm0, xmm2 - movdqu xmm2, [esi] // previous row above. 
- paddd xmm2, xmm0 - - paddd xmm0, xmm3 - movdqu xmm3, [esi + 16] - paddd xmm3, xmm0 - - paddd xmm0, xmm4 - movdqu xmm4, [esi + 32] - paddd xmm4, xmm0 - - paddd xmm0, xmm5 - movdqu xmm5, [esi + 48] - lea esi, [esi + 64] - paddd xmm5, xmm0 - - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - movdqu [edx + 32], xmm4 - movdqu [edx + 48], xmm5 - - lea edx, [edx + 64] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - movd xmm2, dword ptr [eax] // 1 argb pixel - lea eax, [eax + 4] - punpcklbw xmm2, xmm1 - punpcklwd xmm2, xmm1 - paddd xmm0, xmm2 - movdqu xmm2, [esi] - lea esi, [esi + 16] - paddd xmm2, xmm0 - movdqu [edx], xmm2 - lea edx, [edx + 16] - sub ecx, 1 - jge l1 - - l1b: - } -} -#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 - -#ifdef HAS_ARGBAFFINEROW_SSE2 -// Copy ARGB pixels from source image with slope to a row of destination. -__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 12] // src_argb - mov esi, [esp + 16] // stride - mov edx, [esp + 20] // dst_argb - mov ecx, [esp + 24] // pointer to uv_dudv - movq xmm2, qword ptr [ecx] // uv - movq xmm7, qword ptr [ecx + 8] // dudv - mov ecx, [esp + 28] // width - shl esi, 16 // 4, stride - add esi, 4 - movd xmm5, esi - sub ecx, 4 - jl l4b - - // setup for 4 pixel loop - pshufd xmm7, xmm7, 0x44 // dup dudv - pshufd xmm5, xmm5, 0 // dup 4, stride - movdqa xmm0, xmm2 // x0, y0, x1, y1 - addps xmm0, xmm7 - movlhps xmm2, xmm0 - movdqa xmm4, xmm7 - addps xmm4, xmm4 // dudv *= 2 - movdqa xmm3, xmm2 // x2, y2, x3, y3 - addps xmm3, xmm4 - addps xmm4, xmm4 // dudv *= 4 - - // 4 pixel loop - l4: - cvttps2dq xmm0, xmm2 // x, y float to int first 2 - cvttps2dq xmm1, xmm3 // x, y float to int next 2 - packssdw xmm0, xmm1 // x, y as 8 shorts - pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd edi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd xmm1, [eax + esi] // read pixel 0 - movd xmm6, [eax + edi] // read pixel 1 - punpckldq xmm1, xmm6 // combine pixel 0 and 1 - addps xmm2, xmm4 // x, y += dx, dy first 2 - movq qword ptr [edx], xmm1 - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd edi, xmm0 - movd xmm6, [eax + esi] // read pixel 2 - movd xmm0, [eax + edi] // read pixel 3 - punpckldq xmm6, xmm0 // combine pixel 2 and 3 - addps xmm3, xmm4 // x, y += dx, dy next 2 - movq qword ptr 8[edx], xmm6 - lea edx, [edx + 16] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - cvttps2dq xmm0, xmm2 // x, y float to int - packssdw xmm0, xmm0 // x, y as shorts - pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride - addps xmm2, xmm7 // x, y += dx, dy - movd esi, xmm0 - movd xmm0, [eax + esi] // copy a pixel - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge l1 - l1b: - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBAFFINEROW_SSE2 - -#ifdef HAS_INTERPOLATEROW_AVX2 -// Bilinear filter 32x2 -> 32x1 -__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - // Dispatch to specialized filters if applicable. 
- cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. - sub edi, esi - cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. - - vmovd xmm0, eax // high fraction 0..255 - neg eax - add eax, 256 - vmovd xmm5, eax // low fraction 256..1 - vpunpcklbw xmm5, xmm5, xmm0 - vpunpcklwd xmm5, xmm5, xmm5 - vbroadcastss ymm5, xmm5 - - mov eax, 0x80808080 // 128b for bias and rounding. - vmovd xmm4, eax - vbroadcastss ymm4, xmm4 - - xloop: - vmovdqu ymm0, [esi] - vmovdqu ymm2, [esi + edx] - vpunpckhbw ymm1, ymm0, ymm2 // mutates - vpunpcklbw ymm0, ymm0, ymm2 - vpsubb ymm1, ymm1, ymm4 // bias to signed image - vpsubb ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm5, ymm1 - vpmaddubsw ymm0, ymm5, ymm0 - vpaddw ymm1, ymm1, ymm4 // unbias and round - vpaddw ymm0, ymm0, ymm4 - vpsrlw ymm1, ymm1, 8 - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop - jmp xloop99 - - // Blend 50 / 50. - xloop50: - vmovdqu ymm0, [esi] - vpavgb ymm0, ymm0, [esi + edx] - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop50 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - rep movsb - - xloop99: - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_INTERPOLATEROW_AVX2 - -// Bilinear filter 16x2 -> 16x1 -// TODO(fbarchard): Consider allowing 256 using memcpy. -__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 /256. Blend 100 / 0. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. - - movd xmm0, eax // high fraction 0..255 - neg eax - add eax, 256 - movd xmm5, eax // low fraction 255..1 - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - mov eax, 0x80808080 // 128 for biasing image to signed. - movd xmm4, eax - pshufd xmm4, xmm4, 0x00 - - xloop: - movdqu xmm0, [esi] - movdqu xmm2, [esi + edx] - movdqu xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 - psubb xmm1, xmm4 - movdqa xmm2, xmm5 - movdqa xmm3, xmm5 - pmaddubsw xmm2, xmm0 - pmaddubsw xmm3, xmm1 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - psrlw xmm2, 8 - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqu [esi + edi], xmm2 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop - jmp xloop99 - - // Blend 50 / 50. - xloop50: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop50 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - movdqu xmm0, [esi] - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
-__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - movdqu xmm5, [ecx] - mov ecx, [esp + 16] // width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pshufb xmm0, xmm5 - pshufb xmm1, xmm5 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg wloop - ret - } -} - -#ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. - mov ecx, [esp + 16] // width - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpshufb ymm0, ymm0, ymm5 - vpshufb ymm1, ymm1, ymm5 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg wloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBSHUFFLEROW_AVX2 - -// YUY2 - Macro-pixel = 2 image pixels -// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... - -// UYVY - Macro-pixel = 2 image pixels -// U0Y0V0Y1 - -__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 // YUYV - punpckhbw xmm1, xmm2 - movdqu [edi], xmm0 - movdqu [edi + 16], xmm1 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y - movdqa xmm1, xmm2 - lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY - punpckhbw xmm2, xmm0 - movdqu [edi], xmm1 - movdqu [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* src_argb */ - mov edx, [esp + 4 + 8] /* dst_argb */ - mov esi, [esp + 4 + 12] /* poly */ - mov ecx, [esp + 4 + 16] /* width */ - pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - - // 2 pixel loop. 
- convertloop: - // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel - // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel - movq xmm0, qword ptr [eax] // BGRABGRA - lea eax, [eax + 8] - punpcklbw xmm0, xmm3 - movdqa xmm4, xmm0 - punpcklwd xmm0, xmm3 // pixel 0 - punpckhwd xmm4, xmm3 // pixel 1 - cvtdq2ps xmm0, xmm0 // 4 floats - cvtdq2ps xmm4, xmm4 - movdqa xmm1, xmm0 // X - movdqa xmm5, xmm4 - mulps xmm0, [esi + 16] // C1 * X - mulps xmm4, [esi + 16] - addps xmm0, [esi] // result = C0 + C1 * X - addps xmm4, [esi] - movdqa xmm2, xmm1 - movdqa xmm6, xmm5 - mulps xmm2, xmm1 // X * X - mulps xmm6, xmm5 - mulps xmm1, xmm2 // X * X * X - mulps xmm5, xmm6 - mulps xmm2, [esi + 32] // C2 * X * X - mulps xmm6, [esi + 32] - mulps xmm1, [esi + 48] // C3 * X * X * X - mulps xmm5, [esi + 48] - addps xmm0, xmm2 // result += C2 * X * X - addps xmm4, xmm6 - addps xmm0, xmm1 // result += C3 * X * X * X - addps xmm4, xmm5 - cvttps2dq xmm0, xmm0 - cvttps2dq xmm4, xmm4 - packuswb xmm0, xmm4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 2 - jg convertloop - pop esi - ret - } -} -#endif // HAS_ARGBPOLYNOMIALROW_SSE2 - -#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* poly */ - vbroadcastf128 ymm4, [ecx] // C0 - vbroadcastf128 ymm5, [ecx + 16] // C1 - vbroadcastf128 ymm6, [ecx + 32] // C2 - vbroadcastf128 ymm7, [ecx + 48] // C3 - mov ecx, [esp + 16] /* width */ - - // 2 pixel loop. - convertloop: - vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels - lea eax, [eax + 8] - vcvtdq2ps ymm0, ymm0 // X 8 floats - vmulps ymm2, ymm0, ymm0 // X * X - vmulps ymm3, ymm0, ymm7 // C3 * X - vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X - vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X - vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X - vcvttps2dq ymm0, ymm0 - vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 - vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 - vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 - vmovq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 2 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBPOLYNOMIALROW_AVX2 - -#ifdef HAS_HALFFLOATROW_SSE2 -static float kExpBias = 1.9259299444e-34f; -__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - movd xmm4, dword ptr [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - mulss xmm4, kExpBias - pshufd xmm4, xmm4, 0 - pxor xmm5, xmm5 - sub edx, eax - - // 8 pixel loop. 
- convertloop: - movdqu xmm2, xmmword ptr [eax] // 8 shorts - add eax, 16 - movdqa xmm3, xmm2 - punpcklwd xmm2, xmm5 - cvtdq2ps xmm2, xmm2 // convert 8 ints to floats - punpckhwd xmm3, xmm5 - cvtdq2ps xmm3, xmm3 - mulps xmm2, xmm4 - mulps xmm3, xmm4 - psrld xmm2, 13 - psrld xmm3, 13 - packssdw xmm2, xmm3 - movdqu [eax + edx - 16], xmm2 - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_HALFFLOATROW_SSE2 - -#ifdef HAS_HALFFLOATROW_AVX2 -__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - movd xmm4, dword ptr [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - - vmulss xmm4, xmm4, kExpBias - vbroadcastss ymm4, xmm4 - vpxor ymm5, ymm5, ymm5 - sub edx, eax - - // 16 pixel loop. - convertloop: - vmovdqu ymm2, [eax] // 16 shorts - add eax, 32 - vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints - vpunpcklwd ymm2, ymm2, ymm5 - vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats - vcvtdq2ps ymm2, ymm2 - vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. - vmulps ymm2, ymm2, ymm4 - vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate - vpsrld ymm2, ymm2, 13 - vpackssdw ymm2, ymm2, ymm3 - vmovdqu [eax + edx - 32], ymm2 - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_HALFFLOATROW_AVX2 - -#ifdef HAS_HALFFLOATROW_F16C -__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - vbroadcastss ymm4, [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - sub edx, eax - - // 16 pixel loop. - convertloop: - vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints - vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts - add eax, 32 - vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats - vcvtdq2ps ymm3, ymm3 - vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 - vmulps ymm3, ymm3, ymm4 - vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate - vcvtps2ph xmm3, ymm3, 3 - vmovdqu [eax + edx + 32], xmm2 - vmovdqu [eax + edx + 32 + 16], xmm3 - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_HALFFLOATROW_F16C - -#ifdef HAS_ARGBCOLORTABLEROW_X86 -// Tranform ARGB pixels with color table. -__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - // 1 pixel loop. - convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - movzx edx, byte ptr [eax - 4 + 3] - movzx edx, byte ptr [esi + edx * 4 + 3] - mov byte ptr [eax - 4 + 3], dl - dec ecx - jg convertloop - pop esi - ret - } -} -#endif // HAS_ARGBCOLORTABLEROW_X86 - -#ifdef HAS_RGBCOLORTABLEROW_X86 -// Tranform RGB pixels with color table. -__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - // 1 pixel loop. 
- convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - dec ecx - jg convertloop - - pop esi - ret - } -} -#endif // HAS_RGBCOLORTABLEROW_X86 - -#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 -// Tranform RGB pixels with luma table. -__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ - mov ecx, [esp + 8 + 12] /* width */ - movd xmm2, dword ptr [esp + 8 + 16] // luma table - movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff - pshufd xmm2, xmm2, 0 - pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 - psllw xmm4, 8 - pxor xmm5, xmm5 - - // 4 pixel loop. - convertloop: - movdqu xmm0, xmmword ptr [eax] // generate luma ptr - pmaddubsw xmm0, xmm3 - phaddw xmm0, xmm0 - pand xmm0, xmm4 // mask out low bits - punpcklwd xmm0, xmm5 - paddd xmm0, xmm2 // add table base - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi], dl - movzx edx, byte ptr [eax + 1] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 1], dl - movzx edx, byte ptr [eax + 2] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 2], dl - movzx edx, byte ptr [eax + 3] // copy alpha. - mov byte ptr [edi + 3], dl - - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax + 4] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 4], dl - movzx edx, byte ptr [eax + 5] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 5], dl - movzx edx, byte ptr [eax + 6] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 6], dl - movzx edx, byte ptr [eax + 7] // copy alpha. - mov byte ptr [edi + 7], dl - - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax + 8] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 8], dl - movzx edx, byte ptr [eax + 9] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 9], dl - movzx edx, byte ptr [eax + 10] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 10], dl - movzx edx, byte ptr [eax + 11] // copy alpha. - mov byte ptr [edi + 11], dl - - movd esi, xmm0 - - movzx edx, byte ptr [eax + 12] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 12], dl - movzx edx, byte ptr [eax + 13] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 13], dl - movzx edx, byte ptr [eax + 14] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 14], dl - movzx edx, byte ptr [eax + 15] // copy alpha. 
- mov byte ptr [edi + 15], dl - - lea eax, [eax + 16] - lea edi, [edi + 16] - sub ecx, 4 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 - -#endif // defined(_M_X64) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) diff --git a/drivers/media/pci/tbscapture2/scale_lsx.c b/drivers/media/pci/tbscapture2/scale_lsx.c deleted file mode 100644 index 60646e88faee..000000000000 --- a/drivers/media/pci/tbscapture2/scale_lsx.c +++ /dev/null @@ -1,739 +0,0 @@ -/* - * Copyright 2022 The LibYuv Project Authors. All rights reserved. - * - * Copyright (c) 2022 Loongson Technology Corporation Limited - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "scale_row.h" - -#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) -#include "loongson_intrinsics.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define LOAD_DATA(_src, _in, _out) \ - { \ - int _tmp1, _tmp2, _tmp3, _tmp4; \ - DUP4_ARG2(__lsx_vpickve2gr_w, _in, 0, _in, 1, _in, 2, _in, 3, _tmp1, \ - _tmp2, _tmp3, _tmp4); \ - _out = __lsx_vinsgr2vr_w(_out, _src[_tmp1], 0); \ - _out = __lsx_vinsgr2vr_w(_out, _src[_tmp2], 1); \ - _out = __lsx_vinsgr2vr_w(_out, _src[_tmp3], 2); \ - _out = __lsx_vinsgr2vr_w(_out, _src[_tmp4], 3); \ - } - -void ScaleARGBRowDown2_LSX(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - int len = dst_width / 4; - (void)src_stride; - __m128i src0, src1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - dst0 = __lsx_vpickod_w(src1, src0); - __lsx_vst(dst0, dst_argb, 0); - src_argb += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - int len = dst_width / 4; - (void)src_stride; - __m128i src0, src1, tmp0, tmp1, dst0; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); - tmp0 = __lsx_vpickev_w(src1, src0); - tmp1 = __lsx_vpickod_w(src1, src0); - dst0 = __lsx_vavgr_bu(tmp1, tmp0); - __lsx_vst(dst0, dst_argb, 0); - src_argb += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - int len = dst_width / 4; - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - __m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0; - __m128i reg0, reg1, reg2, reg3; - __m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08}; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1); - DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3); - DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2, - shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, - tmp3, reg0, reg1, reg2, reg3); - DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1); - dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2); - __lsx_vst(dst0, dst_argb, 0); - s += 32; - t += 32; - dst_argb += 16; - } -} - -void 
ScaleARGBRowDownEven_LSX(const uint8_t* src_argb, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - int len = dst_width / 4; - int32_t stepx = src_stepx << 2; - (void)src_stride; - __m128i dst0, dst1, dst2, dst3; - - for (x = 0; x < len; x++) { - dst0 = __lsx_vldrepl_w(src_argb, 0); - src_argb += stepx; - dst1 = __lsx_vldrepl_w(src_argb, 0); - src_argb += stepx; - dst2 = __lsx_vldrepl_w(src_argb, 0); - src_argb += stepx; - dst3 = __lsx_vldrepl_w(src_argb, 0); - src_argb += stepx; - __lsx_vstelm_w(dst0, dst_argb, 0, 0); - __lsx_vstelm_w(dst1, dst_argb, 4, 0); - __lsx_vstelm_w(dst2, dst_argb, 8, 0); - __lsx_vstelm_w(dst3, dst_argb, 12, 0); - dst_argb += 16; - } -} - -void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - int len = dst_width / 4; - int32_t stepx = src_stepx * 4; - const uint8_t* next_argb = src_argb + src_stride; - __m128i src0, src1, src2, src3; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128i reg0, reg1, dst0; - - for (x = 0; x < len; x++) { - tmp0 = __lsx_vldrepl_d(src_argb, 0); - src_argb += stepx; - tmp1 = __lsx_vldrepl_d(src_argb, 0); - src_argb += stepx; - tmp2 = __lsx_vldrepl_d(src_argb, 0); - src_argb += stepx; - tmp3 = __lsx_vldrepl_d(src_argb, 0); - src_argb += stepx; - tmp4 = __lsx_vldrepl_d(next_argb, 0); - next_argb += stepx; - tmp5 = __lsx_vldrepl_d(next_argb, 0); - next_argb += stepx; - tmp6 = __lsx_vldrepl_d(next_argb, 0); - next_argb += stepx; - tmp7 = __lsx_vldrepl_d(next_argb, 0); - next_argb += stepx; - DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, - src0, src1, src2, src3); - DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2); - DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3); - DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1); - DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5); - DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1); - dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2); - dst0 = __lsx_vshuf4i_b(dst0, 0xD8); - __lsx_vst(dst0, dst_argb, 0); - dst_argb += 16; - } -} - -void ScaleRowDown2_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - int len = dst_width / 32; - __m128i src0, src1, src2, src3, dst0, dst1; - (void)src_stride; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, - src0, src1, src2, src3); - DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst0, dst1); - __lsx_vst(dst0, dst, 0); - __lsx_vst(dst1, dst, 16); - src_ptr += 64; - dst += 32; - } -} - -void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - int len = dst_width / 32; - __m128i src0, src1, src2, src3; - __m128i tmp0, tmp1, tmp2, tmp3, dst0, dst1; - (void)src_stride; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, - src0, src1, src2, src3); - DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2); - DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3); - DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp1, tmp2, tmp3, dst0, dst1); - __lsx_vst(dst0, dst, 0); - __lsx_vst(dst1, dst, 16); - src_ptr += 64; - dst += 32; - } -} - -void ScaleRowDown2Box_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - int len = dst_width / 32; - const uint8_t* src_nex = src_ptr 
+ src_stride; - __m128i src0, src1, src2, src3, src4, src5, src6, src7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128i dst0, dst1; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, - src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48, - src4, src5, src6, src7); - DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3, - src7, tmp0, tmp2, tmp4, tmp6); - DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3, - src7, tmp1, tmp3, tmp5, tmp7); - DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, - tmp0, tmp1, tmp2, tmp3); - DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1); - __lsx_vst(dst0, dst, 0); - __lsx_vst(dst1, dst, 16); - src_ptr += 64; - src_nex += 64; - dst += 32; - } -} - -void ScaleRowDown4_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - int len = dst_width / 16; - __m128i src0, src1, src2, src3, tmp0, tmp1, dst0; - (void)src_stride; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, - src0, src1, src2, src3); - DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp1); - dst0 = __lsx_vpickod_b(tmp1, tmp0); - __lsx_vst(dst0, dst, 0); - src_ptr += 64; - dst += 16; - } -} - -void ScaleRowDown4Box_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - int len = dst_width / 16; - const uint8_t* ptr1 = src_ptr + src_stride; - const uint8_t* ptr2 = ptr1 + src_stride; - const uint8_t* ptr3 = ptr2 + src_stride; - __m128i src0, src1, src2, src3, src4, src5, src6, src7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, - src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48, src4, src5, - src6, src7); - DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3, - src7, tmp0, tmp2, tmp4, tmp6); - DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3, - src7, tmp1, tmp3, tmp5, tmp7); - DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, - reg0, reg1, reg2, reg3); - DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48, src0, src1, - src2, src3); - DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48, src4, src5, - src6, src7); - DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3, - src7, tmp0, tmp2, tmp4, tmp6); - DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3, - src7, tmp1, tmp3, tmp5, tmp7); - DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, - reg4, reg5, reg6, reg7); - DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, - reg0, reg1, reg2, reg3); - DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, - reg3, reg0, reg1, reg2, reg3); - DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1); - dst0 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(dst0, dst, 0); - src_ptr += 64; - ptr1 += 64; - ptr2 += 64; - ptr3 += 64; - dst += 16; - } -} - -void ScaleRowDown38_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x, len; - __m128i src0, src1, tmp0; - __m128i shuff = {0x13100E0B08060300, 0x000000001E1B1816}; - - assert(dst_width % 
3 == 0); - len = dst_width / 12; - (void)src_stride; - - for (x = 0; x < len; x++) { - DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); - tmp0 = __lsx_vshuf_b(src1, src0, shuff); - __lsx_vstelm_d(tmp0, dst, 0, 0); - __lsx_vstelm_w(tmp0, dst, 8, 2); - src_ptr += 32; - dst += 12; - } -} - -void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - int x, len; - const uint8_t* src_nex = src_ptr + src_stride; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128i reg0, reg1, reg2, reg3; - __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A}; - __m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA); - __m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000); - - assert((dst_width % 3 == 0) && (dst_width > 0)); - len = dst_width / 12; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16, src0, - src1, src2, src3); - DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2); - DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3); - DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1); - DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3); - tmp4 = __lsx_vpickev_w(reg3, reg2); - tmp5 = __lsx_vadd_h(reg0, reg1); - tmp6 = __lsx_vadd_h(tmp5, tmp4); - tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA); - tmp0 = __lsx_vpickod_w(reg3, reg2); - tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0); - tmp2 = __lsx_vmul_w(tmp1, const_0x4000); - dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff); - __lsx_vstelm_d(dst0, dst_ptr, 0, 0); - __lsx_vstelm_w(dst0, dst_ptr, 8, 2); - src_ptr += 32; - src_nex += 32; - dst_ptr += 12; - } -} - -void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - int x, len; - const uint8_t* ptr1 = src_ptr + src_stride; - const uint8_t* ptr2 = ptr1 + src_stride; - __m128i src0, src1, src2, src3, src4, src5; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128i reg0, reg1, reg2, reg3, dst0; - __m128i zero = __lsx_vldi(0); - __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A}; - __m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71); - __m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA); - - assert((dst_width % 3 == 0) && (dst_width > 0)); - len = dst_width / 12; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16, src0, src1, - src2, src3); - DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5); - DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2); - DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3); - DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6); - DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7); - DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, - tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1); - DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3); - tmp4 = __lsx_vpickev_w(reg3, reg2); - tmp5 = __lsx_vadd_h(reg0, reg1); - tmp6 = __lsx_vadd_h(tmp5, tmp4); - tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71); - tmp0 = __lsx_vpickod_w(reg3, reg2); - tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0); - tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA); - dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff); - __lsx_vstelm_d(dst0, dst_ptr, 0, 0); - __lsx_vstelm_w(dst0, dst_ptr, 8, 2); - src_ptr += 32; - ptr1 += 32; - ptr2 += 32; - dst_ptr += 12; - } -} - -void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* 
dst_ptr, int src_width) { - int x; - int len = src_width / 16; - __m128i src0, tmp0, tmp1, dst0, dst1; - __m128i zero = __lsx_vldi(0); - - assert(src_width > 0); - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_ptr, 0); - DUP2_ARG2(__lsx_vld, dst_ptr, 0, dst_ptr, 16, dst0, dst1); - tmp0 = __lsx_vilvl_b(zero, src0); - tmp1 = __lsx_vilvh_b(zero, src0); - DUP2_ARG2(__lsx_vadd_h, dst0, tmp0, dst1, tmp1, dst0, dst1); - __lsx_vst(dst0, dst_ptr, 0); - __lsx_vst(dst1, dst_ptr, 16); - src_ptr += 16; - dst_ptr += 16; - } -} - -void ScaleFilterCols_LSX(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - int len = dst_width / 16; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - __m128i vec0, vec1, dst0; - __m128i vec_x = __lsx_vreplgr2vr_w(x); - __m128i vec_dx = __lsx_vreplgr2vr_w(dx); - __m128i const1 = __lsx_vreplgr2vr_w(0xFFFF); - __m128i const2 = __lsx_vreplgr2vr_w(0x40); - __m128i const_tmp = {0x0000000100000000, 0x0000000300000002}; - - vec0 = __lsx_vmul_w(vec_dx, const_tmp); - vec1 = __lsx_vslli_w(vec_dx, 2); - vec_x = __lsx_vadd_w(vec_x, vec0); - - for (j = 0; j < len; j++) { - tmp0 = __lsx_vsrai_w(vec_x, 16); - tmp4 = __lsx_vand_v(vec_x, const1); - vec_x = __lsx_vadd_w(vec_x, vec1); - tmp1 = __lsx_vsrai_w(vec_x, 16); - tmp5 = __lsx_vand_v(vec_x, const1); - vec_x = __lsx_vadd_w(vec_x, vec1); - tmp2 = __lsx_vsrai_w(vec_x, 16); - tmp6 = __lsx_vand_v(vec_x, const1); - vec_x = __lsx_vadd_w(vec_x, vec1); - tmp3 = __lsx_vsrai_w(vec_x, 16); - tmp7 = __lsx_vand_v(vec_x, const1); - vec_x = __lsx_vadd_w(vec_x, vec1); - DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9, tmp4, tmp5, - tmp6, tmp7); - LOAD_DATA(src_ptr, tmp0, reg0); - LOAD_DATA(src_ptr, tmp1, reg1); - LOAD_DATA(src_ptr, tmp2, reg2); - LOAD_DATA(src_ptr, tmp3, reg3); - DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1, tmp0, tmp1, - tmp2, tmp3); - LOAD_DATA(src_ptr, tmp0, reg4); - LOAD_DATA(src_ptr, tmp1, reg5); - LOAD_DATA(src_ptr, tmp2, reg6); - LOAD_DATA(src_ptr, tmp3, reg7); - DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3, - reg4, reg5, reg6, reg7); - DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7, - reg4, reg5, reg6, reg7); - DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7, - const2, reg4, reg5, reg6, reg7); - DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7, reg4, reg5, - reg6, reg7); - DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, - reg0, reg1, reg2, reg3); - DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1); - dst0 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(dst0, dst_ptr, 0); - dst_ptr += 16; - } -} - -void ScaleARGBCols_LSX(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)src_argb; - uint32_t* dst = (uint32_t*)dst_argb; - int j; - int len = dst_width / 4; - __m128i tmp0, tmp1, tmp2, dst0; - __m128i vec_x = __lsx_vreplgr2vr_w(x); - __m128i vec_dx = __lsx_vreplgr2vr_w(dx); - __m128i const_tmp = {0x0000000100000000, 0x0000000300000002}; - - tmp0 = __lsx_vmul_w(vec_dx, const_tmp); - tmp1 = __lsx_vslli_w(vec_dx, 2); - vec_x = __lsx_vadd_w(vec_x, tmp0); - - for (j = 0; j < len; j++) { - tmp2 = __lsx_vsrai_w(vec_x, 16); - vec_x = __lsx_vadd_w(vec_x, tmp1); - LOAD_DATA(src, tmp2, dst0); - __lsx_vst(dst0, dst, 0); - dst += 4; - } -} - -void ScaleARGBFilterCols_LSX(uint8_t* dst_argb, - const uint8_t* src_argb, - 
int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)src_argb; - int j; - int len = dst_width / 8; - __m128i src0, src1, src2, src3; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - __m128i vec0, vec1, dst0, dst1; - __m128i vec_x = __lsx_vreplgr2vr_w(x); - __m128i vec_dx = __lsx_vreplgr2vr_w(dx); - __m128i const_tmp = {0x0000000100000000, 0x0000000300000002}; - __m128i const_7f = __lsx_vldi(0x7F); - - vec0 = __lsx_vmul_w(vec_dx, const_tmp); - vec1 = __lsx_vslli_w(vec_dx, 2); - vec_x = __lsx_vadd_w(vec_x, vec0); - - for (j = 0; j < len; j++) { - tmp0 = __lsx_vsrai_w(vec_x, 16); - reg0 = __lsx_vsrai_w(vec_x, 9); - vec_x = __lsx_vadd_w(vec_x, vec1); - tmp1 = __lsx_vsrai_w(vec_x, 16); - reg1 = __lsx_vsrai_w(vec_x, 9); - vec_x = __lsx_vadd_w(vec_x, vec1); - DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1); - DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1); - DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3); - DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6); - DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7); - LOAD_DATA(src, tmp0, src0); - LOAD_DATA(src, tmp1, src1); - DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1); - LOAD_DATA(src, tmp0, src2); - LOAD_DATA(src, tmp1, src3); - DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6); - DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7); - DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7, - tmp0, tmp1, tmp2, tmp3); - DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1); - __lsx_vst(dst0, dst_argb, 0); - __lsx_vst(dst1, dst_argb, 16); - dst_argb += 32; - } -} - -void ScaleRowDown34_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - __m128i src0, src1, src2, src3; - __m128i dst0, dst1, dst2; - __m128i shuff0 = {0x0908070504030100, 0x141311100F0D0C0B}; - __m128i shuff1 = {0x0F0D0C0B09080705, 0x1918171514131110}; - __m128i shuff2 = {0x141311100F0D0C0B, 0x1F1D1C1B19181715}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, - src0, src1, src2, src3); - DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0, src2, src1, shuff1, dst0, - dst1); - dst2 = __lsx_vshuf_b(src3, src2, shuff2); - __lsx_vst(dst0, dst, 0); - __lsx_vst(dst1, dst, 16); - __lsx_vst(dst2, dst, 32); - src_ptr += 64; - dst += 48; - } -} - -void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* src_nex = src_ptr + src_stride; - int x; - __m128i src0, src1, src2, src3, src4, src5, src6, src7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; - __m128i tmp10, tmp11, dst0, dst1, dst2; - __m128i const0 = {0x0103030101010103, 0x0101010303010101}; - __m128i const1 = {0x0301010101030301, 0x0103030101010103}; - __m128i const2 = {0x0101010303010101, 0x0301010101030301}; - __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605}; - __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110}; - __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A}; - __m128i shift0 = {0x0002000200010002, 0x0001000200020001}; - __m128i shift1 = {0x0002000100020002, 0x0002000200010002}; - __m128i shift2 = {0x0001000200020001, 0x0002000100020002}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < 
dst_width; x += 48) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, - src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48, - src4, src5, src6, src7); - DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1, - shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4, - shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7); - DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6, - shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11); - DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3, - const0, src0, src1, src2, src3); - DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7, - const1, src4, src5, src6, src7); - DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11, - const2, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3, - shift0, src0, src1, src2, src3); - DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7, - shift1, src4, src5, src6, src7); - DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3, - shift2, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1, tmp5, tmp6, - tmp7, tmp8); - DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10); - DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8, - src0, src1, src2, src3); - DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5); - DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1, - src0, src1, src2, src3); - DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5); - DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1); - dst2 = __lsx_vsrarni_b_h(src5, src4, 2); - __lsx_vst(dst0, d, 0); - __lsx_vst(dst1, d, 16); - __lsx_vst(dst2, d, 32); - src_ptr += 64; - src_nex += 64; - d += 48; - } -} - -void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* src_nex = src_ptr + src_stride; - int x; - __m128i src0, src1, src2, src3, src4, src5, src6, src7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; - __m128i tmp10, tmp11, dst0, dst1, dst2; - __m128i const0 = {0x0103030101010103, 0x0101010303010101}; - __m128i const1 = {0x0301010101030301, 0x0103030101010103}; - __m128i const2 = {0x0101010303010101, 0x0301010101030301}; - __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605}; - __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110}; - __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A}; - __m128i shift0 = {0x0002000200010002, 0x0001000200020001}; - __m128i shift1 = {0x0002000100020002, 0x0002000200010002}; - __m128i shift2 = {0x0001000200020001, 0x0002000100020002}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, - src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48, - src4, src5, src6, src7); - DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1, - shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4, - shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7); - DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6, 
- shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11); - DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3, - const0, src0, src1, src2, src3); - DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7, - const1, src4, src5, src6, src7); - DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11, - const2, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3, - shift0, src0, src1, src2, src3); - DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7, - shift1, src4, src5, src6, src7); - DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3, - shift2, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1, - src0, src1, src2, src3); - DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5); - DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1); - dst2 = __lsx_vsrarni_b_h(src5, src4, 1); - __lsx_vst(dst0, d, 0); - __lsx_vst(dst1, d, 16); - __lsx_vst(dst2, d, 32); - src_ptr += 64; - src_nex += 64; - d += 48; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) diff --git a/drivers/media/pci/tbscapture2/scale_msa.c b/drivers/media/pci/tbscapture2/scale_msa.c deleted file mode 100644 index 7b551186c034..000000000000 --- a/drivers/media/pci/tbscapture2/scale_msa.c +++ /dev/null @@ -1,949 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include - -#include "scale_row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define LOAD_INDEXED_DATA(srcp, indx0, out0) \ - { \ - out0[0] = srcp[indx0[0]]; \ - out0[1] = srcp[indx0[1]]; \ - out0[2] = srcp[indx0[2]]; \ - out0[3] = srcp[indx0[3]]; \ - } - -void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - v16u8 src0, src1, dst0; - (void)src_stride; - - for (x = 0; x < dst_width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); - dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - ST_UB(dst0, dst_argb); - src_argb += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - v16u8 src0, src1, vec0, vec1, dst0; - (void)src_stride; - - for (x = 0; x < dst_width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); - vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); - vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); - ST_UB(dst0, dst_argb); - src_argb += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; - v8u16 reg0, reg1, reg2, reg3; - v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; - - for (x = 0; x < dst_width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); - vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); - vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); - vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); - reg0 = __msa_hadd_u_h(vec0, vec0); - reg1 = __msa_hadd_u_h(vec1, vec1); - reg2 = __msa_hadd_u_h(vec2, vec2); - reg3 = __msa_hadd_u_h(vec3, vec3); - reg0 += reg2; - reg1 += reg3; - reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); - reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_argb); - s += 32; - t += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - int32_t stepx = src_stepx * 4; - int32_t data0, data1, data2, data3; - (void)src_stride; - - for (x = 0; x < dst_width; x += 4) { - data0 = LW(src_argb); - data1 = LW(src_argb + stepx); - data2 = LW(src_argb + stepx * 2); - data3 = LW(src_argb + stepx * 3); - SW(data0, dst_argb); - SW(data1, dst_argb + 4); - SW(data2, dst_argb + 8); - SW(data3, dst_argb + 12); - src_argb += stepx * 4; - dst_argb += 16; - } -} - -void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - const uint8_t* nxt_argb = src_argb + src_stride; - int32_t stepx = src_stepx * 4; - int64_t data0, data1, data2, data3; - v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 
= {0}; - v16u8 vec0, vec1, vec2, vec3; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 dst0; - - for (x = 0; x < dst_width; x += 4) { - data0 = LD(src_argb); - data1 = LD(src_argb + stepx); - data2 = LD(src_argb + stepx * 2); - data3 = LD(src_argb + stepx * 3); - src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); - src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); - src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); - src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); - data0 = LD(nxt_argb); - data1 = LD(nxt_argb + stepx); - data2 = LD(nxt_argb + stepx * 2); - data3 = LD(nxt_argb + stepx * 3); - src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); - src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); - src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); - src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); - vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - reg0 = __msa_hadd_u_h(vec0, vec0); - reg1 = __msa_hadd_u_h(vec1, vec1); - reg2 = __msa_hadd_u_h(vec2, vec2); - reg3 = __msa_hadd_u_h(vec3, vec3); - reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); - reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); - reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); - reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); - reg4 += reg6; - reg5 += reg7; - reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); - reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - ST_UB(dst0, dst_argb); - src_argb += stepx * 4; - nxt_argb += stepx * 4; - dst_argb += 16; - } -} - -void ScaleRowDown2_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - (void)src_stride; - - for (x = 0; x < dst_width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst, 16); - src_ptr += 64; - dst += 32; - } -} - -void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; - (void)src_stride; - - for (x = 0; x < dst_width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - dst0 = __msa_aver_u_b(vec1, vec0); - dst1 = __msa_aver_u_b(vec3, vec2); - ST_UB2(dst0, dst1, dst, 16); - src_ptr += 64; - dst += 32; - } -} - -void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3; - - for (x = 0; x < dst_width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = 
(v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); - vec0 = __msa_hadd_u_h(src0, src0); - vec1 = __msa_hadd_u_h(src1, src1); - vec2 = __msa_hadd_u_h(src2, src2); - vec3 = __msa_hadd_u_h(src3, src3); - vec0 += __msa_hadd_u_h(src4, src4); - vec1 += __msa_hadd_u_h(src5, src5); - vec2 += __msa_hadd_u_h(src6, src6); - vec3 += __msa_hadd_u_h(src7, src7); - vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); - vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); - vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); - vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - ST_UB2(dst0, dst1, dst, 16); - s += 64; - t += 64; - dst += 32; - } -} - -void ScaleRowDown4_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - v16u8 src0, src1, src2, src3, vec0, vec1, dst0; - (void)src_stride; - - for (x = 0; x < dst_width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst); - src_ptr += 64; - dst += 16; - } -} - -void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - const uint8_t* s = src_ptr; - const uint8_t* t0 = s + src_stride; - const uint8_t* t1 = s + src_stride * 2; - const uint8_t* t2 = s + src_stride * 3; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; - v8u16 vec0, vec1, vec2, vec3; - v4u32 reg0, reg1, reg2, reg3; - - for (x = 0; x < dst_width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); - vec0 = __msa_hadd_u_h(src0, src0); - vec1 = __msa_hadd_u_h(src1, src1); - vec2 = __msa_hadd_u_h(src2, src2); - vec3 = __msa_hadd_u_h(src3, src3); - vec0 += __msa_hadd_u_h(src4, src4); - vec1 += __msa_hadd_u_h(src5, src5); - vec2 += __msa_hadd_u_h(src6, src6); - vec3 += __msa_hadd_u_h(src7, src7); - src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); - vec0 += __msa_hadd_u_h(src0, src0); - vec1 += __msa_hadd_u_h(src1, src1); - vec2 += __msa_hadd_u_h(src2, src2); - vec3 += __msa_hadd_u_h(src3, src3); - vec0 += __msa_hadd_u_h(src4, src4); - vec1 += __msa_hadd_u_h(src5, src5); - vec2 += __msa_hadd_u_h(src6, src6); - vec3 += __msa_hadd_u_h(src7, src7); - reg0 = __msa_hadd_u_w(vec0, vec0); - reg1 = __msa_hadd_u_w(vec1, vec1); - reg2 = __msa_hadd_u_w(vec2, vec2); - reg3 = __msa_hadd_u_w(vec3, vec3); - reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); 
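    // The reg0..reg3 accumulators hold sums of 4x4 blocks of source bytes
    // (16 pixels per 32-bit lane); each __msa_srari_w(..., 4) is a rounding
    // right shift, (sum + 8) >> 4, i.e. the box-filtered average.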
- reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); - reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); - reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst); - s += 64; - t0 += 64; - t1 += 64; - t2 += 64; - dst += 16; - } -} - -void ScaleRowDown38_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x, width; - uint64_t dst0; - uint32_t dst1; - v16u8 src0, src1, vec0; - v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; - (void)src_stride; - - assert(dst_width % 3 == 0); - width = dst_width / 3; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); - dst0 = __msa_copy_u_d((v2i64)vec0, 0); - dst1 = __msa_copy_u_w((v4i32)vec0, 2); - SD(dst0, dst); - SW(dst1, dst + 8); - src_ptr += 32; - dst += 12; - } -} - -void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - int x, width; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - uint64_t dst0; - uint32_t dst1; - v16u8 src0, src1, src2, src3, out; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; - v8i16 zero = {0}; - v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; - v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; - v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); - v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); - - assert((dst_width % 3 == 0) && (dst_width > 0)); - width = dst_width / 3; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); - vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); - vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); - vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - tmp0 = __msa_hadd_u_w(vec4, vec4); - tmp1 = __msa_hadd_u_w(vec5, vec5); - tmp2 = __msa_hadd_u_w(vec6, vec6); - tmp3 = __msa_hadd_u_w(vec7, vec7); - tmp4 = __msa_hadd_u_w(vec0, vec0); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - tmp0 = __msa_hadd_u_w(vec0, vec0); - tmp1 = __msa_hadd_u_w(vec1, vec1); - tmp0 *= const_0x2AAA; - tmp1 *= const_0x2AAA; - tmp4 *= const_0x4000; - tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); - tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); - tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); - out = (v16u8)__msa_vshf_b(dst_mask, 
(v16i8)vec1, (v16i8)vec0); - dst0 = __msa_copy_u_d((v2i64)out, 0); - dst1 = __msa_copy_u_w((v4i32)out, 2); - SD(dst0, dst_ptr); - SW(dst1, dst_ptr + 8); - s += 32; - t += 32; - dst_ptr += 12; - } -} - -void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - int x, width; - const uint8_t* s = src_ptr; - const uint8_t* t0 = s + src_stride; - const uint8_t* t1 = s + src_stride * 2; - uint64_t dst0; - uint32_t dst1; - v16u8 src0, src1, src2, src3, src4, src5, out; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; - v8u16 zero = {0}; - v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; - v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; - v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); - v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); - - assert((dst_width % 3 == 0) && (dst_width > 0)); - width = dst_width / 3; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); - src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); - vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); - vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); - vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); - vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); - vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); - vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); - vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); - vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); - vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - tmp0 = __msa_hadd_u_w(vec4, vec4); - tmp1 = __msa_hadd_u_w(vec5, vec5); - tmp2 = __msa_hadd_u_w(vec6, vec6); - tmp3 = __msa_hadd_u_w(vec7, vec7); - tmp4 = __msa_hadd_u_w(vec0, vec0); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - tmp0 = __msa_hadd_u_w(vec0, vec0); - tmp1 = __msa_hadd_u_w(vec1, vec1); - tmp0 *= const_0x1C71; - tmp1 *= const_0x1C71; - tmp4 *= const_0x2AAA; - tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); - tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); - tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); - out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); - dst0 = __msa_copy_u_d((v2i64)out, 0); - dst1 = __msa_copy_u_w((v4i32)out, 2); - SD(dst0, dst_ptr); - SW(dst1, dst_ptr + 8); - s += 32; - t0 += 32; - t1 += 32; - dst_ptr += 12; - } -} - -void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - 
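  // Widens 16 source bytes at a time to 16 bits and adds them into the
  // uint16_t destination row, so repeated calls can accumulate multiple
  // source rows for box filtering.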
int x; - v16u8 src0; - v8u16 dst0, dst1; - v16i8 zero = {0}; - - assert(src_width > 0); - - for (x = 0; x < src_width; x += 16) { - src0 = LD_UB(src_ptr); - dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); - dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); - dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); - dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); - ST_UH2(dst0, dst1, dst_ptr, 8); - src_ptr += 16; - dst_ptr += 16; - } -} - -void ScaleFilterCols_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - v4i32 vec_x = __msa_fill_w(x); - v4i32 vec_dx = __msa_fill_w(dx); - v4i32 vec_const = {0, 1, 2, 3}; - v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8u16 reg0, reg1; - v16u8 dst0; - v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); - v4i32 const_0x40 = __msa_fill_w(0x40); - - vec0 = vec_dx * vec_const; - vec1 = vec_dx * 4; - vec_x += vec0; - - for (j = 0; j < dst_width - 1; j += 16) { - vec2 = vec_x >> 16; - vec6 = vec_x & const_0xFFFF; - vec_x += vec1; - vec3 = vec_x >> 16; - vec7 = vec_x & const_0xFFFF; - vec_x += vec1; - vec4 = vec_x >> 16; - vec8 = vec_x & const_0xFFFF; - vec_x += vec1; - vec5 = vec_x >> 16; - vec9 = vec_x & const_0xFFFF; - vec_x += vec1; - vec6 >>= 9; - vec7 >>= 9; - vec8 >>= 9; - vec9 >>= 9; - LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); - LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); - LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); - LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); - vec2 += 1; - vec3 += 1; - vec4 += 1; - vec5 += 1; - LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); - LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); - LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); - LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); - tmp4 -= tmp0; - tmp5 -= tmp1; - tmp6 -= tmp2; - tmp7 -= tmp3; - tmp4 *= vec6; - tmp5 *= vec7; - tmp6 *= vec8; - tmp7 *= vec9; - tmp4 += const_0x40; - tmp5 += const_0x40; - tmp6 += const_0x40; - tmp7 += const_0x40; - tmp4 >>= 7; - tmp5 >>= 7; - tmp6 >>= 7; - tmp7 >>= 7; - tmp0 += tmp4; - tmp1 += tmp5; - tmp2 += tmp6; - tmp3 += tmp7; - reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - __msa_st_b(dst0, dst_ptr, 0); - dst_ptr += 16; - } -} - -void ScaleARGBCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - v4i32 x_vec = __msa_fill_w(x); - v4i32 dx_vec = __msa_fill_w(dx); - v4i32 const_vec = {0, 1, 2, 3}; - v4i32 vec0, vec1, vec2; - v4i32 dst0; - - vec0 = dx_vec * const_vec; - vec1 = dx_vec * 4; - x_vec += vec0; - - for (j = 0; j < dst_width; j += 4) { - vec2 = x_vec >> 16; - x_vec += vec1; - LOAD_INDEXED_DATA(src, vec2, dst0); - __msa_st_w(dst0, dst, 0); - dst += 4; - } -} - -void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - int j; - v4u32 src0, src1, src2, src3; - v4u32 vec0, vec1, vec2, vec3; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 mult0, mult1, mult2, mult3; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 dst0, dst1; - v4u32 vec_x = (v4u32)__msa_fill_w(x); - v4u32 vec_dx = (v4u32)__msa_fill_w(dx); - v4u32 vec_const = {0, 1, 2, 3}; - v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); - - vec0 = vec_dx * vec_const; - vec1 = vec_dx * 4; - vec_x += vec0; - - for (j = 0; j < dst_width - 1; j += 8) { - vec2 
= vec_x >> 16; - reg0 = (v16u8)(vec_x >> 9); - vec_x += vec1; - vec3 = vec_x >> 16; - reg1 = (v16u8)(vec_x >> 9); - vec_x += vec1; - reg0 = reg0 & const_0x7f; - reg1 = reg1 & const_0x7f; - reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); - reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); - reg2 = reg0 ^ const_0x7f; - reg3 = reg1 ^ const_0x7f; - mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); - mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); - mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); - mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); - LOAD_INDEXED_DATA(src, vec2, src0); - LOAD_INDEXED_DATA(src, vec3, src1); - vec2 += 1; - vec3 += 1; - LOAD_INDEXED_DATA(src, vec2, src2); - LOAD_INDEXED_DATA(src, vec3, src3); - reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - tmp0 = __msa_dotp_u_h(reg4, mult0); - tmp1 = __msa_dotp_u_h(reg5, mult1); - tmp2 = __msa_dotp_u_h(reg6, mult2); - tmp3 = __msa_dotp_u_h(reg7, mult3); - tmp0 >>= 7; - tmp1 >>= 7; - tmp2 >>= 7; - tmp3 >>= 7; - dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - __msa_st_b(dst0, dst_argb, 0); - __msa_st_b(dst1, dst_argb, 16); - dst_argb += 32; - } -} - -void ScaleRowDown34_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - v16u8 src0, src1, src2, src3; - v16u8 vec0, vec1, vec2; - v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; - v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; - v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, - 21, 23, 24, 25, 27, 28, 29, 31}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); - vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); - __msa_st_b((v16i8)vec0, dst, 0); - __msa_st_b((v16i8)vec1, dst, 16); - __msa_st_b((v16i8)vec2, dst, 32); - src_ptr += 64; - dst += 48; - } -} - -void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 vec6, vec7, vec8, vec9, vec10, vec11; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5; - v8i16 reg6, reg7, reg8, reg9, reg10, reg11; - v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; - v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; - v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; - v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, - 16, 17, 17, 18, 18, 19, 20, 21}; - v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; - v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; - v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; - v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - src0 = 
(v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); - vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); - vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); - vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); - vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); - vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); - vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); - vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); - vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); - vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); - reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); - reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); - reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); - reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); - reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); - reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); - reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); - reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); - reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); - reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); - reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); - reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); - reg0 = __msa_srar_h(reg0, shft0); - reg1 = __msa_srar_h(reg1, shft1); - reg2 = __msa_srar_h(reg2, shft2); - reg3 = __msa_srar_h(reg3, shft0); - reg4 = __msa_srar_h(reg4, shft1); - reg5 = __msa_srar_h(reg5, shft2); - reg6 = __msa_srar_h(reg6, shft0); - reg7 = __msa_srar_h(reg7, shft1); - reg8 = __msa_srar_h(reg8, shft2); - reg9 = __msa_srar_h(reg9, shft0); - reg10 = __msa_srar_h(reg10, shft1); - reg11 = __msa_srar_h(reg11, shft2); - reg0 = reg0 * 3 + reg6; - reg1 = reg1 * 3 + reg7; - reg2 = reg2 * 3 + reg8; - reg3 = reg3 * 3 + reg9; - reg4 = reg4 * 3 + reg10; - reg5 = reg5 * 3 + reg11; - reg0 = __msa_srari_h(reg0, 2); - reg1 = __msa_srari_h(reg1, 2); - reg2 = __msa_srari_h(reg2, 2); - reg3 = __msa_srari_h(reg3, 2); - reg4 = __msa_srari_h(reg4, 2); - reg5 = __msa_srari_h(reg5, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); - dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - __msa_st_b((v16i8)dst0, d, 0); - __msa_st_b((v16i8)dst1, d, 16); - __msa_st_b((v16i8)dst2, d, 32); - s += 64; - t += 64; - d += 48; - } -} - -void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 vec6, vec7, vec8, vec9, vec10, vec11; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5; - v8i16 reg6, reg7, reg8, reg9, reg10, reg11; - v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; - v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; - v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; - v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, - 16, 17, 17, 18, 18, 19, 20, 21}; - 
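  // const0..const2 hold the 3:1 and 1:1 tap weights of the 3/4 filter and
  // shft0..shft2 the matching rounding shifts (2 for the 3+1 taps, 1 for
  // the 1+1 taps), rotated across the three output phases.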
v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; - v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; - v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; - v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); - vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); - vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); - vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); - vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); - vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); - vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); - vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); - vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); - vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); - reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); - reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); - reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); - reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); - reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); - reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); - reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); - reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); - reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); - reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); - reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); - reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); - reg0 = __msa_srar_h(reg0, shft0); - reg1 = __msa_srar_h(reg1, shft1); - reg2 = __msa_srar_h(reg2, shft2); - reg3 = __msa_srar_h(reg3, shft0); - reg4 = __msa_srar_h(reg4, shft1); - reg5 = __msa_srar_h(reg5, shft2); - reg6 = __msa_srar_h(reg6, shft0); - reg7 = __msa_srar_h(reg7, shft1); - reg8 = __msa_srar_h(reg8, shft2); - reg9 = __msa_srar_h(reg9, shft0); - reg10 = __msa_srar_h(reg10, shft1); - reg11 = __msa_srar_h(reg11, shft2); - reg0 += reg6; - reg1 += reg7; - reg2 += reg8; - reg3 += reg9; - reg4 += reg10; - reg5 += reg11; - reg0 = __msa_srari_h(reg0, 1); - reg1 = __msa_srari_h(reg1, 1); - reg2 = __msa_srari_h(reg2, 1); - reg3 = __msa_srari_h(reg3, 1); - reg4 = __msa_srari_h(reg4, 1); - reg5 = __msa_srari_h(reg5, 1); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); - dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - __msa_st_b((v16i8)dst0, d, 0); - __msa_st_b((v16i8)dst1, d, 16); - __msa_st_b((v16i8)dst2, d, 32); - s += 64; - t += 64; - d += 48; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/drivers/media/pci/tbscapture2/scale_neon.c b/drivers/media/pci/tbscapture2/scale_neon.c deleted file mode 100644 index 85dbcbb25f0f..000000000000 --- a/drivers/media/pci/tbscapture2/scale_neon.c +++ /dev/null @@ -1,1449 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" -#include "scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -// NEON downscalers with interpolation. -// Provided by Fritz Koenig - -// Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); -} - -// Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); -} - -// Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + - // row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and - // pack - "vrshrn.u16 d1, q1, #2 \n" - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ScaleRowDown4_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1", "memory", "cc"); -} - -void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - const uint8_t* src_ptr2 = src_ptr + src_stride * 2; - const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile ( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - "vld1.8 {q1}, [%3]! \n" - "vld1.8 {q2}, [%4]! \n" - "vld1.8 {q3}, [%5]! 
\n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - "vst1.32 {d0[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_ptr1), // %3 - "+r"(src_ptr2), // %4 - "+r"(src_ptr3) // %5 - : - : "q0", "q1", "q2", "q3", "memory", "cc"); -} - -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "d0", "d1", "d2", "d3", "memory", "cc"); -} - -void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" - - // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" - - // (3 * line_0 + line_1 + 2) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" - - "vst3.8 {d0, d1, d2}, [%1]! \n" - - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", - "cc"); -} - -void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" - - "vst3.8 {d0, d1, d2}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); -} - -#define HAS_SCALEROWDOWN38_NEON -static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, - 22, 24, 27, 30, 0, 0, 0, 0}; -static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, - 18, 6, 14, 19, 0, 0, 0, 0}; -static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12}; -static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18}; - -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile ( - "vld1.8 {q3}, [%3] \n" - "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.8 {d4}, [%1]! \n" - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); -} - -// 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - - asm volatile ( - "vld1.16 {q13}, [%5] \n" - "vld1.8 {q14}, [%6] \n" - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. 
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q15 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride), // %3 - "+r"(src_ptr1) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", - "cc"); -} - -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile ( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q13 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); -} - -void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 1; - asm volatile ( - "vmov.u8 d30, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 01234567 - "vld1.8 {d5}, [%3]! \n" // 12345678 - - "vmovl.u8 q0, d4 \n" // 01234567 (16b) - "vmovl.u8 q1, d5 \n" // 12345678 (16b) - "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) - "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) - - "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.8 {d0, d1}, [%1]! \n" // store - "subs %2, %2, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 1; - const uint8_t* src_temp1 = src_ptr1 + 1; - - asm volatile ( - "vmov.u16 q15, #3 \n" - "vmov.u8 d28, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 01234567 - "vld1.8 {d5}, [%5]! \n" // 12345678 - - "vmovl.u8 q0, d4 \n" // 01234567 (16b) - "vmovl.u8 q1, d5 \n" // 12345678 (16b) - "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) - "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) - - "vld1.8 {d8}, [%1]! \n" - "vld1.8 {d9}, [%6]! \n" - - "vmovl.u8 q2, d8 \n" - "vmovl.u8 q3, d9 \n" - "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) - "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) - - // e o - // q1 q0 - // q3 q2 - - "vmovq q4, q2 \n" - "vmovq q5, q3 \n" - "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) - "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) - "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) - "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) - - // e o - // q5 q4 - // q1 q0 - - "vrshrn.u16 d2, q1, #4 \n" // 2, even - "vrshrn.u16 d3, q0, #4 \n" // 2, odd - "vrshrn.u16 d0, q5, #4 \n" // 1, even - "vrshrn.u16 d1, q4, #4 \n" // 1, odd - - "vst2.8 {d0, d1}, [%2]! \n" // store - "vst2.8 {d2, d3}, [%3]! \n" // store - "subs %4, %4, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", - "q15" // Clobber List - ); -} - -void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile ( - "vmov.u16 q15, #3 \n" - - "1: \n" - "vld1.16 {q1}, [%0]! \n" // 01234567 (16b) - "vld1.16 {q0}, [%3]! \n" // 12345678 (16b) - - "vmovq q2, q0 \n" - "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) - "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) - - "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd) - "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.16 {d0, d1, d2, d3}, [%1]! 
\n" // store - "subs %2, %2, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile ( - "vmov.u16 q15, #3 \n" - - "1: \n" - "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) - "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) - - "vmovq q2, q0 \n" - "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) - "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) - - "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) - "vld1.16 {q3}, [%6]! \n" // 12345678 (16b) - - "vmovq q4, q2 \n" - "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd) - "vmla.u16 q3, q4, q15 \n" // 3*near+far (even) - - "vmovq q4, q2 \n" - "vmovq q5, q3 \n" - "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) - "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) - "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) - "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) - - "vrshr.u16 q2, q1, #4 \n" // 2, even - "vrshr.u16 q3, q0, #4 \n" // 2, odd - "vrshr.u16 q0, q5, #4 \n" // 1, even - "vrshr.u16 q1, q4, #4 \n" // 1, odd - - "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store - "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store - "subs %4, %4, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", - "q15" // Clobber List - ); -} - -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile ( - "vmov.u16 d31, #3 \n" - - "1: \n" - "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) - "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) - - "vmovl.u16 q2, d0 \n" // 0123 (32b) - "vmovl.u16 q3, d1 \n" // 4567 (32b) - "vmovl.u16 q4, d2 \n" // 1234 (32b) - "vmovl.u16 q5, d3 \n" // 5678 (32b) - - "vmlal.u16 q2, d2, d31 \n" - "vmlal.u16 q3, d3, d31 \n" - "vmlal.u16 q4, d0, d31 \n" - "vmlal.u16 q5, d1, d31 \n" - - "vrshrn.u32 d0, q4, #2 \n" - "vrshrn.u32 d1, q5, #2 \n" - "vrshrn.u32 d2, q2, #2 \n" - "vrshrn.u32 d3, q3, #2 \n" - - "vst2.16 {q0, q1}, [%1]! \n" // store - "subs %2, %2, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile ( - "vmov.u16 d31, #3 \n" - "vmov.u32 q14, #3 \n" - - "1: \n" - "vld1.16 {d0}, [%0]! \n" // 0123 (16b) - "vld1.16 {d1}, [%5]! \n" // 1234 (16b) - "vmovl.u16 q2, d0 \n" // 0123 (32b) - "vmovl.u16 q3, d1 \n" // 1234 (32b) - "vmlal.u16 q2, d1, d31 \n" - "vmlal.u16 q3, d0, d31 \n" - - "vld1.16 {d0}, [%1]! \n" // 0123 (16b) - "vld1.16 {d1}, [%6]! 
\n" // 1234 (16b) - "vmovl.u16 q4, d0 \n" // 0123 (32b) - "vmovl.u16 q5, d1 \n" // 1234 (32b) - "vmlal.u16 q4, d1, d31 \n" - "vmlal.u16 q5, d0, d31 \n" - - "vmovq q0, q4 \n" - "vmovq q1, q5 \n" - "vmla.u32 q4, q2, q14 \n" - "vmla.u32 q5, q3, q14 \n" - "vmla.u32 q2, q0, q14 \n" - "vmla.u32 q3, q1, q14 \n" - - "vrshrn.u32 d1, q4, #4 \n" - "vrshrn.u32 d0, q5, #4 \n" - "vrshrn.u32 d3, q2, #4 \n" - "vrshrn.u32 d2, q3, #4 \n" - - "vst2.16 {d0, d1}, [%2]! \n" // store - "vst2.16 {d2, d3}, [%3]! \n" // store - "subs %4, %4, #8 \n" // 4 sample -> 8 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", - "d31" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 2; - asm volatile ( - "vmov.u8 d30, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) - "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v) - - "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) - "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) - "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) - "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) - - "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.16 {d0, d1}, [%1]! \n" // store - "subs %2, %2, #8 \n" // 4 uv -> 8 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 2; - const uint8_t* src_temp1 = src_ptr1 + 2; - - asm volatile ( - "vmov.u16 q15, #3 \n" - "vmov.u8 d28, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) - "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v) - - "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) - "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) - "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) - "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) - - "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v) - "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v) - - "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b) - "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b) - "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) - "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) - - // e o - // q1 q0 - // q3 q2 - - "vmovq q4, q2 \n" - "vmovq q5, q3 \n" - "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) - "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) - "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) - "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) - - // e o - // q5 q4 - // q1 q0 - - "vrshrn.u16 d2, q1, #4 \n" // 2, even - "vrshrn.u16 d3, q0, #4 \n" // 2, odd - "vrshrn.u16 d0, q5, #4 \n" // 1, even - "vrshrn.u16 d1, q4, #4 \n" // 1, odd - - "vst2.16 {d0, d1}, [%2]! \n" // store - "vst2.16 {d2, d3}, [%3]! 
\n" // store - "subs %4, %4, #8 \n" // 4 uv -> 8 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", - "q15" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 2; - asm volatile ( - "vmov.u16 d30, #3 \n" - - "1: \n" - "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16) - "vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16) - - "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) - "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b) - "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b) - "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b) - "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd) - "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even) - "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd) - "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even) - - "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even) - "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.32 {d0, d1}, [%1]! \n" // store - "vst2.32 {d2, d3}, [%1]! \n" // store - "subs %2, %2, #8 \n" // 4 uv -> 8 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", - "d30" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 2; - const uint16_t* src_temp1 = src_ptr1 + 2; - - asm volatile ( - "vmov.u16 d30, #3 \n" - "vmov.u32 q14, #3 \n" - - "1: \n" - "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v) - "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v) - "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) - "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b) - "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd) - "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even) - - "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v) - "vld1.8 {d1}, [%6]! \n" // 1122 (1u1v) - "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b) - "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b) - "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd) - "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even) - - "vmovq q0, q4 \n" - "vmovq q1, q5 \n" - "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd) - "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even) - "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd) - "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even) - - "vrshrn.u32 d1, q4, #4 \n" // 1, odd - "vrshrn.u32 d0, q5, #4 \n" // 1, even - "vrshrn.u32 d3, q2, #4 \n" // 2, odd - "vrshrn.u32 d2, q3, #4 \n" // 2, even - - "vst2.32 {d0, d1}, [%2]! \n" // store - "vst2.32 {d2, d3}, [%3]! \n" // store - "subs %4, %4, #4 \n" // 2 uv -> 4 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", - "d30" // Clobber List - ); -} - -// Add a row of bytes to a row of shorts. Used for box filter. -// Reads 16 bytes and accumulates to 16 shorts at a time. 
-void ScaleAddRow_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile ( - "1: \n" - "vld1.16 {q1, q2}, [%1] \n" // load accumulator - "vld1.8 {q0}, [%0]! \n" // load 16 bytes - "vaddw.u8 q2, q2, d1 \n" // add - "vaddw.u8 q1, q1, d0 \n" - "vst1.16 {q1, q2}, [%1]! \n" // store accumulator - "subs %2, %2, #16 \n" // 16 processed per loop - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2" // Clobber List - ); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" - -// The NEON version mimics this formula (from row_common.cc): -// #define BLENDER(a, b, f) (uint8_t)((int)(a) + -// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) - -void ScaleFilterCols_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_ptr; - asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q3, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q1, q1, q0 \n" - // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "vadd.s32 q2, q1, q3 \n" - "vshl.i32 q0, q3, #1 \n" // 8 * dx - "1: \n" - LOAD2_DATA8_LANE(0) - LOAD2_DATA8_LANE(1) - LOAD2_DATA8_LANE(2) - LOAD2_DATA8_LANE(3) - LOAD2_DATA8_LANE(4) - LOAD2_DATA8_LANE(5) - LOAD2_DATA8_LANE(6) - LOAD2_DATA8_LANE(7) - "vmov q10, q1 \n" - "vmov q11, q2 \n" - "vuzp.16 q10, q11 \n" - "vmovl.u8 q8, d6 \n" - "vmovl.u8 q9, d7 \n" - "vsubl.s16 q11, d18, d16 \n" - "vsubl.s16 q12, d19, d17 \n" - "vmovl.u16 q13, d20 \n" - "vmovl.u16 q10, d21 \n" - "vmul.s32 q11, q11, q13 \n" - "vmul.s32 q12, q12, q10 \n" - "vrshrn.s32 d18, q11, #16 \n" - "vrshrn.s32 d19, q12, #16 \n" - "vadd.s16 q8, q8, q9 \n" - "vmovn.s16 d6, q8 \n" - - "vst1.8 {d6}, [%0]! \n" // store pixels - "vadd.s32 q1, q1, q0 \n" - "vadd.s32 q2, q2, q0 \n" - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13" - ); -} - -#undef LOAD2_DATA8_LANE - -void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vmov q2, q1 \n" // load next 8 ARGB - "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! -// 4a: 3e04 subs r6, #4 -// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! -// 50: ef64 21f4 vorr q9, q10, q10 -// 54: f942 038d vst2.32 {d16-d19}, [r2]! -// 58: d1f5 bne.n 46 - -void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vrhadd.u8 q1, q2, q3 \n" // rounding half add - "vst2.32 {q0, q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile ( - "mov r12, %3, lsl #2 \n" - "1: \n" - "vld1.32 {d0[0]}, [%0], r12 \n" - "vld1.32 {d0[1]}, [%0], r12 \n" - "vld1.32 {d1[0]}, [%0], r12 \n" - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - asm volatile ( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - "1: \n" - "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 - "vld1.8 {d1}, [%1], r12 \n" - "vld1.8 {d2}, [%0], r12 \n" - "vld1.8 {d3}, [%1], r12 \n" - "vld1.8 {d4}, [%0], r12 \n" - "vld1.8 {d5}, [%1], r12 \n" - "vld1.8 {d6}, [%0], r12 \n" - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%2]! 
\n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD1_DATA32_LANE(dn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "vld1.32 {" #dn "[" #n "]}, [%6] \n" - -void ScaleARGBCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - int tmp; - const uint8_t* src_tmp = src_argb; - asm volatile ( - "1: \n" - // clang-format off - LOAD1_DATA32_LANE(d0, 0) - LOAD1_DATA32_LANE(d0, 1) - LOAD1_DATA32_LANE(d1, 0) - LOAD1_DATA32_LANE(d1, 1) - LOAD1_DATA32_LANE(d2, 0) - LOAD1_DATA32_LANE(d2, 1) - LOAD1_DATA32_LANE(d3, 0) - LOAD1_DATA32_LANE(d3, 1) - // clang-format on - "vst1.32 {q0, q1}, [%0]! \n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "=&r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1"); -} - -#undef LOAD1_DATA32_LANE - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA32_LANE(dn1, dn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" - -void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_argb; - asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q9, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" - "vmov.i8 q3, #0x7f \n" // 0x7F - "vmov.i16 q15, #0x7f \n" // 0x7F - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q8, q1, q0 \n" - "1: \n" - // d0, d1: a - // d2, d3: b - LOAD2_DATA32_LANE(d0, d2, 0) - LOAD2_DATA32_LANE(d0, d2, 1) - LOAD2_DATA32_LANE(d1, d3, 0) - LOAD2_DATA32_LANE(d1, d3, 1) - "vshrn.i32 d22, q8, #9 \n" - "vand.16 d22, d22, d30 \n" - "vdup.8 d24, d22[0] \n" - "vdup.8 d25, d22[2] \n" - "vdup.8 d26, d22[4] \n" - "vdup.8 d27, d22[6] \n" - "vext.8 d4, d24, d25, #4 \n" - "vext.8 d5, d26, d27, #4 \n" // f - "veor.8 q10, q2, q3 \n" // 0x7f ^ f - "vmull.u8 q11, d0, d20 \n" - "vmull.u8 q12, d1, d21 \n" - "vmull.u8 q13, d2, d4 \n" - "vmull.u8 q14, d3, d5 \n" - "vadd.i16 q11, q11, q13 \n" - "vadd.i16 q12, q12, q14 \n" - "vshrn.i16 d0, q11, #7 \n" - "vshrn.i16 d1, q12, #7 \n" - - "vst1.32 {d0, d1}, [%0]! \n" // store pixels - "vadd.s32 q8, q8, q9 \n" - "subs %2, %2, #4 \n" // 4 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -#undef LOAD2_DATA32_LANE - -void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. - "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst1.16 {q1}, [%1]! 
\n" // store 8 UV - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1"); -} - -void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. - "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV - "subs %2, %2, #8 \n" // 8 processed per loop. - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vst1.16 {q0}, [%1]! \n" // store 8 UV - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1"); -} - -void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. - "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. - "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV - "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV - "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vst2.8 {d0, d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q8", "q9"); -} - -// Reads 4 pixels at a time. -void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, // pixel step - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src1_ptr = src_ptr + src_stepx * 2; - const uint8_t* src2_ptr = src_ptr + src_stepx * 4; - const uint8_t* src3_ptr = src_ptr + src_stepx * 6; - (void)src_stride; - asm volatile ( - "1: \n" - "vld1.16 {d0[0]}, [%0], %6 \n" - "vld1.16 {d0[1]}, [%1], %6 \n" - "vld1.16 {d0[2]}, [%2], %6 \n" - "vld1.16 {d0[3]}, [%3], %6 \n" - "subs %5, %5, #4 \n" // 4 pixels per loop. - "vst1.8 {d0}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src1_ptr), // %1 - "+r"(src2_ptr), // %2 - "+r"(src3_ptr), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_width) // %5 - : "r"(src_stepx * 8) // %6 - : "memory", "cc", "d0"); -} - -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/scale_neon64.c b/drivers/media/pci/tbscapture2/scale_neon64.c deleted file mode 100644 index f0fef300ae25..000000000000 --- a/drivers/media/pci/tbscapture2/scale_neon64.c +++ /dev/null @@ -1,1576 +0,0 @@ -/* - * Copyright 2014 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" -#include "scale.h" -#include "scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon armv8 64 bit. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -// Read 32x1 throw away even pixels, and write 16x1. 
-void ScaleRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1" // Clobber List - ); -} - -// Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1" // Clobber List - ); -} - -// Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddlp v1.8h, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ScaleRowDown4_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0 - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v2.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3"); -} - -void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - const uint8_t* src_ptr2 = src_ptr + src_stride * 2; - const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile( - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load up 16x8 - "ldp q1, q5, [%2], #32 \n" - "ldp q2, q6, [%3], #32 \n" - "ldp q3, q7, [%4], #32 \n" - "subs %w5, %w5, #8 \n" - "uaddlp v0.8h, v0.16b \n" - "uaddlp v4.8h, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v0.8h, v1.16b \n" - "uadalp v4.8h, v5.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "uadalp v0.8h, v2.16b \n" - "uadalp v4.8h, v6.16b \n" - "prfm pldl1keep, [%3, 448] \n" - "uadalp v0.8h, v3.16b \n" - "uadalp v4.8h, v7.16b \n" - "prfm pldl1keep, [%4, 448] \n" - "addp v0.8h, v0.8h, v4.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide 
by 16 w/rounding - "str d0, [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(src_ptr2), // %3 - "+r"(src_ptr3), // %4 - "+r"(dst_width) // %5 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -static const uvec8 kShuf34_0 = { - 0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, -}; -static const uvec8 kShuf34_1 = { - 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25, -}; -static const uvec8 kShuf34_2 = { - 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25, 27, 28, 29, 31, -}; - -// Down scale from 4 to 3 pixels. Point samples 64 pixels to 48 pixels. -void ScaleRowDown34_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "ld1 {v29.16b}, [%[kShuf34_0]] \n" - "ld1 {v30.16b}, [%[kShuf34_1]] \n" - "ld1 {v31.16b}, [%[kShuf34_2]] \n" - "1: \n" - "ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n" - "subs %w[width], %w[width], #48 \n" - "tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n" - "prfm pldl1keep, [%[src_ptr], 448] \n" - "tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n" - "tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n" - "st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n" - "b.gt 1b \n" - : [src_ptr] "+r"(src_ptr), // %[src_ptr] - [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] - [width] "+r"(dst_width) // %[width] - : [kShuf34_0] "r"(&kShuf34_0), // %[kShuf34_0] - [kShuf34_1] "r"(&kShuf34_1), // %[kShuf34_1] - [kShuf34_2] "r"(&kShuf34_2) // %[kShuf34_2] - : "memory", "cc", "v0", "v1", "v2", "v3", "v29", "v30", "v31"); -} - -void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movi v24.16b, #3 \n" - "add %3, %3, %0 \n" - - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0 - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%3], #64 \n" // src line 1 - "subs %w2, %w2, #48 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" - "ushll2 v20.8h, v4.16b, #0 \n" - "ushll2 v21.8h, v5.16b, #0 \n" - "ushll2 v22.8h, v6.16b, #0 \n" - "ushll2 v23.8h, v7.16b, #0 \n" - - // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v24.8b \n" - "umlal v17.8h, v1.8b, v24.8b \n" - "umlal v18.8h, v2.8b, v24.8b \n" - "umlal v19.8h, v3.8b, v24.8b \n" - "umlal2 v20.8h, v0.16b, v24.16b \n" - "umlal2 v21.8h, v1.16b, v24.16b \n" - "umlal2 v22.8h, v2.16b, v24.16b \n" - "umlal2 v23.8h, v3.16b, v24.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // (3 * line_0 + line_1 + 2) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" - "uqrshrn2 v0.16b, v20.8h, #2 \n" - "uqrshrn2 v1.16b, v21.8h, #2 \n" - "uqrshrn2 v2.16b, v22.8h, #2 \n" - "uqrshrn2 v3.16b, v23.8h, #2 \n" - "prfm pldl1keep, [%3, 448] \n" - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "ushll2 v17.8h, v1.16b, #0 \n" - "umlal v16.8h, v0.8b, v24.8b \n" - "umlal2 v17.8h, v0.16b, v24.16b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn2 v0.16b, v17.8h, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "urhadd v1.16b, v1.16b, v2.16b \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "ushll2 v17.8h, v2.16b, #0 \n" - "umlal v16.8h, v3.8b, v24.8b \n" - "umlal2 v17.8h, v3.16b, 
v24.16b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" - "uqrshrn2 v2.16b, v17.8h, #2 \n" - - "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" - - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"); -} - -void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movi v20.16b, #3 \n" - "add %3, %3, %0 \n" - - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0 - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%3], #64 \n" // src line 1 - "subs %w2, %w2, #48 \n" - // average src line 0 with src line 1 - "urhadd v0.16b, v0.16b, v4.16b \n" - "urhadd v1.16b, v1.16b, v5.16b \n" - "urhadd v2.16b, v2.16b, v6.16b \n" - "urhadd v3.16b, v3.16b, v7.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "ushll2 v5.8h, v1.16b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "umlal2 v5.8h, v0.16b, v20.16b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" - "uqrshrn2 v0.16b, v5.8h, #2 \n" - "prfm pldl1keep, [%3, 448] \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "urhadd v1.16b, v1.16b, v2.16b \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "ushll2 v5.8h, v2.16b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "umlal2 v5.8h, v3.16b, v20.16b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" - "uqrshrn2 v2.16b, v5.8h, #2 \n" - - "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20"); -} - -static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, - 22, 24, 27, 30, 0, 0, 0, 0}; -static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, - 34, 6, 22, 35, 0, 0, 0, 0}; -static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12}; -static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18}; - -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile ( - "ld1 {v3.16b}, [%3] \n" - "1: \n" - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v2.8b}, [%1], #8 \n" - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "memory", "cc", "v0", "v1", "v2", "v3"); -} - -// 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - ptrdiff_t tmp_src_stride = src_stride; - - asm volatile ( - "ld1 {v29.8h}, [%5] \n" - "ld1 {v30.16b}, [%6] \n" - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" - - // Shuffle the 
input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" - - // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" - - // combine source lines - "add v0.8h, v0.8h, v16.8h \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" - "prfm pldl1keep, [%3, 448] \n" - - // Align for table lookup, vtbl requires registers to be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(src_ptr1), // %3 - "+r"(dst_width) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", - "v30", "v31"); -} - -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - // TODO(fbarchard): use src_stride directly for clang 3.5+. 
- ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - "ld1 {v30.8h}, [%4] \n" - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - - // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - - // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. 
- "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(dst_width) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v30", "v31"); -} - -void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 1; - asm volatile( - "movi v31.16b, #3 \n" - - "1: \n" - "ldr q0, [%0], #16 \n" // 0123456789abcdef - "ldr q1, [%1], #16 \n" // 123456789abcdefg - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) - "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) - "ushll2 v4.8h, v0.16b, #0 \n" // 89abcdef (16b) - "ushll2 v5.8h, v1.16b, #0 \n" // 9abcdefg (16b) - - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) - "umlal2 v4.8h, v1.16b, v31.16b \n" // 3*near+far (odd) - "umlal2 v5.8h, v0.16b, v31.16b \n" // 3*near+far (even) - - "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) - "rshrn2 v2.16b, v4.8h, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn2 v1.16b, v5.8h, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.16b, v2.16b}, [%2], #32 \n" - "subs %w3, %w3, #32 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v31" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 1; - const uint8_t* src_temp1 = src_ptr1 + 1; - - asm volatile ( - "movi v31.8b, #3 \n" - "movi v30.8h, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 01234567 - "ldr d1, [%2], #8 \n" // 12345678 - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) - "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" - "ldr d1, [%3], #8 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - - "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b) - "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b) - "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) - "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) - - "mov v0.16b, v4.16b \n" - "mov v1.16b, v5.16b \n" - "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) - "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) - "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) - "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) - - "rshrn v2.8b, v2.8h, #4 \n" // 2, odd - "rshrn v1.8b, v3.8h, #4 \n" // 2, even - "rshrn v4.8b, v4.8h, #4 \n" // 1, odd - "rshrn v3.8b, v5.8h, #4 \n" // 1, even - - "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1 - "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2 - "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - 
"+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile ( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "mov v2.16b, v0.16b \n" - "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd) - "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even) - - "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd) - "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store - "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile ( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "mov v0.16b, v2.16b \n" - "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd) - "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even) - - "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b) - "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - - "mov v0.16b, v4.16b \n" - "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd) - "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even) - - "mov v0.16b, v4.16b \n" - "mov v1.16b, v5.16b \n" - "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd) - "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even) - "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd) - "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even) - - "urshr v2.8h, v2.8h, #4 \n" // 2, odd - "urshr v1.8h, v3.8h, #4 \n" // 2, even - "urshr v4.8h, v4.8h, #4 \n" // 1, odd - "urshr v3.8h, v5.8h, #4 \n" // 1, even - - "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1 - "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2 - - "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v31" // Clobber List - ); -} - -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile ( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) - "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b) - "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b) - "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b) - - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) - "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd) - "umlal v4.4s, 
v0.4h, v31.4h \n" // 3*near+far (1, even) - "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even) - - "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far - "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) - "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far - "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd) - - "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store - "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile ( - "movi v31.4h, #3 \n" - "movi v30.4s, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 0123 (16b) - "ldr d1, [%2], #8 \n" // 1234 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) - "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b) - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) - "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" // 0123 (16b) - "ldr d1, [%3], #8 \n" // 1234 (16b) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b) - "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b) - "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) - "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) - - "mov v0.16b, v4.16b \n" - "mov v1.16b, v5.16b \n" - "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) - "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) - "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) - "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) - - "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far - "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far - "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far - "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far - - "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1 - "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2 - - "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 2; - asm volatile ( - "movi v31.8b, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 00112233 (1u1v) - "ldr d1, [%1], #8 \n" // 11223344 (1u1v) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b) - "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b) - - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) - - "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.4h, v2.4h}, [%2], #16 \n" // store - "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List - ); -} - -void 
ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 2; - const uint8_t* src_temp1 = src_ptr1 + 2; - - asm volatile ( - "movi v31.8b, #3 \n" - "movi v30.8h, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" - "ldr d1, [%2], #8 \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" - "ushll v3.8h, v1.8b, #0 \n" - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" - "ldr d1, [%3], #8 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - - "ushll v4.8h, v0.8b, #0 \n" - "ushll v5.8h, v1.8b, #0 \n" - "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) - "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) - - "mov v0.16b, v4.16b \n" - "mov v1.16b, v5.16b \n" - "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) - "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) - "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) - "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) - - "rshrn v2.8b, v2.8h, #4 \n" // 2, odd - "rshrn v1.8b, v3.8h, #4 \n" // 2, even - "rshrn v4.8b, v4.8h, #4 \n" // 1, odd - "rshrn v3.8b, v5.8h, #4 \n" // 1, even - - "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2 - "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1 - "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 2; - asm volatile ( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) - "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) - "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b) - "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b) - - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd) - "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even) - "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd) - "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even) - - "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even) - "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store - "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store - "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 2; - const uint16_t* src_temp1 = src_ptr1 + 2; - - asm volatile ( - "movi v31.4h, #3 \n" - "movi v30.4s, #3 
\n" - - "1: \n" - "ldr d0, [%0], #8 \n" - "ldr d1, [%2], #8 \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) - "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) - "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" - "ldr d1, [%3], #8 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) - "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) - "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) - "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) - - "mov v0.16b, v4.16b \n" - "mov v1.16b, v5.16b \n" - "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) - "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) - "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) - "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) - - "rshrn v1.4h, v2.4s, #4 \n" // 2, odd - "rshrn v0.4h, v3.4s, #4 \n" // 2, even - "rshrn v3.4h, v4.4s, #4 \n" // 1, odd - "rshrn v2.4h, v5.4s, #4 \n" // 1, even - - "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2 - "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1 - "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -// Add a row of bytes to a row of shorts. Used for box filter. -// Reads 16 bytes and accumulates to 16 shorts at a time. -void ScaleAddRow_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile ( - "1: \n" - "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator - "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes - "uaddw2 v2.8h, v2.8h, v0.16b \n" // add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddw v1.8h, v1.8h, v0.8b \n" - "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator - "subs %w2, %w2, #16 \n" // 16 processed per loop - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2" // Clobber List - ); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - "ld2 {v4.b, v5.b}[" #n "], [%6] \n" - -// The NEON version mimics this formula (from row_common.cc): -// #define BLENDER(a, b, f) (uint8_t)((int)(a) + -// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) - -void ScaleFilterCols_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_ptr; - int64_t x64 = (int64_t)x; // NOLINT - int64_t dx64 = (int64_t)dx; // NOLINT - asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v3.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v1.4s, v1.4s, v0.4s \n" - // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "add v2.4s, v1.4s, v3.4s \n" - "shl v0.4s, v3.4s, #1 \n" // 8 * dx - "1: \n" - LOAD2_DATA8_LANE(0) - LOAD2_DATA8_LANE(1) - LOAD2_DATA8_LANE(2) - LOAD2_DATA8_LANE(3) - LOAD2_DATA8_LANE(4) - LOAD2_DATA8_LANE(5) - LOAD2_DATA8_LANE(6) - LOAD2_DATA8_LANE(7) - "mov v6.16b, v1.16b \n" - "mov v7.16b, v2.16b \n" - "uzp1 
v6.8h, v6.8h, v7.8h \n" - "ushll v4.8h, v4.8b, #0 \n" - "ushll v5.8h, v5.8b, #0 \n" - "ssubl v16.4s, v5.4h, v4.4h \n" - "ssubl2 v17.4s, v5.8h, v4.8h \n" - "ushll v7.4s, v6.4h, #0 \n" - "ushll2 v6.4s, v6.8h, #0 \n" - "mul v16.4s, v16.4s, v7.4s \n" - "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" - "add v4.8h, v4.8h, v6.8h \n" - "xtn v4.8b, v4.8h \n" - - "st1 {v4.8b}, [%0], #8 \n" // store pixels - "add v1.4s, v1.4s, v0.4s \n" - "add v2.4s, v2.4s, v0.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", - "v4", "v5", "v6", "v7", "v16", "v17" - ); -} - -#undef LOAD2_DATA8_LANE - -void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n" - "subs %w[width], %w[width], #8 \n" - "prfm pldl1keep, [%[src], 448] \n" - "uzp2 v0.4s, v0.4s, v1.4s \n" - "uzp2 v1.4s, v2.4s, v3.4s \n" - "st1 {v0.4s, v1.4s}, [%[dst]], #32 \n" - "b.gt 1b \n" - : [src] "+r"(src_ptr), // %[src] - [dst] "+r"(dst), // %[dst] - [width] "+r"(dst_width) // %[width] - : - : "memory", "cc", "v0", "v1", "v2", "v3"); -} - -void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - const uint8_t* src_argb1 = src_argb + 32; - asm volatile( - "1: \n" - "ld2 {v0.4s, v1.4s}, [%[src]] \n" - "add %[src], %[src], #64 \n" - "ld2 {v2.4s, v3.4s}, [%[src1]] \n" - "add %[src1], %[src1], #64 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v1.16b, v2.16b, v3.16b \n" - "subs %w[width], %w[width], #8 \n" - "st1 {v0.16b, v1.16b}, [%[dst]], #32 \n" - "b.gt 1b \n" - : [src] "+r"(src_argb), // %[src] - [src1] "+r"(src_argb1), // %[src1] - [dst] "+r"(dst_argb), // %[dst] - [width] "+r"(dst_width) // %[width] - : - : "memory", "cc", "v0", "v1", "v2", "v3"); -} - -void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( - "1: \n" - "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" - "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n" - "uaddl v2.8h, v0.8b, v1.8b \n" - "uaddl2 v3.8h, v0.16b, v1.16b \n" - "uaddl v22.8h, v20.8b, v21.8b \n" - "uaddl2 v23.8h, v20.16b, v21.16b \n" - "add v0.8h, v2.8h, v22.8h \n" - "add v1.8h, v3.8h, v23.8h \n" - "rshrn v0.8b, v0.8h, #2 \n" - "rshrn v1.8b, v1.8h, #2 \n" - "subs %w[width], %w[width], #4 \n" - "stp d0, d1, [%[dst]], #16 \n" - "b.gt 1b \n" - : [src] "+r"(src_ptr), [src1] "+r"(src_ptr1), [dst] "+r"(dst), - [width] "+r"(dst_width) - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v20", "v21", "v22", "v23"); -} - -void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* src_argb1 = src_argb + src_stepx * 4; - const uint8_t* src_argb2 = src_argb + src_stepx * 8; - const uint8_t* src_argb3 = src_argb + src_stepx * 12; - int64_t i = 0; - (void)src_stride; - asm volatile ( - "1: \n" - "ldr w10, [%[src], %[i]] \n" - "ldr w11, [%[src1], %[i]] \n" - "ldr w12, [%[src2], %[i]] \n" - "ldr w13, [%[src3], %[i]] \n" - "add %[i], %[i], %[step] \n" - "subs %w[width], %w[width], #4 \n" - "prfm pldl1keep, [%[src], 448] \n" - "stp w10, w11, 
[%[dst]], #8 \n" - "stp w12, w13, [%[dst]], #8 \n" - "b.gt 1b \n" - : [src]"+r"(src_argb), - [src1]"+r"(src_argb1), - [src2]"+r"(src_argb2), - [src3]"+r"(src_argb3), - [dst]"+r"(dst_argb), - [width]"+r"(dst_width), - [i]"+r"(i) - : [step]"r"((int64_t)(src_stepx * 16)) - : "memory", "cc", "w10", "w11", "w12", "w13"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -// TODO(Yang Zhang): Might be worth another optimization pass in future. -// It could be upgraded to 8 pixels at a time to start with. -void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - asm volatile( - "add %1, %1, %0 \n" - "1: \n" - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 - "ld1 {v1.8b}, [%1], %4 \n" - "ld1 {v2.8b}, [%0], %4 \n" - "ld1 {v3.8b}, [%1], %4 \n" - "ld1 {v4.8b}, [%0], %4 \n" - "ld1 {v5.8b}, [%1], %4 \n" - "ld1 {v6.8b}, [%0], %4 \n" - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "zip1 v1.2d, v0.2d, v2.2d \n" - "zip2 v2.2d, v0.2d, v2.2d \n" - "zip1 v5.2d, v4.2d, v6.2d \n" - "zip2 v6.2d, v4.2d, v6.2d \n" - "prfm pldl1keep, [%1, 448] \n" - "add v0.8h, v1.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v5.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn v1.8b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. - "stp d0, d1, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"((int64_t)(src_stepx * 4)) // %4 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD1_DATA32_LANE(vn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "ld1 {" #vn ".s}[" #n "], [%6] \n" - -void ScaleARGBCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint8_t* src_tmp = src_argb; - int64_t x64 = (int64_t)x; // NOLINT - int64_t dx64 = (int64_t)dx; // NOLINT - int64_t tmp64; - asm volatile ( - "1: \n" - // clang-format off - LOAD1_DATA32_LANE(v0, 0) - LOAD1_DATA32_LANE(v0, 1) - LOAD1_DATA32_LANE(v0, 2) - LOAD1_DATA32_LANE(v0, 3) - LOAD1_DATA32_LANE(v1, 0) - LOAD1_DATA32_LANE(v1, 1) - LOAD1_DATA32_LANE(v1, 2) - LOAD1_DATA32_LANE(v1, 3) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - // clang-format on - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "=&r"(tmp64), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1"); -} - -#undef LOAD1_DATA32_LANE - -static const uvec8 kScaleARGBFilterColsShuffleIndices = { - 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, -}; - -#define SCALE_ARGB_FILTER_COLS_STEP_ADDR \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" - -void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int64_t tmp; - const uint8_t* src_tmp = src_argb; - int64_t x64 = (int64_t)x; - int64_t dx64 = (int64_t)dx; - asm volatile( - "dup v0.4s, %w3 \n" - "dup v1.4s, %w4 
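// Editorial sketch (not part of the deleted file): the ARGB "Box" kernels
// above (Down2Box / DownEvenBox) average a 2x2 block per channel with
// round-to-nearest, which is what the uaddl/add/rshrn #2 sequences compute.
// Scalar equivalent for the 2x2 case; the helper name is illustrative.
#include <stdint.h>

static void argb_down2_box_ref(const uint8_t* row0, const uint8_t* row1,
                               uint8_t* dst, int dst_width) {
  for (int j = 0; j < dst_width; ++j) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A
      int sum = row0[8 * j + c] + row0[8 * j + 4 + c] +
                row1[8 * j + c] + row1[8 * j + 4 + c];
      dst[4 * j + c] = (uint8_t)((sum + 2) >> 2);  // rounded average of 4
    }
  }
}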
\n" - "ld1 {v2.4s}, [%[kOffsets]] \n" - "shl v6.4s, v1.4s, #2 \n" - "mul v1.4s, v1.4s, v2.4s \n" - "movi v3.16b, #0x7f \n" - - "add v5.4s, v1.4s, v0.4s \n" - "ldr q18, [%[kIndices]] \n" - - "1: \n" // - SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ldr d1, [%6] \n" // - SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ldr d2, [%6] \n" - "shrn v4.4h, v5.4s, #9 \n" // - SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ld1 {v1.d}[1], [%6] \n" // - SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ld1 {v2.d}[1], [%6] \n" - - "subs %w2, %w2, #4 \n" // 4 processed per loop - "and v4.8b, v4.8b, v3.8b \n" - "trn1 v0.4s, v1.4s, v2.4s \n" - "tbl v4.16b, {v4.16b}, v18.16b \n" // f - "trn2 v1.4s, v1.4s, v2.4s \n" - "eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f - - "umull v16.8h, v1.8b, v4.8b \n" - "umull2 v17.8h, v1.16b, v4.16b \n" - "umlal v16.8h, v0.8b, v7.8b \n" - "umlal2 v17.8h, v0.16b, v7.16b \n" - - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "shrn v0.8b, v16.8h, #7 \n" - "shrn v1.8b, v17.8h, #7 \n" - "add v5.4s, v5.4s, v6.4s \n" - "stp d0, d1, [%0], #16 \n" // store pixels - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "=&r"(tmp), // %5 - "+r"(src_tmp) // %6 - : [kIndices] "r"(&kScaleARGBFilterColsShuffleIndices), // %[kIndices] - [kOffsets] "r"(dx_offset) // %[kOffsets] - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19"); -} - -#undef SCALE_ARGB_FILTER_COLS_STEP_ADDR - -// Read 16x2 average down and write 8x1. -void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 - "1: \n" - "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #8 \n" // 8 processed per loop - "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent - "uaddlp v1.4s, v1.8h \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent - "uadalp v1.4s, v3.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "rshrn v0.4h, v0.4s, #2 \n" // round and pack - "rshrn2 v0.8h, v1.4s, #2 \n" - "st1 {v0.8h}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v1.8h}, [%1], #16 \n" // store 8 UV - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1"); -} - -void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile ( - "1: \n" - "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v0.8h}, [%1], #16 \n" // store 8 UV - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1"); -} - -void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. - "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 - "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "prfm pldl1keep, [%1, 448] \n" - "rshrn v1.8b, v1.8h, #2 \n" - "st2 {v0.8b,v1.8b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v16", "v17"); -} - -// Reads 4 pixels at a time. -void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, // pixel step - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src1_ptr = src_ptr + src_stepx * 2; - const uint8_t* src2_ptr = src_ptr + src_stepx * 4; - const uint8_t* src3_ptr = src_ptr + src_stepx * 6; - (void)src_stride; - asm volatile ( - "1: \n" - "ld1 {v0.h}[0], [%0], %6 \n" - "ld1 {v1.h}[0], [%1], %6 \n" - "ld1 {v2.h}[0], [%2], %6 \n" - "ld1 {v3.h}[0], [%3], %6 \n" - "subs %w5, %w5, #4 \n" // 4 pixels per loop. - "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src1_ptr), // %1 - "+r"(src2_ptr), // %2 - "+r"(src3_ptr), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_width) // %5 - : "r"((int64_t)(src_stepx * 8)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/drivers/media/pci/tbscapture2/scale_rvv.c b/drivers/media/pci/tbscapture2/scale_rvv.c deleted file mode 100644 index b25429d8106e..000000000000 --- a/drivers/media/pci/tbscapture2/scale_rvv.c +++ /dev/null @@ -1,1921 +0,0 @@ -/* - * Copyright 2023 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -/* - * Copyright (c) 2023 SiFive, Inc. All rights reserved. - * - * Contributed by Darren Hsieh - * Contributed by Bruce Lai - */ - -#include "row.h" -#include "scale_row.h" - -// This module is for clang rvv. GCC hasn't supported segment load & store. 
-#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \ - defined(__clang__) -#include -#include -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#ifdef HAS_SCALEADDROW_RVV -void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - size_t w = (size_t)src_width; - do { - size_t vl = __riscv_vsetvl_e8m4(w); - vuint8m4_t v_src = __riscv_vle8_v_u8m4(src_ptr, vl); - vuint16m8_t v_dst = __riscv_vle16_v_u16m8(dst_ptr, vl); - // Use widening multiply-add instead of widening + add - v_dst = __riscv_vwmaccu_vx_u16m8(v_dst, 1, v_src, vl); - __riscv_vse16_v_u16m8(dst_ptr, v_dst, vl); - w -= vl; - src_ptr += vl; - dst_ptr += vl; - } while (w > 0); -} -#endif - -#ifdef HAS_SCALEARGBROWDOWN2_RVV -void ScaleARGBRowDown2_RVV(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - size_t w = (size_t)dst_width; - const uint64_t* src = (const uint64_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - do { - size_t vl = __riscv_vsetvl_e64m8(w); - vuint64m8_t v_data = __riscv_vle64_v_u64m8(src, vl); - vuint32m4_t v_dst = __riscv_vnsrl_wx_u32m4(v_data, 32, vl); - __riscv_vse32_v_u32m4(dst, v_dst, vl); - w -= vl; - src += vl; - dst += vl; - } while (w > 0); -} -#endif - -#ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - size_t w = (size_t)dst_width; - const uint32_t* src = (const uint32_t*)(src_argb); - do { - size_t vl = __riscv_vsetvl_e32m4(w); - vuint32m4x2_t v_src = __riscv_vlseg2e32_v_u32m4x2(src, vl); - vuint32m4_t v_even_32 = __riscv_vget_v_u32m4x2_u32m4(v_src, 0); - vuint32m4_t v_odd_32 = __riscv_vget_v_u32m4x2_u32m4(v_src, 1); - vuint8m4_t v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32); - vuint8m4_t v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32); - vuint8m4_t v_dst = - __riscv_vaaddu_vv_u8m4(v_even, v_odd, __RISCV_VXRM_RNU, vl * 4); - __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); - w -= vl; - src += vl * 2; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - size_t w = (size_t)dst_width; - const uint32_t* src = (const uint32_t*)(src_argb); - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). 
- asm volatile ("csrwi vxrm, 0"); - do { - vuint8m4_t v_odd, v_even, v_dst; - vuint32m4_t v_odd_32, v_even_32; - size_t vl = __riscv_vsetvl_e32m4(w); - __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl); - v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32); - v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32); - // Use round-to-nearest-up mode for averaging add - v_dst = __riscv_vaaddu_vv_u8m4(v_even, v_odd, vl * 4); - __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); - w -= vl; - src += vl * 2; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEARGBROWDOWN2BOX_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - size_t w = (size_t)dst_width; - const uint32_t* src0 = (const uint32_t*)(src_argb); - const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); - do { - size_t vl = __riscv_vsetvl_e32m4(w); - vuint32m4x2_t v_src0 = __riscv_vlseg2e32_v_u32m4x2(src0, vl); - vuint32m4x2_t v_src1 = __riscv_vlseg2e32_v_u32m4x2(src1, vl); - vuint32m4_t v_row0_even_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 0); - vuint32m4_t v_row0_odd_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 1); - vuint32m4_t v_row1_even_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 0); - vuint32m4_t v_row1_odd_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 1); - vuint8m4_t v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32); - vuint8m4_t v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32); - vuint8m4_t v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32); - vuint8m4_t v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32); - vuint16m8_t v_row0_sum = - __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4); - vuint16m8_t v_row1_sum = - __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4); - vuint16m8_t v_dst_16 = - __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); - vuint8m4_t v_dst = - __riscv_vnclipu_wx_u8m4(v_dst_16, 2, __RISCV_VXRM_RNU, vl * 4); - __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); - w -= vl; - src0 += vl * 2; - src1 += vl * 2; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - size_t w = (size_t)dst_width; - const uint32_t* src0 = (const uint32_t*)(src_argb); - const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). 
- asm volatile ("csrwi vxrm, 0"); - do { - vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst; - vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16; - vuint32m4_t v_row0_odd_32, v_row0_even_32, v_row1_odd_32, v_row1_even_32; - size_t vl = __riscv_vsetvl_e32m4(w); - __riscv_vlseg2e32_v_u32m4(&v_row0_even_32, &v_row0_odd_32, src0, vl); - __riscv_vlseg2e32_v_u32m4(&v_row1_even_32, &v_row1_odd_32, src1, vl); - v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32); - v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32); - v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32); - v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32); - v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4); - v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4); - v_dst_16 = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); - // Use round-to-nearest-up mode for vnclip - v_dst = __riscv_vnclipu_wx_u8m4(v_dst_16, 2, vl * 4); - __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); - w -= vl; - src0 += vl * 2; - src1 += vl * 2; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEARGBROWDOWNEVEN_RVV -void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - size_t w = (size_t)dst_width; - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - const int stride_byte = src_stepx * 4; - do { - size_t vl = __riscv_vsetvl_e32m8(w); - vuint32m8_t v_row = __riscv_vlse32_v_u32m8(src, stride_byte, vl); - __riscv_vse32_v_u32m8(dst, v_row, vl); - w -= vl; - src += vl * src_stepx; - dst += vl; - } while (w > 0); -} -#endif - -#ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - size_t w = (size_t)dst_width; - const uint32_t* src0 = (const uint32_t*)(src_argb); - const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); - const int stride_byte = src_stepx * 4; - do { - size_t vl = __riscv_vsetvl_e32m4(w); - vuint32m4x2_t v_src0 = __riscv_vlsseg2e32_v_u32m4x2(src0, stride_byte, vl); - vuint32m4x2_t v_src1 = __riscv_vlsseg2e32_v_u32m4x2(src1, stride_byte, vl); - vuint32m4_t v_row0_low_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 0); - vuint32m4_t v_row0_high_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 1); - vuint32m4_t v_row1_low_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 0); - vuint32m4_t v_row1_high_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 1); - vuint8m4_t v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32); - vuint8m4_t v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32); - vuint8m4_t v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32); - vuint8m4_t v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32); - vuint16m8_t v_row0_sum = - __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4); - vuint16m8_t v_row1_sum = - __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4); - vuint16m8_t v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); - vuint8m4_t v_dst = - __riscv_vnclipu_wx_u8m4(v_sum, 2, __RISCV_VXRM_RNU, vl * 4); - __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); - w -= vl; - src0 += vl * src_stepx; - src1 += vl * src_stepx; - dst_argb += vl * 4; - } while (w > 0); -} -#else -void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, - ptrdiff_t 
src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - size_t w = (size_t)dst_width; - const uint32_t* src0 = (const uint32_t*)(src_argb); - const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); - const int stride_byte = src_stepx * 4; - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); - do { - vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst; - vuint16m8_t v_row0_sum, v_row1_sum, v_sum; - vuint32m4_t v_row0_low_32, v_row0_high_32, v_row1_low_32, v_row1_high_32; - size_t vl = __riscv_vsetvl_e32m4(w); - __riscv_vlsseg2e32_v_u32m4(&v_row0_low_32, &v_row0_high_32, src0, - stride_byte, vl); - __riscv_vlsseg2e32_v_u32m4(&v_row1_low_32, &v_row1_high_32, src1, - stride_byte, vl); - v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32); - v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32); - v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32); - v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32); - v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4); - v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4); - v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); - // Use round-to-nearest-up mode for vnclip - v_dst = __riscv_vnclipu_wx_u8m4(v_sum, 2, vl * 4); - __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); - w -= vl; - src0 += vl * src_stepx; - src1 += vl * src_stepx; - dst_argb += vl * 4; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN2_RVV -void ScaleRowDown2_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - size_t w = (size_t)dst_width; - const uint16_t* src = (const uint16_t*)src_ptr; - (void)src_stride; - do { - size_t vl = __riscv_vsetvl_e16m8(w); - vuint16m8_t v_src = __riscv_vle16_v_u16m8(src, vl); - vuint8m4_t v_dst = __riscv_vnsrl_wx_u8m4(v_src, 8, vl); - __riscv_vse8_v_u8m4(dst, v_dst, vl); - w -= vl; - src += vl; - dst += vl; - } while (w > 0); -} -#endif - -#ifdef HAS_SCALEROWDOWN2LINEAR_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - size_t w = (size_t)dst_width; - (void)src_stride; - do { - size_t vl = __riscv_vsetvl_e8m4(w); - vuint8m4x2_t v_src = __riscv_vlseg2e8_v_u8m4x2(src_ptr, vl); - vuint8m4_t v_s0 = __riscv_vget_v_u8m4x2_u8m4(v_src, 0); - vuint8m4_t v_s1 = __riscv_vget_v_u8m4x2_u8m4(v_src, 1); - vuint8m4_t v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, __RISCV_VXRM_RNU, vl); - __riscv_vse8_v_u8m4(dst, v_dst, vl); - w -= vl; - src_ptr += 2 * vl; - dst += vl; - } while (w > 0); -} -#else -void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - size_t w = (size_t)dst_width; - (void)src_stride; - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). 
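// Editorial sketch (not part of the deleted file): ScaleRowDown2_RVV above
// drops every other pixel by loading byte pairs as uint16_t and keeping the
// high byte (vnsrl ..., 8), i.e. the odd-indexed byte on little-endian RISC-V.
// Scalar equivalent:
#include <stdint.h>

static void down2_ref(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[2 * j + 1];  // keep the second pixel of each pair
  }
}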
- asm volatile ("csrwi vxrm, 0"); - do { - vuint8m4_t v_s0, v_s1, v_dst; - size_t vl = __riscv_vsetvl_e8m4(w); - __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, src_ptr, vl); - // Use round-to-nearest-up mode for averaging add - v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, vl); - __riscv_vse8_v_u8m4(dst, v_dst, vl); - w -= vl; - src_ptr += 2 * vl; - dst += vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN2BOX_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - size_t w = (size_t)dst_width; - do { - size_t vl = __riscv_vsetvl_e8m4(w); - vuint8m4x2_t v_s = __riscv_vlseg2e8_v_u8m4x2(s, vl); - vuint8m4x2_t v_t = __riscv_vlseg2e8_v_u8m4x2(t, vl); - vuint8m4_t v_s0 = __riscv_vget_v_u8m4x2_u8m4(v_s, 0); - vuint8m4_t v_s1 = __riscv_vget_v_u8m4x2_u8m4(v_s, 1); - vuint8m4_t v_t0 = __riscv_vget_v_u8m4x2_u8m4(v_t, 0); - vuint8m4_t v_t1 = __riscv_vget_v_u8m4x2_u8m4(v_t, 1); - vuint16m8_t v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl); - vuint16m8_t v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl); - vuint16m8_t v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl); - // Use round-to-nearest-up mode for vnclip - vuint8m4_t v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, __RISCV_VXRM_RNU, vl); - __riscv_vse8_v_u8m4(dst, v_dst, vl); - w -= vl; - s += 2 * vl; - t += 2 * vl; - dst += vl; - } while (w > 0); -} -#else -void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - size_t w = (size_t)dst_width; - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). 
- asm volatile ("csrwi vxrm, 0"); - do { - size_t vl = __riscv_vsetvl_e8m4(w); - vuint8m4_t v_s0, v_s1, v_t0, v_t1; - vuint16m8_t v_s01, v_t01, v_st01; - vuint8m4_t v_dst; - __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, s, vl); - __riscv_vlseg2e8_v_u8m4(&v_t0, &v_t1, t, vl); - v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl); - v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl); - v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl); - // Use round-to-nearest-up mode for vnclip - v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, vl); - __riscv_vse8_v_u8m4(dst, v_dst, vl); - w -= vl; - s += 2 * vl; - t += 2 * vl; - dst += vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN4_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ScaleRowDown4_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width; - (void)src_stride; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(src_ptr, vl); - vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); - __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl); - w -= vl; - src_ptr += (4 * vl); - dst_ptr += vl; - } while (w > 0); -} -#else -void ScaleRowDown4_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width; - (void)src_stride; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_s0, v_s1, v_s2, v_s3; - __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl); - __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl); - w -= vl; - src_ptr += (4 * vl); - dst_ptr += vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN4BOX_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - const uint8_t* src_ptr2 = src_ptr + src_stride * 2; - const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - size_t w = (size_t)dst_width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(src_ptr, vl); - vuint8m2_t v_s0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0); - vuint8m2_t v_s1 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1); - vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); - vuint8m2_t v_s3 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3); - vuint16m4_t v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl); - vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(src_ptr1, vl); - vuint8m2_t v_t0 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0); - vuint8m2_t v_t1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1); - vuint8m2_t v_t2 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2); - vuint8m2_t v_t3 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3); - vuint16m4_t v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl); - vuint8m2x4_t v_u = __riscv_vlseg4e8_v_u8m2x4(src_ptr2, vl); - vuint8m2_t v_u0 = __riscv_vget_v_u8m2x4_u8m2(v_u, 0); - vuint8m2_t v_u1 = __riscv_vget_v_u8m2x4_u8m2(v_u, 1); - vuint8m2_t v_u2 = __riscv_vget_v_u8m2x4_u8m2(v_u, 2); - vuint8m2_t v_u3 = __riscv_vget_v_u8m2x4_u8m2(v_u, 3); - vuint16m4_t v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl); - vuint16m4_t v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl); - vuint16m4_t v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl); - vuint16m4_t v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl); - vuint16m4_t v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl); - vuint16m4_t v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl); - vuint8m2x4_t v_v = __riscv_vlseg4e8_v_u8m2x4(src_ptr3, vl); - vuint8m2_t v_v0 = __riscv_vget_v_u8m2x4_u8m2(v_v, 0); - 
vuint8m2_t v_v1 = __riscv_vget_v_u8m2x4_u8m2(v_v, 1); - vuint8m2_t v_v2 = __riscv_vget_v_u8m2x4_u8m2(v_v, 2); - vuint8m2_t v_v3 = __riscv_vget_v_u8m2x4_u8m2(v_v, 3); - - vuint16m4_t v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl); - vuint16m4_t v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl); - - vuint16m4_t v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl); - vuint16m4_t v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl); - - vuint16m4_t v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl); - vuint16m4_t v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl); - vuint16m4_t v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl); - vuint8m2_t v_dst = - __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, __RISCV_VXRM_RNU, vl); - __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl); - w -= vl; - src_ptr += 4 * vl; - src_ptr1 += 4 * vl; - src_ptr2 += 4 * vl; - src_ptr3 += 4 * vl; - dst_ptr += vl; - } while (w > 0); -} -#else -void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - const uint8_t* src_ptr2 = src_ptr + src_stride * 2; - const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - size_t w = (size_t)dst_width; - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); - do { - vuint8m2_t v_s0, v_s1, v_s2, v_s3; - vuint8m2_t v_t0, v_t1, v_t2, v_t3; - vuint8m2_t v_u0, v_u1, v_u2, v_u3; - vuint8m2_t v_v0, v_v1, v_v2, v_v3; - vuint16m4_t v_s01, v_s23, v_t01, v_t23; - vuint16m4_t v_u01, v_u23, v_v01, v_v23; - vuint16m4_t v_st01, v_st23, v_uv01, v_uv23; - vuint16m4_t v_st0123, v_uv0123, v_stuv0123; - vuint8m2_t v_dst; - size_t vl = __riscv_vsetvl_e8m2(w); - - __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl); - v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl); - - __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, src_ptr1, vl); - v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl); - - __riscv_vlseg4e8_v_u8m2(&v_u0, &v_u1, &v_u2, &v_u3, src_ptr2, vl); - v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl); - v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl); - - v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl); - v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl); - v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl); - v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl); - - __riscv_vlseg4e8_v_u8m2(&v_v0, &v_v1, &v_v2, &v_v3, src_ptr3, vl); - - v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl); - v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl); - - v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl); - v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl); - - v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl); - v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl); - v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl); - // Use round-to-nearest-up mode for vnclip - v_dst = __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, vl); - __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl); - w -= vl; - src_ptr += 4 * vl; - src_ptr1 += 4 * vl; - src_ptr2 += 4 * vl; - src_ptr3 += 4 * vl; - dst_ptr += vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN34_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ScaleRowDown34_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_src = __riscv_vlseg4e8_v_u8m2x4(src_ptr, vl); - vuint8m2_t v_0 = __riscv_vget_v_u8m2x4_u8m2(v_src, 0); - vuint8m2_t v_1 = 
__riscv_vget_v_u8m2x4_u8m2(v_src, 1); - vuint8m2_t v_3 = __riscv_vget_v_u8m2x4_u8m2(v_src, 3); - vuint8m2x3_t v_dst = __riscv_vcreate_v_u8m2x3(v_0, v_1, v_3); - __riscv_vsseg3e8_v_u8m2x3(dst_ptr, v_dst, vl); - w -= vl; - src_ptr += 4 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#else -void ScaleRowDown34_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2_t v_s0, v_s1, v_s2, v_s3; - __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl); - __riscv_vsseg3e8_v_u8m2(dst_ptr, v_s0, v_s1, v_s3, vl); - w -= vl; - src_ptr += 4 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN34_0_BOX_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - do { - vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; - vuint8m2_t v_u0, v_u1, v_u2, v_u3; - vuint16m4_t v_u1_u16; - vuint8m2_t v_a0, v_a1, v_a2; - vuint8m2x3_t v_dst; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(s, vl); - vuint8m2_t v_s0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0); - vuint8m2_t v_s1 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1); - vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); - vuint8m2_t v_s3 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3); - - if (src_stride == 0) { - v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); - v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); - v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl); - v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl); - } else { - vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(t, vl); - vuint8m2_t v_t0 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0); - vuint8m2_t v_t1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1); - vuint8m2_t v_t2 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2); - vuint8m2_t v_t3 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3); - v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl); - v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl); - v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl); - v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl); - t += 4 * vl; - } - - v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl); - v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl); - v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl); - v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl); - - v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, __RISCV_VXRM_RNU, vl); - v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, __RISCV_VXRM_RNU, vl); - v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, __RISCV_VXRM_RNU, vl); - v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, __RISCV_VXRM_RNU, vl); - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl); - v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl); - v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl); - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, __RISCV_VXRM_RNU, vl); - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl); - v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl); - v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl); - - v_dst = __riscv_vcreate_v_u8m2x3(v_a0, v_a1, v_a2); - __riscv_vsseg3e8_v_u8m2x3(dst_ptr, v_dst, vl); - - w -= vl; - s += 4 * vl; - dst_ptr += 3 * 
vl; - } while (w > 0); -} -#else -void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); - do { - vuint8m2_t v_s0, v_s1, v_s2, v_s3; - vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; - vuint8m2_t v_u0, v_u1, v_u2, v_u3; - vuint16m4_t v_u1_u16; - vuint8m2_t v_a0, v_a1, v_a2; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl); - - if (src_stride == 0) { - v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); - v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); - v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl); - v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl); - } else { - vuint8m2_t v_t0, v_t1, v_t2, v_t3; - __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl); - v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl); - v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl); - v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl); - v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl); - t += 4 * vl; - } - - v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl); - v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl); - v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl); - v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl); - - // Use round-to-nearest-up mode for vnclip & averaging add - v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, vl); - v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, vl); - v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, vl); - v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, vl); - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl); - v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl); - v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, vl); - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl); - v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl); - v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); - - __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl); - - w -= vl; - s += 4 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN34_1_BOX_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - do { - vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; - vuint16m4_t v_u1_u16; - vuint8m2_t v_a0, v_a1, v_a2; - vuint8m2x3_t v_dst; - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(s, vl); - vuint8m2_t v_s0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0); - vuint8m2_t v_s1 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1); - vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); - vuint8m2_t v_s3 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3); - - // Use round-to-nearest-up mode for vnclip & averaging add - if (src_stride == 0) { - v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, __RISCV_VXRM_RNU, vl); - v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, __RISCV_VXRM_RNU, vl); - v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, __RISCV_VXRM_RNU, vl); - v_ave3 = 
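// Editorial sketch (not part of the deleted file): the a0/a1/a2 comments above
// are the 4 -> 3 column taps of the 3/4 down-scaler, applied after the rows
// have been vertically filtered. Scalar form over one group of four samples
// u0..u3; the helper name is illustrative.
#include <stdint.h>

static void down34_cols_ref(const uint8_t u[4], uint8_t a[3]) {
  a[0] = (uint8_t)((u[0] * 3 + u[1] + 2) >> 2);  // weights 3:1, rounded
  a[1] = (uint8_t)((u[1] + u[2] + 1) >> 1);      // weights 1:1, rounded
  a[2] = (uint8_t)((u[2] + u[3] * 3 + 2) >> 2);  // weights 1:3, rounded
}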
__riscv_vaaddu_vv_u8m2(v_s3, v_s3, __RISCV_VXRM_RNU, vl); - } else { - vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(t, vl); - vuint8m2_t v_t0 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0); - vuint8m2_t v_t1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1); - vuint8m2_t v_t2 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2); - vuint8m2_t v_t3 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3); - v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, __RISCV_VXRM_RNU, vl); - v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, __RISCV_VXRM_RNU, vl); - v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, __RISCV_VXRM_RNU, vl); - v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, __RISCV_VXRM_RNU, vl); - t += 4 * vl; - } - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl); - v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl); - v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl); - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, __RISCV_VXRM_RNU, vl); - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl); - v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl); - v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl); - - v_dst = __riscv_vcreate_v_u8m2x3(v_a0, v_a1, v_a2); - __riscv_vsseg3e8_v_u8m2x3(dst_ptr, v_dst, vl); - - w -= vl; - s += 4 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#else -void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); - do { - vuint8m2_t v_s0, v_s1, v_s2, v_s3; - vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; - vuint16m4_t v_u1_u16; - vuint8m2_t v_a0, v_a1, v_a2; - size_t vl = __riscv_vsetvl_e8m2(w); - __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl); - - // Use round-to-nearest-up mode for vnclip & averaging add - if (src_stride == 0) { - v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, vl); - v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, vl); - v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, vl); - v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, vl); - } else { - vuint8m2_t v_t0, v_t1, v_t2, v_t3; - __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl); - v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, vl); - v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, vl); - v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, vl); - v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, vl); - t += 4 * vl; - } - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl); - v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl); - v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, vl); - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl); - v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl); - v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl); - - __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl); - - w -= vl; - s += 4 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN38_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ScaleRowDown38_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - 
(void)src_stride; - assert(dst_width % 3 == 0); - do { - size_t vl = __riscv_vsetvl_e8m1(w); - vuint8m1x8_t v_src = __riscv_vlseg8e8_v_u8m1x8(src_ptr, vl); - vuint8m1_t v_s0 = __riscv_vget_v_u8m1x8_u8m1(v_src, 0); - vuint8m1_t v_s3 = __riscv_vget_v_u8m1x8_u8m1(v_src, 3); - vuint8m1_t v_s6 = __riscv_vget_v_u8m1x8_u8m1(v_src, 6); - vuint8m1x3_t v_dst = __riscv_vcreate_v_u8m1x3(v_s0, v_s3, v_s6); - __riscv_vsseg3e8_v_u8m1x3(dst_ptr, v_dst, vl); - w -= vl; - src_ptr += 8 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#else -void ScaleRowDown38_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - (void)src_stride; - assert(dst_width % 3 == 0); - do { - vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7; - size_t vl = __riscv_vsetvl_e8m1(w); - __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6, - &v_s7, src_ptr, vl); - __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl); - w -= vl; - src_ptr += 8 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN38_2_BOX_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - const uint16_t coeff_a = (65536u / 6u); - const uint16_t coeff_b = (65536u / 4u); - assert((dst_width % 3 == 0) && (dst_width > 0)); - do { - vuint16m2_t v_e, v_f, v_g; - vuint8m1_t v_dst_e, v_dst_f, v_dst_g; - vuint8m1x3_t v_dst; - size_t vl = __riscv_vsetvl_e8m1(w); - // s: e00, e10, e20, f00, f10, f20, g00, g10 - vuint8m1x8_t v_s = __riscv_vlseg8e8_v_u8m1x8(src_ptr, vl); - vuint8m1_t v_s0 = __riscv_vget_v_u8m1x8_u8m1(v_s, 0); - vuint8m1_t v_s1 = __riscv_vget_v_u8m1x8_u8m1(v_s, 1); - vuint8m1_t v_s2 = __riscv_vget_v_u8m1x8_u8m1(v_s, 2); - vuint8m1_t v_s3 = __riscv_vget_v_u8m1x8_u8m1(v_s, 3); - vuint8m1_t v_s4 = __riscv_vget_v_u8m1x8_u8m1(v_s, 4); - vuint8m1_t v_s5 = __riscv_vget_v_u8m1x8_u8m1(v_s, 5); - vuint8m1_t v_s6 = __riscv_vget_v_u8m1x8_u8m1(v_s, 6); - vuint8m1_t v_s7 = __riscv_vget_v_u8m1x8_u8m1(v_s, 7); - // t: e01, e11, e21, f01, f11, f21, g01, g11 - vuint8m1x8_t v_t = __riscv_vlseg8e8_v_u8m1x8(src_ptr + src_stride, vl); - vuint8m1_t v_t0 = __riscv_vget_v_u8m1x8_u8m1(v_t, 0); - vuint8m1_t v_t1 = __riscv_vget_v_u8m1x8_u8m1(v_t, 1); - vuint8m1_t v_t2 = __riscv_vget_v_u8m1x8_u8m1(v_t, 2); - vuint8m1_t v_t3 = __riscv_vget_v_u8m1x8_u8m1(v_t, 3); - vuint8m1_t v_t4 = __riscv_vget_v_u8m1x8_u8m1(v_t, 4); - vuint8m1_t v_t5 = __riscv_vget_v_u8m1x8_u8m1(v_t, 5); - vuint8m1_t v_t6 = __riscv_vget_v_u8m1x8_u8m1(v_t, 6); - vuint8m1_t v_t7 = __riscv_vget_v_u8m1x8_u8m1(v_t, 7); - // Calculate sum of [e00, e21] to v_e - // Calculate sum of [f00, f21] to v_f - // Calculate sum of [g00, g11] to v_g - vuint16m2_t v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); - vuint16m2_t v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); - vuint16m2_t v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); - vuint16m2_t v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); - vuint16m2_t v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); - vuint16m2_t v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); - vuint16m2_t v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); - vuint16m2_t v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); - - v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); - v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); - v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl); - v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); - v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, 
vl); - - // Average in 16-bit fixed-point - v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); - v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); - v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); - v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); - v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); - v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); - - v_dst = __riscv_vcreate_v_u8m1x3(v_dst_e, v_dst_f, v_dst_g); - __riscv_vsseg3e8_v_u8m1x3(dst_ptr, v_dst, vl); - w -= vl; - src_ptr += 8 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#else -void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - const uint16_t coeff_a = (65536u / 6u); - const uint16_t coeff_b = (65536u / 4u); - assert((dst_width % 3 == 0) && (dst_width > 0)); - do { - vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7; - vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7; - vuint16m2_t v_e0, v_e1, v_e2, v_e; - vuint16m2_t v_f0, v_f1, v_f2, v_f; - vuint16m2_t v_g0, v_g1, v_g; - vuint8m1_t v_dst_e, v_dst_f, v_dst_g; - size_t vl = __riscv_vsetvl_e8m1(w); - // s: e00, e10, e20, f00, f10, f20, g00, g10 - // t: e01, e11, e21, f01, f11, f21, g01, g11 - __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6, - &v_s7, src_ptr, vl); - __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6, - &v_t7, src_ptr + src_stride, vl); - // Calculate sum of [e00, e21] to v_e - // Calculate sum of [f00, f21] to v_f - // Calculate sum of [g00, g11] to v_g - v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); - v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); - v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); - v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); - v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); - v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); - v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); - v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); - - v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); - v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); - v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl); - v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); - v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl); - - // Average in 16-bit fixed-point - v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); - v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); - v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); - - v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); - v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); - v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); - - __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl); - w -= vl; - src_ptr += 8 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEROWDOWN38_3_BOX_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - const uint16_t coeff_a = (65536u / 9u); - const uint16_t coeff_b = (65536u / 6u); - assert((dst_width % 3 == 0) && (dst_width > 0)); - do { - vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e; - vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f; - vuint16m2_t v_g0, v_g1, v_g2, v_g; - vuint8m1_t v_dst_e, v_dst_f, v_dst_g; - vuint8m1x3_t v_dst; - size_t vl = __riscv_vsetvl_e8m1(w); - // s: e00, e10, e20, f00, f10, f20, g00, g10 - vuint8m1x8_t v_s = __riscv_vlseg8e8_v_u8m1x8(src_ptr, vl); - vuint8m1_t v_s0 = __riscv_vget_v_u8m1x8_u8m1(v_s, 0); - vuint8m1_t v_s1 = __riscv_vget_v_u8m1x8_u8m1(v_s, 1); - vuint8m1_t v_s2 = 
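// Editorial sketch (not part of the deleted file): coeff_a and coeff_b above
// turn "divide by the number of samples in the box" into a 16-bit
// multiply-high (vmulhu) followed by narrowing to the low byte (vnsrl ..., 0):
// the two-row variant sums 6 or 4 samples, the three-row variant 9 or 6.
// Scalar form:
#include <stdint.h>

static inline uint8_t fixed_point_avg(uint16_t sum, uint16_t coeff) {
  // coeff is 65536 / sample_count, so this approximates sum / sample_count.
  return (uint8_t)(((uint32_t)sum * coeff) >> 16);
}
// e.g. fixed_point_avg(sum_of_6, 65536u / 6u) or fixed_point_avg(sum_of_4, 65536u / 4u)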
__riscv_vget_v_u8m1x8_u8m1(v_s, 2); - vuint8m1_t v_s3 = __riscv_vget_v_u8m1x8_u8m1(v_s, 3); - vuint8m1_t v_s4 = __riscv_vget_v_u8m1x8_u8m1(v_s, 4); - vuint8m1_t v_s5 = __riscv_vget_v_u8m1x8_u8m1(v_s, 5); - vuint8m1_t v_s6 = __riscv_vget_v_u8m1x8_u8m1(v_s, 6); - vuint8m1_t v_s7 = __riscv_vget_v_u8m1x8_u8m1(v_s, 7); - // t: e01, e11, e21, f01, f11, f21, g01, g11 - vuint8m1x8_t v_t = __riscv_vlseg8e8_v_u8m1x8(src_ptr + src_stride, vl); - vuint8m1_t v_t0 = __riscv_vget_v_u8m1x8_u8m1(v_t, 0); - vuint8m1_t v_t1 = __riscv_vget_v_u8m1x8_u8m1(v_t, 1); - vuint8m1_t v_t2 = __riscv_vget_v_u8m1x8_u8m1(v_t, 2); - vuint8m1_t v_t3 = __riscv_vget_v_u8m1x8_u8m1(v_t, 3); - vuint8m1_t v_t4 = __riscv_vget_v_u8m1x8_u8m1(v_t, 4); - vuint8m1_t v_t5 = __riscv_vget_v_u8m1x8_u8m1(v_t, 5); - vuint8m1_t v_t6 = __riscv_vget_v_u8m1x8_u8m1(v_t, 6); - vuint8m1_t v_t7 = __riscv_vget_v_u8m1x8_u8m1(v_t, 7); - // u: e02, e12, e22, f02, f12, f22, g02, g12 - vuint8m1x8_t v_u = __riscv_vlseg8e8_v_u8m1x8(src_ptr + 2 * src_stride, vl); - vuint8m1_t v_u0 = __riscv_vget_v_u8m1x8_u8m1(v_u, 0); - vuint8m1_t v_u1 = __riscv_vget_v_u8m1x8_u8m1(v_u, 1); - vuint8m1_t v_u2 = __riscv_vget_v_u8m1x8_u8m1(v_u, 2); - vuint8m1_t v_u3 = __riscv_vget_v_u8m1x8_u8m1(v_u, 3); - vuint8m1_t v_u4 = __riscv_vget_v_u8m1x8_u8m1(v_u, 4); - vuint8m1_t v_u5 = __riscv_vget_v_u8m1x8_u8m1(v_u, 5); - vuint8m1_t v_u6 = __riscv_vget_v_u8m1x8_u8m1(v_u, 6); - vuint8m1_t v_u7 = __riscv_vget_v_u8m1x8_u8m1(v_u, 7); - // Calculate sum of [e00, e22] - v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); - v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); - v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); - v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl); - v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl); - - v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); - v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl); - v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl); - v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl); - // Calculate sum of [f00, f22] - v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); - v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); - v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); - v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl); - v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl); - - v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); - v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl); - v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl); - v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); - // Calculate sum of [g00, g12] - v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); - v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); - v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl); - - v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl); - v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl); - - // Average in 16-bit fixed-point - v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); - v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); - v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); - v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); - v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); - v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); - - v_dst = __riscv_vcreate_v_u8m1x3(v_dst_e, v_dst_f, v_dst_g); - __riscv_vsseg3e8_v_u8m1x3(dst_ptr, v_dst, vl); - w -= vl; - src_ptr += 8 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#else -void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - size_t w = (size_t)dst_width / 3u; - const uint16_t coeff_a = (65536u / 9u); - const uint16_t coeff_b = (65536u / 6u); - assert((dst_width % 3 == 0) && (dst_width > 0)); - do { - vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, 
v_s6, v_s7; - vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7; - vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7; - vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e; - vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f; - vuint16m2_t v_g0, v_g1, v_g2, v_g; - vuint8m1_t v_dst_e, v_dst_f, v_dst_g; - size_t vl = __riscv_vsetvl_e8m1(w); - // s: e00, e10, e20, f00, f10, f20, g00, g10 - // t: e01, e11, e21, f01, f11, f21, g01, g11 - // u: e02, e12, e22, f02, f12, f22, g02, g12 - __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6, - &v_s7, src_ptr, vl); - __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6, - &v_t7, src_ptr + src_stride, vl); - __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6, - &v_u7, src_ptr + 2 * src_stride, vl); - // Calculate sum of [e00, e22] - v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); - v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); - v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); - v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl); - v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl); - - v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); - v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl); - v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl); - v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl); - // Calculate sum of [f00, f22] - v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); - v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); - v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); - v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl); - v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl); - - v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); - v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl); - v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl); - v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); - // Calculate sum of [g00, g12] - v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); - v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); - v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl); - - v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl); - v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl); - - // Average in 16-bit fixed-point - v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); - v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); - v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); - - v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); - v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); - v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); - __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl); - w -= vl; - src_ptr += 8 * vl; - dst_ptr += 3 * vl; - } while (w > 0); -} -#endif -#endif - -// ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms' -// ScaleRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. Other -// platforms only implement non-edge part of image and process edge with scalar. 
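// Editorial sketch (not part of the deleted file): the comment above notes
// that these RVV up-samplers handle the row edges themselves instead of
// leaving them to a scalar "Any" wrapper. A scalar model of the whole-row
// linear case, edges included (assumes dst_width >= 2; name illustrative):
#include <stdint.h>

static void up2_linear_row_ref(const uint8_t* src, uint8_t* dst, int dst_width) {
  dst[0] = src[0];                                  // left edge: replicate
  for (int j = 1; j < dst_width - 1; j += 2) {
    int i = (j - 1) / 2;
    dst[j] = (uint8_t)((3 * src[i] + src[i + 1] + 2) >> 2);
    dst[j + 1] = (uint8_t)((src[i] + 3 * src[i + 1] + 2) >> 2);
  }
  dst[dst_width - 1] = src[(dst_width - 1) / 2];    // right edge: replicate
}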
- -#ifdef HAS_SCALEROWUP2_LINEAR_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - size_t work_width = (size_t)dst_width - 1u; - size_t src_width = work_width >> 1u; - const uint8_t* work_src_ptr = src_ptr; - uint8_t* work_dst_ptr = dst_ptr + 1; - size_t vl = __riscv_vsetvlmax_e8m4(); - vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl); - dst_ptr[0] = src_ptr[0]; - while (src_width > 0) { - vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even; - vuint16m8_t v_src0_u16, v_src1_u16; - vuint8m4x2_t v_dst; - size_t vl = __riscv_vsetvl_e8m4(src_width); - v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); - v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl); - - v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl); - v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl); - v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl); - v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl); - - v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl); - v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl); - - v_dst = __riscv_vcreate_v_u8m4x2(v_dst_even, v_dst_odd); - __riscv_vsseg2e8_v_u8m4x2(work_dst_ptr, v_dst, vl); - - src_width -= vl; - work_src_ptr += vl; - work_dst_ptr += 2 * vl; - } - dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; -} -#else -void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - size_t work_width = (size_t)dst_width - 1u; - size_t src_width = work_width >> 1u; - const uint8_t* work_src_ptr = src_ptr; - uint8_t* work_dst_ptr = dst_ptr + 1; - size_t vl = __riscv_vsetvlmax_e8m4(); - vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl); - dst_ptr[0] = src_ptr[0]; - while (src_width > 0) { - vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even; - vuint16m8_t v_src0_u16, v_src1_u16; - size_t vl = __riscv_vsetvl_e8m4(src_width); - v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); - v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl); - - v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl); - v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl); - v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl); - v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl); - - v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl); - v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl); - - __riscv_vsseg2e8_v_u8m4(work_dst_ptr, v_dst_even, v_dst_odd, vl); - - src_width -= vl; - work_src_ptr += vl; - work_dst_ptr += 2 * vl; - } - dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; -} -#endif -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - size_t work_width = ((size_t)dst_width - 1u) & ~1u; - size_t src_width = work_width >> 1u; - const uint8_t* work_s = src_ptr; - const uint8_t* work_t = src_ptr + src_stride; - const uint8_t* s = work_s; - const uint8_t* t = work_t; - uint8_t* d = dst_ptr; - uint8_t* e = dst_ptr + dst_stride; - uint8_t* work_d = d + 1; - uint8_t* work_e = e + 1; - size_t vl = __riscv_vsetvlmax_e16m4(); - vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); - vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); - d[0] = (3 * s[0] + t[0] + 2) >> 2; - e[0] = (s[0] + 3 * t[0] + 2) >> 2; - while (src_width > 0) { - vuint8m2_t v_s0, v_s1, v_t0, v_t1; - vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; - vuint16m4_t v_t0_u16_, v_t1_u16_; - vuint8m2_t v_dst0_even, v_dst0_odd, 
v_dst1_even, v_dst1_odd; - vuint8m2x2_t v_dst0, v_dst1; - size_t vl = __riscv_vsetvl_e8m2(src_width); - v_s0 = __riscv_vle8_v_u8m2(work_s, vl); - v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl); - - v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); - v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); - v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); - v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); - - v_t0 = __riscv_vle8_v_u8m2(work_t, vl); - v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl); - - v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); - v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); - v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); - v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); - - v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); - v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); - - v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); - v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl); - v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); - v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); - - v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); - v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); - v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); - v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); - - v_dst0 = __riscv_vcreate_v_u8m2x2(v_dst0_even, v_dst0_odd); - __riscv_vsseg2e8_v_u8m2x2(work_d, v_dst0, vl); - v_dst1 = __riscv_vcreate_v_u8m2x2(v_dst1_even, v_dst1_odd); - __riscv_vsseg2e8_v_u8m2x2(work_e, v_dst1, vl); - src_width -= vl; - work_s += vl; - work_t += vl; - work_d += 2 * vl; - work_e += 2 * vl; - } - d[dst_width - 1] = - (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2; - e[dst_width - 1] = - (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2; -} -#else -void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - size_t work_width = ((size_t)dst_width - 1u) & ~1u; - size_t src_width = work_width >> 1u; - const uint8_t* work_s = src_ptr; - const uint8_t* work_t = src_ptr + src_stride; - const uint8_t* s = work_s; - const uint8_t* t = work_t; - uint8_t* d = dst_ptr; - uint8_t* e = dst_ptr + dst_stride; - uint8_t* work_d = d + 1; - uint8_t* work_e = e + 1; - size_t vl = __riscv_vsetvlmax_e16m4(); - vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); - vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); - d[0] = (3 * s[0] + t[0] + 2) >> 2; - e[0] = (s[0] + 3 * t[0] + 2) >> 2; - while (src_width > 0) { - vuint8m2_t v_s0, v_s1, v_t0, v_t1; - vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; - vuint16m4_t v_t0_u16_, v_t1_u16_; - vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd; - size_t vl = __riscv_vsetvl_e8m2(src_width); - v_s0 = __riscv_vle8_v_u8m2(work_s, vl); - v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl); - - v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); - v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); - v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); - v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); - - v_t0 = __riscv_vle8_v_u8m2(work_t, vl); - v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl); - - v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); - v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); - v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); - v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); - - v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); - 
v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); - - v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); - v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl); - v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); - v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); - - v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); - v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); - v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); - v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); - - __riscv_vsseg2e8_v_u8m2(work_d, v_dst0_even, v_dst0_odd, vl); - __riscv_vsseg2e8_v_u8m2(work_e, v_dst1_even, v_dst1_odd, vl); - - src_width -= vl; - work_s += vl; - work_t += vl; - work_d += 2 * vl; - work_e += 2 * vl; - } - d[dst_width - 1] = - (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2; - e[dst_width - 1] = - (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2; -} -#endif -#endif - -#ifdef HAS_SCALEUVROWDOWN2_RVV -void ScaleUVRowDown2_RVV(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - size_t w = (size_t)dst_width; - const uint32_t* src = (const uint32_t*)src_uv; - uint16_t* dst = (uint16_t*)dst_uv; - (void)src_stride; - do { - size_t vl = __riscv_vsetvl_e32m8(w); - vuint32m8_t v_data = __riscv_vle32_v_u32m8(src, vl); - vuint16m4_t v_u1v1 = __riscv_vnsrl_wx_u16m4(v_data, 16, vl); - __riscv_vse16_v_u16m4(dst, v_u1v1, vl); - w -= vl; - src += vl; - dst += vl; - } while (w > 0); -} -#endif - -#ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - size_t w = (size_t)dst_width; - const uint16_t* src = (const uint16_t*)src_uv; - (void)src_stride; - do { - size_t vl = __riscv_vsetvl_e16m4(w); - vuint16m4x2_t v_src = __riscv_vlseg2e16_v_u16m4x2(src, vl); - vuint16m4_t v_u0v0_16 = __riscv_vget_v_u16m4x2_u16m4(v_src, 0); - vuint16m4_t v_u1v1_16 = __riscv_vget_v_u16m4x2_u16m4(v_src, 1); - vuint8m4_t v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16); - vuint8m4_t v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16); - vuint8m4_t v_avg = - __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, __RISCV_VXRM_RNU, vl * 2); - __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2); - w -= vl; - src += vl * 2; - dst_uv += vl * 2; - } while (w > 0); -} -#else -void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - size_t w = (size_t)dst_width; - const uint16_t* src = (const uint16_t*)src_uv; - (void)src_stride; - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). 
- asm volatile ("csrwi vxrm, 0"); - do { - vuint8m4_t v_u0v0, v_u1v1, v_avg; - vuint16m4_t v_u0v0_16, v_u1v1_16; - size_t vl = __riscv_vsetvl_e16m4(w); - __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl); - v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16); - v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16); - // Use round-to-nearest-up mode for averaging add - v_avg = __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, vl * 2); - __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2); - w -= vl; - src += vl * 2; - dst_uv += vl * 2; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEUVROWDOWN2BOX_RVV -#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) -void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - const uint8_t* src_uv_row1 = src_uv + src_stride; - size_t w = (size_t)dst_width; - do { - size_t vl = __riscv_vsetvl_e8m2(w); - vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(src_uv, vl); - vuint8m2_t v_u0_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0); - vuint8m2_t v_v0_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1); - vuint8m2_t v_u1_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); - vuint8m2_t v_v1_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3); - vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(src_uv_row1, vl); - vuint8m2_t v_u0_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0); - vuint8m2_t v_v0_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1); - vuint8m2_t v_u1_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2); - vuint8m2_t v_v1_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3); - - vuint16m4_t v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl); - vuint16m4_t v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl); - vuint16m4_t v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl); - vuint16m4_t v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl); - vuint16m4_t v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl); - vuint16m4_t v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl); - vuint8m2_t v_dst_u = - __riscv_vnclipu_wx_u8m2(v_sum0, 2, __RISCV_VXRM_RNU, vl); - vuint8m2_t v_dst_v = - __riscv_vnclipu_wx_u8m2(v_sum1, 2, __RISCV_VXRM_RNU, vl); - - vuint8m2x2_t v_dst_uv = __riscv_vcreate_v_u8m2x2(v_dst_u, v_dst_v); - __riscv_vsseg2e8_v_u8m2x2(dst_uv, v_dst_uv, vl); - - dst_uv += 2 * vl; - src_uv += 4 * vl; - w -= vl; - src_uv_row1 += 4 * vl; - } while (w > 0); -} -#else -void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - const uint8_t* src_uv_row1 = src_uv + src_stride; - size_t w = (size_t)dst_width; - // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode - // register) is set to round-to-nearest-up mode(0). 
- asm volatile ("csrwi vxrm, 0"); - do { - vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0; - vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1; - vuint16m4_t v_u0u1_row0, v_u0u1_row1, v_v0v1_row0, v_v0v1_row1; - vuint16m4_t v_sum0, v_sum1; - vuint8m2_t v_dst_u, v_dst_v; - size_t vl = __riscv_vsetvl_e8m2(w); - - __riscv_vlseg4e8_v_u8m2(&v_u0_row0, &v_v0_row0, &v_u1_row0, &v_v1_row0, - src_uv, vl); - __riscv_vlseg4e8_v_u8m2(&v_u0_row1, &v_v0_row1, &v_u1_row1, &v_v1_row1, - src_uv_row1, vl); - - v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl); - v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl); - v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl); - v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl); - - v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl); - v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl); - // Use round-to-nearest-up mode for vnclip - v_dst_u = __riscv_vnclipu_wx_u8m2(v_sum0, 2, vl); - v_dst_v = __riscv_vnclipu_wx_u8m2(v_sum1, 2, vl); - - __riscv_vsseg2e8_v_u8m2(dst_uv, v_dst_u, v_dst_v, vl); - - dst_uv += 2 * vl; - src_uv += 4 * vl; - w -= vl; - src_uv_row1 += 4 * vl; - } while (w > 0); -} -#endif -#endif - -#ifdef HAS_SCALEUVROWDOWN4_RVV -void ScaleUVRowDown4_RVV(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width) { - // Overflow will never happen here, since sizeof(size_t)/sizeof(int)=2. - // dst_width = src_width / 4 and src_width is also int. - size_t w = (size_t)dst_width * 8; - (void)src_stride; - (void)src_stepx; - do { - size_t vl = __riscv_vsetvl_e8m8(w); - vuint8m8_t v_row = __riscv_vle8_v_u8m8(src_uv, vl); - vuint64m8_t v_row_64 = __riscv_vreinterpret_v_u8m8_u64m8(v_row); - // Narrowing without clipping - vuint32m4_t v_tmp = __riscv_vncvt_x_x_w_u32m4(v_row_64, vl / 8); - vuint16m2_t v_dst_16 = __riscv_vncvt_x_x_w_u16m2(v_tmp, vl / 8); - vuint8m2_t v_dst = __riscv_vreinterpret_v_u16m2_u8m2(v_dst_16); - __riscv_vse8_v_u8m2(dst_uv, v_dst, vl / 4); - w -= vl; - src_uv += vl; - dst_uv += vl / 4; - } while (w > 0); -} -#endif - -#ifdef HAS_SCALEUVROWDOWNEVEN_RVV -void ScaleUVRowDownEven_RVV(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width) { - size_t w = (size_t)dst_width; - const ptrdiff_t stride_byte = (ptrdiff_t)src_stepx * 2; - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - (void)src_stride; - do { - size_t vl = __riscv_vsetvl_e16m8(w); - vuint16m8_t v_row = __riscv_vlse16_v_u16m8(src, stride_byte, vl); - __riscv_vse16_v_u16m8(dst, v_row, vl); - w -= vl; - src += vl * src_stepx; - dst += vl; - } while (w > 0); -} -#endif - -// ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms' -// ScaleUVRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. -// Other platforms only implement non-edge part of image and process edge with -// scalar. 
- -#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - size_t work_width = ((size_t)dst_width - 1u) & ~1u; - uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1; - const uint8_t* work_src_ptr = src_ptr; - size_t vl = __riscv_vsetvlmax_e8m4(); - vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl); - dst_ptr[0] = src_ptr[0]; - dst_ptr[1] = src_ptr[1]; - while (work_width > 0) { - vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8; - vuint16m4_t v_dst_odd, v_dst_even; - vuint16m8_t v_uv0_u16, v_uv1_u16; - vuint16m4x2_t v_dst; - size_t vl = __riscv_vsetvl_e8m4(work_width); - v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); - v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl); - - v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl); - v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl); - - v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl); - v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl); - - v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl); - v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl); - - v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8); - v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8); - - v_dst = __riscv_vcreate_v_u16m4x2(v_dst_even, v_dst_odd); - __riscv_vsseg2e16_v_u16m4x2(work_dst_ptr, v_dst, vl / 2); - - work_width -= vl; - work_src_ptr += vl; - work_dst_ptr += vl; - } - dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; - dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; -} -#else -void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - size_t work_width = ((size_t)dst_width - 1u) & ~1u; - uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1; - const uint8_t* work_src_ptr = src_ptr; - size_t vl = __riscv_vsetvlmax_e8m4(); - vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl); - dst_ptr[0] = src_ptr[0]; - dst_ptr[1] = src_ptr[1]; - while (work_width > 0) { - vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8; - vuint16m4_t v_dst_odd, v_dst_even; - vuint16m8_t v_uv0_u16, v_uv1_u16; - size_t vl = __riscv_vsetvl_e8m4(work_width); - v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); - v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl); - - v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl); - v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl); - - v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl); - v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl); - - v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl); - v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl); - - v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8); - v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8); - - __riscv_vsseg2e16_v_u16m4(work_dst_ptr, v_dst_even, v_dst_odd, vl / 2); - - work_width -= vl; - work_src_ptr += vl; - work_dst_ptr += vl; - } - dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; - dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; -} -#endif -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV -#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE -void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - size_t work_width = ((size_t)dst_width - 1u) & ~1u; - const uint8_t* work_s = src_ptr; - const uint8_t* work_t = src_ptr + src_stride; - const uint8_t* s = work_s; - const uint8_t* t = work_t; - uint8_t* d = dst_ptr; - 
uint8_t* e = dst_ptr + dst_stride; - uint16_t* work_d = (uint16_t*)d + 1; - uint16_t* work_e = (uint16_t*)e + 1; - size_t vl = __riscv_vsetvlmax_e16m4(); - vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); - vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); - d[0] = (3 * s[0] + t[0] + 2) >> 2; - e[0] = (s[0] + 3 * t[0] + 2) >> 2; - d[1] = (3 * s[1] + t[1] + 2) >> 2; - e[1] = (s[1] + 3 * t[1] + 2) >> 2; - while (work_width > 0) { - vuint8m2_t v_s0, v_s1, v_t0, v_t1; - vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; - vuint16m4_t v_t0_u16_, v_t1_u16_; - vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8; - vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd; - vuint16m2x2_t v_dst0, v_dst1; - size_t vl = __riscv_vsetvl_e8m2(work_width); - v_s0 = __riscv_vle8_v_u8m2(work_s, vl); - v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl); - - v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); - v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); - v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); - v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); - - v_t0 = __riscv_vle8_v_u8m2(work_t, vl); - v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl); - - v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); - v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); - v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); - v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); - - v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); - v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); - - v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); - v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl); - v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); - v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); - - v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); - v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); - v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); - v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); - - v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8); - v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8); - v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8); - v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8); - - v_dst0 = __riscv_vcreate_v_u16m2x2(v_dst0_even, v_dst0_odd); - __riscv_vsseg2e16_v_u16m2x2(work_d, v_dst0, vl / 2); - v_dst1 = __riscv_vcreate_v_u16m2x2(v_dst1_even, v_dst1_odd); - __riscv_vsseg2e16_v_u16m2x2(work_e, v_dst1, vl / 2); - - work_width -= vl; - work_s += vl; - work_t += vl; - work_d += vl; - work_e += vl; - } - d[2 * dst_width - 2] = - (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >> - 2; - e[2 * dst_width - 2] = - (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >> - 2; - d[2 * dst_width - 1] = - (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >> - 2; - e[2 * dst_width - 1] = - (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >> - 2; -} -#else -void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - size_t work_width = ((size_t)dst_width - 1u) & ~1u; - const uint8_t* work_s = src_ptr; - const uint8_t* work_t = src_ptr + src_stride; - const uint8_t* s = work_s; - const uint8_t* t = work_t; - uint8_t* d = dst_ptr; - uint8_t* e = dst_ptr + dst_stride; - uint16_t* work_d = (uint16_t*)d + 1; - uint16_t* work_e = 
(uint16_t*)e + 1; - size_t vl = __riscv_vsetvlmax_e16m4(); - vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); - vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); - d[0] = (3 * s[0] + t[0] + 2) >> 2; - e[0] = (s[0] + 3 * t[0] + 2) >> 2; - d[1] = (3 * s[1] + t[1] + 2) >> 2; - e[1] = (s[1] + 3 * t[1] + 2) >> 2; - while (work_width > 0) { - vuint8m2_t v_s0, v_s1, v_t0, v_t1; - vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; - vuint16m4_t v_t0_u16_, v_t1_u16_; - vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8; - vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd; - size_t vl = __riscv_vsetvl_e8m2(work_width); - v_s0 = __riscv_vle8_v_u8m2(work_s, vl); - v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl); - - v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); - v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); - v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); - v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); - - v_t0 = __riscv_vle8_v_u8m2(work_t, vl); - v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl); - - v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); - v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); - v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); - v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); - - v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); - v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); - - v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); - v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl); - v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); - v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); - - v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); - v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); - v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); - v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); - - v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8); - v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8); - v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8); - v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8); - - __riscv_vsseg2e16_v_u16m2(work_d, v_dst0_even, v_dst0_odd, vl / 2); - __riscv_vsseg2e16_v_u16m2(work_e, v_dst1_even, v_dst1_odd, vl / 2); - - work_width -= vl; - work_s += vl; - work_t += vl; - work_d += vl; - work_e += vl; - } - d[2 * dst_width - 2] = - (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >> - 2; - e[2 * dst_width - 2] = - (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >> - 2; - d[2 * dst_width - 1] = - (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >> - 2; - e[2 * dst_width - 1] = - (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >> - 2; -} -#endif -#endif - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && - // defined(__clang__) diff --git a/drivers/media/pci/tbscapture2/scale_win.c b/drivers/media/pci/tbscapture2/scale_win.c deleted file mode 100644 index 0c11254e56ba..000000000000 --- a/drivers/media/pci/tbscapture2/scale_win.c +++ /dev/null @@ -1,1392 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" -#include "scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -// Offsets for source bytes 0 to 9 -static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 0 to 10 -static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, - 8, 9, 9, 10, 10, 11, 12, 13}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; - -// Coefficients for source bytes 0 to 10 -static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; - -// Coefficients for source bytes 10 to 21 -static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; - -// Coefficients for source bytes 21 to 31 -static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; - -// Coefficients for source bytes 21 to 31 -static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; - -static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 0,1,2 -static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 3,4,5 -static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x3 and 2x3 -static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; - -// Arrange first value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; - -// Arrange second value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; - -// Arrange third value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x2 and 2x2 -static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; - -// Reads 32 pixels, throws half away and writes 16 pixels. 
-__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} - -// Blends 32x1 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add - paddw xmm1, xmm3 - psrlw xmm0, 1 - psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop esi - ret - } -} - -#ifdef HAS_SCALEROWDOWN2_AVX2 -// Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // isolate odd pixels. - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - vzeroupper - ret - } -} - -// Blends 64x1 rectangle to 32x1. 
-__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b - vpsrlw ymm4, ymm4, 15 - vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 - vpavgw ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - vzeroupper - ret - } -} - -// For rounding, average = (sum + 2) / 4 -// becomes average((sum >> 1), 0) -// Blends 64x2 rectangle to 32x1. -__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b - vpsrlw ymm4, ymm4, 15 - vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + esi] - vmovdqu ymm3, [eax + esi + 32] - lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add - vpaddw ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 - vpsrlw ymm1, ymm1, 1 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 - vpavgw ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_SCALEROWDOWN2_AVX2 - -// Point samples 32 pixels to 8 pixels. -__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 - psrld xmm5, 24 - pslld xmm5, 16 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - ret - } -} - -// Blends 32x4 rectangle to 8x1. 
-__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - movdqa xmm5, xmm4 - packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 - - wloop: - movdqu xmm0, [eax] // average rows - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 - paddw xmm1, xmm3 - movdqu xmm2, [eax + esi * 2] - movdqu xmm3, [eax + esi * 2 + 16] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 - paddw xmm1, xmm3 - movdqu xmm2, [eax + edi] - movdqu xmm3, [eax + edi + 16] - lea eax, [eax + 32] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 - paddw xmm1, xmm3 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 for average of 4 * 4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_SCALEROWDOWN4_AVX2 -// Point samples 64 pixels to 16 pixels. -__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 - vpsrld ymm5, ymm5, 24 - vpslld ymm5, ymm5, 16 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - vzeroupper - ret - } -} - -// Blends 64x4 rectangle to 16x1. 
-__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 - vpsrlw ymm4, ymm4, 15 - vpsllw ymm5, ymm4, 3 // constant 0x0008 - vpackuswb ymm4, ymm4, ymm4 - - wloop: - vmovdqu ymm0, [eax] // average rows - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + esi] - vmovdqu ymm3, [eax + esi + 32] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 - vpaddw ymm1, ymm1, ymm3 - vmovdqu ymm2, [eax + esi * 2] - vmovdqu ymm3, [eax + esi * 2 + 32] - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 2 - vpaddw ymm1, ymm1, ymm3 - vmovdqu ymm2, [eax + edi] - vmovdqu ymm3, [eax + edi + 32] - lea eax, [eax + 64] - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 3 - vpaddw ymm1, ymm1, ymm3 - vphaddw ymm0, ymm0, ymm1 // mutates - vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw - vpaddw ymm0, ymm0, ymm5 // + 8 for round - vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_SCALEROWDOWN4_AVX2 - -// Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm3, xmmword ptr kShuf0 - movdqa xmm4, xmmword ptr kShuf1 - movdqa xmm5, xmmword ptr kShuf2 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Register usage: -// xmm0 src_row 0 -// xmm1 src_row 1 -// xmm2 shuf 0 -// xmm3 shuf 1 -// xmm4 shuf 2 -// xmm5 madd 0 -// xmm6 madd 1 -// xmm7 kRound34 - -// Note that movdqa+palign may be better than movdqu. 
-__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} - -// Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx+24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} - -// 3/8 point sampler - -// Scale 32 pixels to 12 -__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm4, xmmword ptr kShuf38a - movdqa xmm5, xmmword ptr kShuf38b - - xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 - lea eax, [eax + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - paddusb xmm0, xmm1 - - movq qword ptr [edx], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edx + 8], xmm1 - lea edx, [edx + 12] - sub ecx, 12 - jg xloop - - ret - } -} - -// Scale 16x3 pixels to 6x1 with 
interpolation -__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAc - movdqa xmm3, xmmword ptr kShufAc3 - movdqa xmm4, xmmword ptr kScaleAc33 - pxor xmm5, xmm5 - - xloop: - movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqu xmm6, [eax + esi] - movhlps xmm1, xmm0 - movhlps xmm7, xmm6 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - movdqu xmm6, [eax + esi * 2] - lea eax, [eax + 16] - movhlps xmm7, xmm6 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - pshufb xmm6, xmm2 - - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - pshufb xmm7, xmm3 - paddusw xmm6, xmm7 - - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 - packuswb xmm6, xmm6 - - movd [edx], xmm6 // write 6 pixels - psrlq xmm6, 16 - movd [edx + 2], xmm6 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} - -// Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAb0 - movdqa xmm3, xmmword ptr kShufAb1 - movdqa xmm4, xmmword ptr kShufAb2 - movdqa xmm5, xmmword ptr kScaleAb2 - - xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 - movdqu xmm1, [eax + esi] - lea eax, [eax + 16] - pavgb xmm0, xmm1 - - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 - pshufb xmm1, xmm2 - movdqa xmm6, xmm0 - pshufb xmm6, xmm3 - paddusw xmm1, xmm6 - pshufb xmm0, xmm4 - paddusw xmm1, xmm0 - - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 - packuswb xmm1, xmm1 - - movd [edx], xmm1 // write 6 pixels - psrlq xmm1, 16 - movd [edx + 2], xmm1 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} - -// Reads 16 bytes and accumulates to 16 shorts at a time. -__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - pxor xmm5, xmm5 - - // sum rows - xloop: - movdqu xmm3, [eax] // read 16 bytes - lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination - movdqu xmm1, [edx + 16] - movdqa xmm2, xmm3 - punpcklbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words - paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 16 - jg xloop - ret - } -} - -#ifdef HAS_SCALEADDROW_AVX2 -// Reads 32 bytes and accumulates to 32 shorts at a time. 
-__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - vpxor ymm5, ymm5, ymm5 - - // sum rows - xloop: - vmovdqu ymm3, [eax] // read 32 bytes - lea eax, [eax + 32] - vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck - vpunpcklbw ymm2, ymm3, ymm5 - vpunpckhbw ymm3, ymm3, ymm5 - vpaddusw ymm0, ymm2, [edx] // sum 16 words - vpaddusw ymm1, ymm3, [edx + 32] - vmovdqu [edx], ymm0 // write 32 words to destination - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 32 - jg xloop - - vzeroupper - ret - } -} -#endif // HAS_SCALEADDROW_AVX2 - -// Constant for making pixels signed to avoid pmaddubsw -// saturation. -static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Constant for making pixels unsigned and adding .5 for rounding. -static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; - -// Bilinear column filtering. SSSE3 version. -__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - push ebx - push esi - push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width - movd xmm2, [esp + 12 + 16] // x - movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. - movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 - psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. - movzx ebx, word ptr [esi + edx] // 2 source x1 pixels - movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 - punpcklwd xmm0, xmm4 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. - movd ebx, xmm1 - mov [edi], bx - lea edi, [edi + 2] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit - paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits - movd ebx, xmm2 - mov [edi], bl - - xloop99: - - pop edi - pop esi - pop ebx - ret - } -} - -// Reads 16 pixels, duplicates them and writes 32 pixels. 
-__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width - - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 - punpckhbw xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - ret - } -} - -// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - shufps xmm0, xmm1, 0xdd - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} - -// Blends 8x1 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} - -// Blends 8x2 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop esi - ret - } -} - -// Reads 4 pixels at a time. -__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - wloop: - movd xmm0, [eax] - movd xmm1, [eax + ebx] - punpckldq xmm0, xmm1 - movd xmm2, [eax + ebx * 2] - movd xmm3, [eax + edi] - lea eax, [eax + ebx * 4] - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop ebx - ret - } -} - -// Blends four 2x2 to 4x1. 
-__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - wloop: - movq xmm0, qword ptr [eax] // row0 4 pairs - movhps xmm0, qword ptr [eax + ebx] - movq xmm1, qword ptr [eax + ebx * 2] - movhps xmm1, qword ptr [eax + edi] - lea eax, [eax + ebx * 4] - movq xmm2, qword ptr [esi] // row1 4 pairs - movhps xmm2, qword ptr [esi + ebx] - movq xmm3, qword ptr [esi + ebx * 2] - movhps xmm3, qword ptr [esi + edi] - lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop esi - pop ebx - ret - } -} - -// Column scaling unfiltered. SSE2 version. -__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push edi - push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 - paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. - - cmp ecx, 0 - jle xloop99 - sub ecx, 4 - jl xloop49 - - // 4 Pixel loop. - xloop4: - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 - - movd xmm1, [esi + eax * 4] // 1 source x2 pixels - movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 // 4 pixels - jge xloop4 - - xloop49: - test ecx, 2 - je xloop29 - - // 2 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 - - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - - xloop29: - test ecx, 1 - je xloop99 - - // 1 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x2 pixels - movd dword ptr [edi], xmm0 - xloop99: - - pop esi - pop edi - ret - } -} - -// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
-// TODO(fbarchard): Port to Neon
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static const uvec8 kShuffleColARGB = {
-    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,       // bbggrraa 1st pixel
-    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-static const uvec8 kShuffleFractions = {
-    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
-                                                 const uint8_t* src_argb,
-                                                 int dst_width,
-                                                 int x,
-                                                 int dx) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
-    movd       xmm2, [esp + 8 + 16]  // x
-    movd       xmm3, [esp + 8 + 20]  // dx
-    movdqa     xmm4, xmmword ptr kShuffleColARGB
-    movdqa     xmm5, xmmword ptr kShuffleFractions
-    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
-    psrlw      xmm6, 9
-    pextrw     eax, xmm2, 1  // get x0 integer. preroll
-    sub        ecx, 2
-    jl         xloop29
-
-    movdqa     xmm0, xmm2  // x1 = x0 + dx
-    paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0  // x0 x1
-    punpckldq  xmm3, xmm3  // dx dx
-    paddd      xmm3, xmm3  // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3  // get x1 integer. preroll
-
-    // 2 Pixel loop.
-  xloop2:
-    movdqa     xmm1, xmm2  // x0, x1 fractions.
-    paddd      xmm2, xmm3  // x += dx
-    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    psrlw      xmm1, 9  // 7 bit fractions.
-    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
-    pshufb     xmm1, xmm5  // 0000000011111111
-    pshufb     xmm0, xmm4  // arrange pixels into pairs
-    pxor       xmm1, xmm6  // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
-    psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
-    movq       qword ptr [edi], xmm0
-    lea        edi, [edi + 8]
-    sub        ecx, 2  // 2 pixels
-    jge        xloop2
-
-  xloop29:
-
-    add        ecx, 2 - 1
-    jl         xloop99
-
-    // 1 pixel remainder
-    psrlw      xmm2, 9  // 7 bit fractions.
-    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    pshufb     xmm2, xmm5  // 00000000
-    pshufb     xmm0, xmm4  // arrange pixels into pairs
-    pxor       xmm2, xmm6  // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
-    psrlw      xmm0, 7
-    packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
-    movd       [edi], xmm0
-
-  xloop99:
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
-                                             const uint8_t* src_argb,
-                                             int dst_width,
-                                             int x,
-                                             int dx) {
-  __asm {
-    mov        edx, [esp + 4]   // dst_argb
-    mov        eax, [esp + 8]   // src_argb
-    mov        ecx, [esp + 12]  // dst_width
-
-  wloop:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpckldq  xmm0, xmm0
-    punpckhdq  xmm1, xmm1
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         wloop
-
-    ret
-  }
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) int FixedDiv_X86(int num, int div) {
-  __asm {
-    mov        eax, [esp + 4]  // num
-    cdq  // extend num to 64 bits
-    shld       edx, eax, 16  // 32.16
-    shl        eax, 16
-    idiv       dword ptr [esp + 8]
-    ret
-  }
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) int FixedDiv1_X86(int num, int div) {
-  __asm {
-    mov        eax, [esp + 4]  // num
-    mov        ecx, [esp + 8]  // denom
-    cdq  // extend num to 64 bits
-    shld       edx, eax, 16  // 32.16
-    shl        eax, 16
-    sub        eax, 0x00010001
-    sbb        edx, 0
-    sub        ecx, 1
-    idiv       ecx
-    ret
-  }
-}
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif