media: pci/tbscapture2: Optimized for X86 SIMD.

CrazyCat
2025-03-19 19:49:50 +02:00
parent 75a92ecbf1
commit a668e81971
28 changed files with 24 additions and 40995 deletions

View File

@@ -1,45 +1,35 @@
#EXTRA_CFLAGS += -I drivers/media/dvb-core
#EXTRA_CFLAGS += -I drivers/media/pci/tbscapture/include
#EXTRA_CFLAGS += -I /lib/modules/`uname -r`/build/include
#EXTRA_CFLAGS += -I /lib/modules/`uname -r`/build/include/linux
ccflags-y += -I$(srctree)/drivers/media/pci/tbscapture2/include
ccflags-y += -I$(srctree)/drivers/media/pci/tbscapture2/include/libyuv
ccflags-y += -I$(srctree)/include/linux
EXTRA_CFLAGS += -mhard-float -msse -msse2
CFLAGS_X86 = -mhard-float -msse -msse2
CFLAGS_compare.o += $(CFLAGS_X86)
CFLAGS_compare_gcc.o += $(CFLAGS_X86)
CFLAGS_rotate_gcc.o += $(CFLAGS_X86)
CFLAGS_row_common.o += $(CFLAGS_X86)
CFLAGS_row_gcc.o += $(CFLAGS_X86)
CFLAGS_scale_gcc.o += $(CFLAGS_X86)
CFLAGS_planar_functions.o += $(CFLAGS_X86)
#CC_FLAGS_FPU := -mhard-float -msse -msse2
obj-$(CONFIG_TBS_PCIE2_CAP) += tbs_pcie2-cap.o
tbs_pcie2-cap-objs += tbs_pcie2.o \
other.o \
compare.o \
compare_common.o \
compare_gcc.o \
compare_msa.o \
compare_neon.o \
compare_neon64.o \
convert_from.o \
convert_from_argb.o \
convert_jpeg.o \
convert_to_argb.o \
compare_win.o \
convert.o \
convert_argb.o \
rotate_argb.o \
rotate_common.o \
rotate_gcc.o \
rotate_lsx.o \
rotate_msa.o \
rotate_neon.o \
rotate_neon64.o \
rotate_sme.o \
rotate_win.o \
row_any.o \
row_common.o \
row_gcc.o \
row_lasx.o \
row_lsx.o \
mjpeg_validate.o \
planar_functions.o \
rotate.o \
@@ -47,23 +37,11 @@ rotate_any.o \
convert_to_i420.o \
cpu_id.o \
mjpeg_decoder.o \
row_msa.o \
row_neon.o \
row_neon64.o \
row_rvv.o \
row_sve.o \
row_win.o \
scale.o \
scale_any.o \
scale_argb.o \
scale_common.o \
scale_gcc.o \
scale_lsx.o \
scale_msa.o \
scale_neon.o \
scale_neon64.o \
scale_rgb.o \
scale_rvv.o \
scale_uv.o \
scale_win.o \
video_common.o
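
The per-object CFLAGS_X86 pattern above is what the commit title refers to: -mhard-float -msse -msse2 are applied only to the libyuv objects that contain x86 SIMD code, the rest of the module keeps the kernel's default no-FP code generation, and the non-x86 SIMD objects (MSA, NEON, LSX/LASX, SVE/SME, RVV and the Visual C variants) are dropped from the link. Compiler flags alone do not make SSE safe at run time: x86 kernel code that touches FPU/SIMD registers has to execute between kernel_fpu_begin() and kernel_fpu_end(). The sketch below is illustrative only; whether tbscapture2 guards its libyuv calls this way is an assumption, and copy_frame_simd() is a hypothetical helper rather than a function from this tree.

#include <linux/types.h>
#include <linux/string.h>
#include <asm/fpu/api.h>

/* Hypothetical wrapper: run an SSE2-compiled routine with FPU state saved. */
static void copy_frame_simd(void *dst, const void *src, size_t len)
{
        kernel_fpu_begin();     /* save task FPU/SIMD state, disable preemption */
        memcpy(dst, src, len);  /* stands in for the SSE2 conversion body */
        kernel_fpu_end();       /* restore FPU/SIMD state */
}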

View File

@@ -1,97 +0,0 @@
/*
* Copyright 2017 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "basic_types.h"
#include "compare_row.h"
#include "row.h"
// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "macros_msa.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
uint32_t HammingDistance_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff = 0u;
int i;
v16u8 src0, src1, src2, src3;
v2i64 vec0 = {0}, vec1 = {0};
for (i = 0; i < count; i += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
src0 ^= src2;
src1 ^= src3;
vec0 += __msa_pcnt_d((v2i64)src0);
vec1 += __msa_pcnt_d((v2i64)src1);
src_a += 32;
src_b += 32;
}
vec0 += vec1;
diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0);
diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2);
return diff;
}
uint32_t SumSquareError_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse = 0u;
int i;
v16u8 src0, src1, src2, src3;
v8i16 vec0, vec1, vec2, vec3;
v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0};
v2i64 tmp0;
for (i = 0; i < count; i += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
reg0 = __msa_dpadd_s_w(reg0, vec0, vec0);
reg1 = __msa_dpadd_s_w(reg1, vec1, vec1);
reg2 = __msa_dpadd_s_w(reg2, vec2, vec2);
reg3 = __msa_dpadd_s_w(reg3, vec3, vec3);
src_a += 32;
src_b += 32;
}
reg0 += reg1;
reg2 += reg3;
reg0 += reg2;
tmp0 = __msa_hadd_s_d(reg0, reg0);
sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0);
sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2);
return sse;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
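
The MSA kernel above computes the Hamming distance as a population count of the XOR of the two byte streams, 32 bytes per iteration, with the final reduction done via __msa_copy_u_w. For reference, a scalar C sketch of the same quantity follows; it is illustrative only, HammingDistance_Scalar is a hypothetical name, and it is not the C fallback that remains in compare_common.o.

#include <stdint.h>

uint32_t HammingDistance_Scalar(const uint8_t* src_a,
                                const uint8_t* src_b,
                                int count) {
  uint32_t diff = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    uint32_t x = (uint32_t)(src_a[i] ^ src_b[i]);
    while (x) {          /* popcount of the XOR byte */
      diff += x & 1u;
      x >>= 1;
    }
  }
  return diff;
}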

View File

@@ -1,96 +0,0 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "basic_types.h"
#include "compare_row.h"
#include "row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
// 256 bits at a time
// uses short accumulator which restricts count to 131 KB
uint32_t HammingDistance_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff;
asm volatile (
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
return diff;
}
uint32_t SumSquareError_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -1,223 +0,0 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "basic_types.h"
#include "compare_row.h"
#include "row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// 256 bits at a time
// uses short accumulator which restricts count to 131 KB
uint32_t HammingDistance_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff;
asm volatile (
"movi v4.8h, #0 \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4");
return diff;
}
uint32_t SumSquareError_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile (
"movi v16.16b, #0 \n"
"movi v17.16b, #0 \n"
"movi v18.16b, #0 \n"
"movi v19.16b, #0 \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"prfm pldl1keep, [%1, 448] \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
static const uvec32 kDjb2Multiplicands[] = {
{0x0c3525e1, // 33^15
0xa3476dc1, // 33^14
0x3b4039a1, // 33^13
0x4f5f0981}, // 33^12
{0x30f35d61, // 33^11
0x855cb541, // 33^10
0x040a9121, // 33^9
0x747c7101}, // 33^8
{0xec41d4e1, // 33^7
0x4cfa3cc1, // 33^6
0x025528a1, // 33^5
0x00121881}, // 33^4
{0x00008c61, // 33^3
0x00000441, // 33^2
0x00000021, // 33^1
0x00000001}, // 33^0
};
static const uvec32 kDjb2WidenIndices[] = {
{0xffffff00U, 0xffffff01U, 0xffffff02U, 0xffffff03U},
{0xffffff04U, 0xffffff05U, 0xffffff06U, 0xffffff07U},
{0xffffff08U, 0xffffff09U, 0xffffff0aU, 0xffffff0bU},
{0xffffff0cU, 0xffffff0dU, 0xffffff0eU, 0xffffff0fU},
};
uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash = seed;
const uint32_t c16 = 0x92d9e201; // 33^16
uint32_t tmp, tmp2;
asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
// count is always a multiple of 16.
// maintain two accumulators, reduce and then final sum in scalar since
// this has better performance on little cores.
"1: \n"
"ldr q0, [%[src]], #16 \n"
"subs %w[count], %w[count], #16 \n"
"tbl v3.16b, {v0.16b}, v19.16b \n"
"tbl v2.16b, {v0.16b}, v18.16b \n"
"tbl v1.16b, {v0.16b}, v17.16b \n"
"tbl v0.16b, {v0.16b}, v16.16b \n"
"mul v3.4s, v3.4s, v7.4s \n"
"mul v2.4s, v2.4s, v6.4s \n"
"mla v3.4s, v1.4s, v5.4s \n"
"mla v2.4s, v0.4s, v4.4s \n"
"addv s1, v3.4s \n"
"addv s0, v2.4s \n"
"fmov %w[tmp2], s1 \n"
"fmov %w[tmp], s0 \n"
"add %w[tmp], %w[tmp], %w[tmp2] \n"
"madd %w[hash], %w[hash], %w[c16], %w[tmp] \n"
"b.gt 1b \n"
: [hash] "+r"(hash), // %[hash]
[count] "+r"(count), // %[count]
[tmp] "=&r"(tmp), // %[tmp]
[tmp2] "=&r"(tmp2) // %[tmp2]
: [src] "r"(src), // %[src]
[kMuls] "r"(kDjb2Multiplicands), // %[kMuls]
[kIdx] "r"(kDjb2WidenIndices), // %[kIdx]
[c16] "r"(c16) // %[c16]
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19");
return hash;
}
uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff;
asm volatile (
"movi v4.4s, #0 \n"
"movi v5.4s, #0 \n"
"movi v6.16b, #1 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"ldp q2, q3, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"udot v4.4s, v0.16b, v6.16b \n"
"udot v5.4s, v1.16b, v6.16b \n"
"b.gt 1b \n"
"add v0.4s, v4.4s, v5.4s \n"
"addv s0, v0.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
return diff;
}
uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
// count is guaranteed to be a multiple of 32.
uint32_t sse;
asm volatile (
"movi v4.4s, #0 \n"
"movi v5.4s, #0 \n"
"1: \n"
"ldp q0, q2, [%0], #32 \n"
"ldp q1, q3, [%1], #32 \n"
"subs %w2, %w2, #32 \n"
"uabd v0.16b, v0.16b, v1.16b \n"
"uabd v1.16b, v2.16b, v3.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"udot v4.4s, v0.16b, v0.16b \n"
"udot v5.4s, v1.16b, v1.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
"add v0.4s, v4.4s, v5.4s \n"
"addv s0, v0.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5");
return sse;
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
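
The kDjb2Multiplicands table holds 33^15 down to 33^0 and c16 is 33^16, so HashDjb2_NEON is an unrolled, 16-bytes-at-a-time form of the classic DJB2 recurrence hash = hash * 33 + byte. A scalar sketch of that recurrence is shown below for reference; it is illustrative only, and HashDjb2_Scalar is a hypothetical name rather than the deleted code or the retained C fallback.

#include <stdint.h>

uint32_t HashDjb2_Scalar(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];  /* the recurrence the 33^n table unrolls */
  }
  return hash;
}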

View File

@@ -1,241 +0,0 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "basic_types.h"
#include "compare_row.h"
#include "row.h"
#if defined(_MSC_VER)
#include <intrin.h> // For __popcnt
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
!defined(__clang__) && defined(_M_IX86)
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff = 0u;
int i;
for (i = 0; i < count - 3; i += 4) {
uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT
src_a += 4;
src_b += 4;
diff += __popcnt(x);
}
return diff;
}
__declspec(naked) uint32_t
SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
pxor xmm0, xmm0
pxor xmm5, xmm5
wloop:
movdqu xmm1, [eax]
lea eax, [eax + 16]
movdqu xmm2, [edx]
lea edx, [edx + 16]
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
por xmm1, xmm2
movdqa xmm2, xmm1
punpcklbw xmm1, xmm5
punpckhbw xmm2, xmm5
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
paddd xmm0, xmm1
pshufd xmm1, xmm0, 0x01
paddd xmm0, xmm1
movd eax, xmm0
ret
}
}
#ifdef HAS_SUMSQUAREERROR_AVX2
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
__declspec(naked) uint32_t
SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
vpxor ymm0, ymm0, ymm0 // sum
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
vpunpckhbw ymm1, ymm1, ymm5
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpermq ymm1, ymm0, 0x02 // high + low lane.
vpaddd ymm0, ymm0, ymm1
vmovd eax, xmm0
vzeroupper
ret
}
}
#endif // HAS_SUMSQUAREERROR_AVX2
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
__declspec(naked) uint32_t
HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, xmmword ptr kHash16x33
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, xmmword ptr kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld xmm3, xmm5
movdqa xmm5, xmmword ptr kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld xmm4, xmm5
movdqa xmm5, xmmword ptr kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld xmm2, xmm5
movdqa xmm5, xmmword ptr kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
ret
}
}
// Visual C 2012 required for AVX2.
#ifdef HAS_HASHDJB2_AVX2
__declspec(naked) uint32_t
HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
vmovd xmm0, [esp + 12] // seed
wloop:
vpmovzxbd xmm3, [eax] // src[0-3]
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
vpmovzxbd xmm4, [eax + 4] // src[4-7]
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
vpmovzxbd xmm2, [eax + 8] // src[8-11]
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
vpmovzxbd xmm1, [eax + 12] // src[12-15]
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
lea eax, [eax + 16]
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
vpaddd xmm1, xmm1, xmm2
vpshufd xmm2, xmm1, 0x01
vpaddd xmm1, xmm1, xmm2
vpaddd xmm0, xmm0, xmm1
sub ecx, 16
jg wloop
vmovd eax, xmm0 // return hash
vzeroupper
ret
}
}
#endif // HAS_HASHDJB2_AVX2
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
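
Both Visual C loops above form |a - b| for unsigned bytes with two saturating subtracts ORed together, widen to 16 bits against a zero register, then square and horizontally add with pmaddwd/vpmaddwd into 32-bit accumulators; the value being accumulated is the plain sum of squared differences. A scalar sketch of that quantity follows; it is illustrative only, and SumSquareError_Scalar is a hypothetical name, not the retained C fallback.

#include <stdint.h>

uint32_t SumSquareError_Scalar(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint32_t sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    int d = (int)src_a[i] - (int)src_b[i];
    sse += (uint32_t)(d * d);
  }
  return sse;
}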

View File

@@ -1,244 +0,0 @@
/*
* Copyright 2016 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
#define INCLUDE_LIBYUV_MACROS_MSA_H_
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include <msa.h>
#include <stdint.h>
#if (__mips_isa_rev >= 6)
#define LW(psrc) \
({ \
const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
uint32_t val_m; \
asm("lw %[val_m], %[psrc_lw_m] \n" \
: [val_m] "=r"(val_m) \
: [psrc_lw_m] "m"(*psrc_lw_m)); \
val_m; \
})
#if (__mips == 64)
#define LD(psrc) \
({ \
const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
uint64_t val_m = 0; \
asm("ld %[val_m], %[psrc_ld_m] \n" \
: [val_m] "=r"(val_m) \
: [psrc_ld_m] "m"(*psrc_ld_m)); \
val_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
({ \
const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
uint32_t val0_m, val1_m; \
uint64_t val_m = 0; \
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
val_m = (uint64_t)(val1_m); /* NOLINT */ \
val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \
val_m; \
})
#endif // (__mips == 64)
#define SW(val, pdst) \
({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \
asm("sw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \
})
#if (__mips == 64)
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint64_t val_m = (val); \
asm("sd %[val_m], %[pdst_sd_m] \n" \
: [pdst_sd_m] "=m"(*pdst_sd_m) \
: [val_m] "r"(val_m)); \
})
#else // !(__mips == 64)
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val0_m, val1_m; \
val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
SW(val0_m, pdst_sd_m); \
SW(val1_m, pdst_sd_m + 4); \
})
#endif // !(__mips == 64)
#else // !(__mips_isa_rev >= 6)
#define LW(psrc) \
({ \
uint8_t* psrc_lw_m = (uint8_t*)(psrc); \
uint32_t val_lw_m; \
\
asm("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
"lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
\
: [val_lw_m] "=&r"(val_lw_m) \
: [psrc_lw_m] "r"(psrc_lw_m)); \
\
val_lw_m; \
})
#if (__mips == 64)
#define LD(psrc) \
({ \
uint8_t* psrc_ld_m = (uint8_t*)(psrc); \
uint64_t val_ld_m = 0; \
\
asm("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
"ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
\
: [val_ld_m] "=&r"(val_ld_m) \
: [psrc_ld_m] "r"(psrc_ld_m)); \
\
val_ld_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
({ \
const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
uint32_t val0_m, val1_m; \
uint64_t val_m = 0; \
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
val_m = (uint64_t)(val1_m); /* NOLINT */ \
val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \
val_m; \
})
#endif // (__mips == 64)
#define SW(val, pdst) \
({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \
asm("usw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \
})
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val0_m, val1_m; \
val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
SW(val0_m, pdst_sd_m); \
SW(val1_m, pdst_sd_m + 4); \
})
#endif // (__mips_isa_rev >= 6)
// TODO(fbarchard): Consider removing __VAR_ARGS versions.
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
/* Description : Load two vectors with 16 'byte' sized elements
Arguments : Inputs - psrc, stride
Outputs - out0, out1
Return Type - as per RTYPE
Details : Load 16 byte elements in 'out0' from (psrc)
Load 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
{ \
out0 = LD_B(RTYPE, (psrc)); \
out1 = LD_B(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__)
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
LD_B2(RTYPE, (psrc), stride, out0, out1); \
LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__)
/* Description : Store two vectors with stride each having 16 'byte' sized
elements
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 16 byte elements from 'in0' to (pdst)
Store 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
{ \
ST_B(RTYPE, in0, (pdst)); \
ST_B(RTYPE, in1, (pdst) + stride); \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
ST_B2(RTYPE, in0, in1, (pdst), stride); \
ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
/* Description : Store vectors of 8 halfword elements with stride
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 8 halfword elements from 'in0' to (pdst)
Store 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
{ \
ST_H(RTYPE, in0, (pdst)); \
ST_H(RTYPE, in1, (pdst) + stride); \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
/* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Byte elements from 'in0' & 'in1' are copied selectively to
'out0' as per control vector 'mask0'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
Arguments : Inputs - in0, in1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Right half of byte elements from 'in0' and 'in1' are
interleaved and written to 'out0'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
{ \
out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
#endif // INCLUDE_LIBYUV_MACROS_MSA_H_

View File

@@ -1,15 +1,15 @@
#include <linux/pci.h>
#include "tbs_pcie-reg.h"
#include "tbs_pcie.h"
void *malloc(size_t __size);
void *malloc(size_t __size)
{
        return kzalloc(__size, GFP_KERNEL);
}
void free(void *__ptr);
void free(void *__ptr)
{
        if (__ptr)
                kfree(__ptr);
}
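
These shims exist so the imported libyuv sources can keep calling malloc()/free() inside the kernel: malloc() maps to kzalloc(), which returns zero-filled memory, and GFP_KERNEL allocations may sleep, so callers must run in process context. The snippet below is an illustrative caller only; the helper name and call site are assumptions, not code from this driver.

#include <linux/types.h>
#include <linux/errno.h>

/* Hypothetical caller: GFP_KERNEL may sleep, so no spinlocks held and not
 * in interrupt context. */
static int alloc_scratch_row(void **row, size_t bytes)
{
        *row = malloc(bytes);   /* zero-filled via kzalloc(..., GFP_KERNEL) */
        if (!*row)
                return -ENOMEM;
        return 0;
}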

View File

@@ -1,233 +0,0 @@
/*
* Copyright 2022 The LibYuv Project Authors. All rights reserved.
*
* Copyright (c) 2022 Loongson Technology Corporation Limited
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "rotate_row.h"
#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
#include "loongson_intrinsics.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \
DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \
}
#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \
DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \
}
#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \
DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \
}
#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \
DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \
}
#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \
_stride3, _stride4) \
{ \
__lsx_vst(_dst0, _dst, 0); \
__lsx_vstx(_dst1, _dst, _stride); \
__lsx_vstx(_dst2, _dst, _stride2); \
__lsx_vstx(_dst3, _dst, _stride3); \
_dst += _stride4; \
}
#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \
{ \
__lsx_vst(_dst0, _dst, 0); \
__lsx_vstx(_dst1, _dst, _stride); \
_dst += _stride2; \
}
void TransposeUVWx16_C(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
int dst_stride_a,
uint8_t* dst_b,
int dst_stride_b,
int width) {
TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width);
TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
dst_stride_a, (dst_b + 8), dst_stride_b, width);
}
void TransposeWx16_LSX(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
int x;
int len = width / 16;
uint8_t* s;
int src_stride2 = src_stride << 1;
int src_stride3 = src_stride + src_stride2;
int src_stride4 = src_stride2 << 1;
int dst_stride2 = dst_stride << 1;
int dst_stride3 = dst_stride + dst_stride2;
int dst_stride4 = dst_stride2 << 1;
__m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
__m128i tmp0, tmp1, tmp2, tmp3;
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
__m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
for (x = 0; x < len; x++) {
s = (uint8_t*)src;
src0 = __lsx_vld(s, 0);
src1 = __lsx_vldx(s, src_stride);
src2 = __lsx_vldx(s, src_stride2);
src3 = __lsx_vldx(s, src_stride3);
s += src_stride4;
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
src0 = __lsx_vld(s, 0);
src1 = __lsx_vldx(s, src_stride);
src2 = __lsx_vldx(s, src_stride2);
src3 = __lsx_vldx(s, src_stride3);
s += src_stride4;
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
src0 = __lsx_vld(s, 0);
src1 = __lsx_vldx(s, src_stride);
src2 = __lsx_vldx(s, src_stride2);
src3 = __lsx_vldx(s, src_stride3);
s += src_stride4;
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
src0 = __lsx_vld(s, 0);
src1 = __lsx_vldx(s, src_stride);
src2 = __lsx_vldx(s, src_stride2);
src3 = __lsx_vldx(s, src_stride3);
s += src_stride4;
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
res8 = __lsx_vilvl_w(reg4, reg0);
res9 = __lsx_vilvh_w(reg4, reg0);
ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
dst_stride4);
res8 = __lsx_vilvl_w(reg5, reg1);
res9 = __lsx_vilvh_w(reg5, reg1);
ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
dst_stride4);
res8 = __lsx_vilvl_w(reg6, reg2);
res9 = __lsx_vilvh_w(reg6, reg2);
ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
dst_stride4);
res8 = __lsx_vilvl_w(reg7, reg3);
res9 = __lsx_vilvh_w(reg7, reg3);
ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
dst_stride4);
src += 16;
}
}
void TransposeUVWx16_LSX(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
int dst_stride_a,
uint8_t* dst_b,
int dst_stride_b,
int width) {
int x;
int len = width / 8;
uint8_t* s;
int src_stride2 = src_stride << 1;
int src_stride3 = src_stride + src_stride2;
int src_stride4 = src_stride2 << 1;
int dst_stride_a2 = dst_stride_a << 1;
int dst_stride_b2 = dst_stride_b << 1;
__m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
__m128i tmp0, tmp1, tmp2, tmp3;
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
__m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
for (x = 0; x < len; x++) {
s = (uint8_t*)src;
src0 = __lsx_vld(s, 0);
src1 = __lsx_vldx(s, src_stride);
src2 = __lsx_vldx(s, src_stride2);
src3 = __lsx_vldx(s, src_stride3);
s += src_stride4;
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
src0 = __lsx_vld(s, 0);
src1 = __lsx_vldx(s, src_stride);
src2 = __lsx_vldx(s, src_stride2);
src3 = __lsx_vldx(s, src_stride3);
s += src_stride4;
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
src0 = __lsx_vld(s, 0);
src1 = __lsx_vldx(s, src_stride);
src2 = __lsx_vldx(s, src_stride2);
src3 = __lsx_vldx(s, src_stride3);
s += src_stride4;
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
src0 = __lsx_vld(s, 0);
src1 = __lsx_vldx(s, src_stride);
src2 = __lsx_vldx(s, src_stride2);
src3 = __lsx_vldx(s, src_stride3);
s += src_stride4;
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
res8 = __lsx_vilvl_w(reg4, reg0);
res9 = __lsx_vilvh_w(reg4, reg0);
ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
res8 = __lsx_vilvl_w(reg5, reg1);
res9 = __lsx_vilvh_w(reg5, reg1);
ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
res8 = __lsx_vilvl_w(reg6, reg2);
res9 = __lsx_vilvh_w(reg6, reg2);
ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
res8 = __lsx_vilvl_w(reg7, reg3);
res9 = __lsx_vilvh_w(reg7, reg3);
ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
src += 16;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
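
TransposeWx16_LSX above, like the MSA and NEON rotate kernels in the following files, implements a byte-tile transpose: source columns become destination rows, built from interleave steps (vilvl/vilvh here, ilvr/ilvl or trn1/trn2 elsewhere) instead of scalar element moves. The scalar meaning of one 8-row step is sketched below for reference; this is illustrative only and is not the generic C fallback the build keeps in rotate_common.o alongside the x86 paths in rotate_gcc.o.

#include <stdint.h>

/* Transpose a width x 8 byte tile: element (row j, col i) -> (row i, col j). */
void TransposeWx8_Scalar(const uint8_t* src, int src_stride,
                         uint8_t* dst, int dst_stride, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}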

View File

@@ -1,240 +0,0 @@
/*
* Copyright 2016 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "rotate_row.h"
// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "macros_msa.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \
out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \
out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \
out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \
}
#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \
out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \
out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \
out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \
}
#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \
out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \
out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \
out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \
}
#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \
out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \
out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \
out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
}
void TransposeUVWx16_C(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
int dst_stride_a,
uint8_t* dst_b,
int dst_stride_b,
int width) {
TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width);
TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
dst_stride_a, (dst_b + 8), dst_stride_b, width);
}
void TransposeWx16_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
int x;
const uint8_t* s;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
for (x = 0; x < width; x += 16) {
s = src;
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
dst += dst_stride * 4;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
dst += dst_stride * 4;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
dst += dst_stride * 4;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
src += 16;
dst += dst_stride * 4;
}
}
void TransposeUVWx16_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
int dst_stride_a,
uint8_t* dst_b,
int dst_stride_b,
int width) {
int x;
const uint8_t* s;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
for (x = 0; x < width; x += 8) {
s = src;
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
ST_UB2(dst0, dst2, dst_a, dst_stride_a);
ST_UB2(dst1, dst3, dst_b, dst_stride_b);
dst_a += dst_stride_a * 2;
dst_b += dst_stride_b * 2;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
ST_UB2(dst0, dst2, dst_a, dst_stride_a);
ST_UB2(dst1, dst3, dst_b, dst_stride_b);
dst_a += dst_stride_a * 2;
dst_b += dst_stride_b * 2;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
ST_UB2(dst0, dst2, dst_a, dst_stride_a);
ST_UB2(dst1, dst3, dst_b, dst_stride_b);
dst_a += dst_stride_a * 2;
dst_b += dst_stride_b * 2;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
ST_UB2(dst0, dst2, dst_a, dst_stride_a);
ST_UB2(dst1, dst3, dst_b, dst_stride_b);
src += 16;
dst_a += dst_stride_a * 2;
dst_b += dst_stride_b * 2;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

View File

@@ -1,219 +0,0 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "rotate_row.h"
#include "row.h"
#include "basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
void TransposeWx8_NEON(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
const uint8_t* temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %[width], #8 \n"
"1: \n"
"mov %[temp], %[src] \n"
"vld1.8 {d0}, [%[temp]], %[src_stride] \n"
"vld1.8 {d1}, [%[temp]], %[src_stride] \n"
"vld1.8 {d2}, [%[temp]], %[src_stride] \n"
"vld1.8 {d3}, [%[temp]], %[src_stride] \n"
"vld1.8 {d4}, [%[temp]], %[src_stride] \n"
"vld1.8 {d5}, [%[temp]], %[src_stride] \n"
"vld1.8 {d6}, [%[temp]], %[src_stride] \n"
"vld1.8 {d7}, [%[temp]] \n"
"add %[src], #8 \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
"vtrn.8 d5, d4 \n"
"vtrn.8 d7, d6 \n"
"subs %[width], #8 \n"
"vtrn.16 d1, d3 \n"
"vtrn.16 d0, d2 \n"
"vtrn.16 d5, d7 \n"
"vtrn.16 d4, d6 \n"
"vtrn.32 d1, d5 \n"
"vtrn.32 d0, d4 \n"
"vtrn.32 d3, d7 \n"
"vtrn.32 d2, d6 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"mov %[temp], %[dst] \n"
"vst1.8 {d1}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d0}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d3}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d2}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d5}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d4}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d7}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d6}, [%[temp]] \n"
"add %[dst], %[dst], %[dst_stride], lsl #3 \n"
"bge 1b \n"
: [temp] "=&r"(temp), // %[temp]
[src] "+r"(src), // %[src]
[dst] "+r"(dst), // %[dst]
[width] "+r"(width) // %[width]
: [src_stride] "r"(src_stride), // %[src_stride]
[dst_stride] "r"(dst_stride) // %[dst_stride]
: "memory", "cc", "q0", "q1", "q2", "q3");
}
void TransposeUVWx8_NEON(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
int dst_stride_a,
uint8_t* dst_b,
int dst_stride_b,
int width) {
const uint8_t* temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %[width], #8 \n"
"1: \n"
"mov %[temp], %[src] \n"
"vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n"
"vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n"
"vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n"
"vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n"
"vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n"
"vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n"
"vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n"
"vld2.8 {d22, d23}, [%[temp]] \n"
"add %[src], #8*2 \n"
"vtrn.8 q1, q0 \n"
"vtrn.8 q3, q2 \n"
"vtrn.8 q9, q8 \n"
"vtrn.8 q11, q10 \n"
"subs %[width], #8 \n"
"vtrn.16 q1, q3 \n"
"vtrn.16 q0, q2 \n"
"vtrn.16 q9, q11 \n"
"vtrn.16 q8, q10 \n"
"vtrn.32 q1, q9 \n"
"vtrn.32 q0, q8 \n"
"vtrn.32 q3, q11 \n"
"vtrn.32 q2, q10 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"vrev16.8 q8, q8 \n"
"vrev16.8 q9, q9 \n"
"vrev16.8 q10, q10 \n"
"vrev16.8 q11, q11 \n"
"mov %[temp], %[dst_a] \n"
"vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d20}, [%[temp]] \n"
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
"mov %[temp], %[dst_b] \n"
"vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d21}, [%[temp]] \n"
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
"bge 1b \n"
: [temp] "=&r"(temp), // %[temp]
[src] "+r"(src), // %[src]
[dst_a] "+r"(dst_a), // %[dst_a]
[dst_b] "+r"(dst_b), // %[dst_b]
[width] "+r"(width) // %[width]
: [src_stride] "r"(src_stride), // %[src_stride]
[dst_stride_a] "r"(dst_stride_a), // %[dst_stride_a]
[dst_stride_b] "r"(dst_stride_b) // %[dst_stride_b]
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
// Transpose 32 bit values (ARGB)
void Transpose4x4_32_NEON(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
const uint8_t* src1 = src + src_stride;
const uint8_t* src2 = src1 + src_stride;
const uint8_t* src3 = src2 + src_stride;
uint8_t* dst1 = dst + dst_stride;
uint8_t* dst2 = dst1 + dst_stride;
uint8_t* dst3 = dst2 + dst_stride;
asm volatile (
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
"vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n"
"vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n"
"vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n"
"subs %8, %8, #4 \n" // w -= 4
"vst1.8 {q0}, [%4]! \n"
"vst1.8 {q1}, [%5]! \n"
"vst1.8 {q2}, [%6]! \n"
"vst1.8 {q3}, [%7]! \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(dst1), // %5
"+r"(dst2), // %6
"+r"(dst3), // %7
"+r"(width) // %8
: "r"((ptrdiff_t)(src_stride * 4)) // %9
: "memory", "cc", "q0", "q1", "q2", "q3");
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -1,273 +0,0 @@
/*
* Copyright 2014 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "rotate_row.h"
#include "row.h"
#include "basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
void TransposeWx16_NEON(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
const uint8_t* src_temp;
asm volatile (
"1: \n"
"mov %[src_temp], %[src] \n"
"ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n"
"add %[src], %[src], #16 \n"
// Transpose bytes within each 2x2 block.
"trn1 v0.16b, v16.16b, v17.16b \n"
"trn2 v1.16b, v16.16b, v17.16b \n"
"trn1 v2.16b, v18.16b, v19.16b \n"
"trn2 v3.16b, v18.16b, v19.16b \n"
"trn1 v4.16b, v20.16b, v21.16b \n"
"trn2 v5.16b, v20.16b, v21.16b \n"
"trn1 v6.16b, v22.16b, v23.16b \n"
"trn2 v7.16b, v22.16b, v23.16b \n"
"trn1 v8.16b, v24.16b, v25.16b \n"
"trn2 v9.16b, v24.16b, v25.16b \n"
"trn1 v10.16b, v26.16b, v27.16b \n"
"trn2 v11.16b, v26.16b, v27.16b \n"
"trn1 v12.16b, v28.16b, v29.16b \n"
"trn2 v13.16b, v28.16b, v29.16b \n"
"trn1 v14.16b, v30.16b, v31.16b \n"
"trn2 v15.16b, v30.16b, v31.16b \n"
// Transpose 2x2-byte blocks within each 4x4 block.
"trn1 v16.8h, v0.8h, v2.8h \n"
"trn1 v17.8h, v1.8h, v3.8h \n"
"trn2 v18.8h, v0.8h, v2.8h \n"
"trn2 v19.8h, v1.8h, v3.8h \n"
"trn1 v20.8h, v4.8h, v6.8h \n"
"trn1 v21.8h, v5.8h, v7.8h \n"
"trn2 v22.8h, v4.8h, v6.8h \n"
"trn2 v23.8h, v5.8h, v7.8h \n"
"trn1 v24.8h, v8.8h, v10.8h \n"
"trn1 v25.8h, v9.8h, v11.8h \n"
"trn2 v26.8h, v8.8h, v10.8h \n"
"trn2 v27.8h, v9.8h, v11.8h \n"
"trn1 v28.8h, v12.8h, v14.8h \n"
"trn1 v29.8h, v13.8h, v15.8h \n"
"trn2 v30.8h, v12.8h, v14.8h \n"
"trn2 v31.8h, v13.8h, v15.8h \n"
"subs %w[width], %w[width], #16 \n"
// Transpose 4x4-byte blocks within each 8x8 block.
"trn1 v0.4s, v16.4s, v20.4s \n"
"trn1 v2.4s, v17.4s, v21.4s \n"
"trn1 v4.4s, v18.4s, v22.4s \n"
"trn1 v6.4s, v19.4s, v23.4s \n"
"trn2 v8.4s, v16.4s, v20.4s \n"
"trn2 v10.4s, v17.4s, v21.4s \n"
"trn2 v12.4s, v18.4s, v22.4s \n"
"trn2 v14.4s, v19.4s, v23.4s \n"
"trn1 v1.4s, v24.4s, v28.4s \n"
"trn1 v3.4s, v25.4s, v29.4s \n"
"trn1 v5.4s, v26.4s, v30.4s \n"
"trn1 v7.4s, v27.4s, v31.4s \n"
"trn2 v9.4s, v24.4s, v28.4s \n"
"trn2 v11.4s, v25.4s, v29.4s \n"
"trn2 v13.4s, v26.4s, v30.4s \n"
"trn2 v15.4s, v27.4s, v31.4s \n"
// Transpose 8x8-byte blocks and store.
"st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n"
"b.gt 1b \n"
: [src] "+r"(src), // %[src]
[src_temp] "=&r"(src_temp), // %[src_temp]
[dst] "+r"(dst), // %[dst]
[width] "+r"(width) // %[width]
: [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride]
[dst_stride] "r"((ptrdiff_t)dst_stride) // %[dst_stride]
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
"v29", "v30", "v31");
}
void TransposeUVWx8_NEON(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
int dst_stride_a,
uint8_t* dst_b,
int dst_stride_b,
int width) {
const uint8_t* temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w[width], %w[width], #8 \n"
"1: \n"
"mov %[temp], %[src] \n"
"ld1 {v0.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v1.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v2.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v3.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v4.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v5.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v6.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v7.16b}, [%[temp]] \n"
"add %[src], %[src], #16 \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"subs %w[width], %w[width], #8 \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"mov %[temp], %[dst_a] \n"
"st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n"
"st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n"
"st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n"
"st1 {v19.d}[1], [%[temp]] \n"
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
"mov %[temp], %[dst_b] \n"
"st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n"
"st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n"
"st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n"
"st1 {v23.d}[1], [%[temp]] \n"
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
"b.ge 1b \n"
: [temp] "=&r"(temp), // %[temp]
[src] "+r"(src), // %[src]
[dst_a] "+r"(dst_a), // %[dst_a]
[dst_b] "+r"(dst_b), // %[dst_b]
[width] "+r"(width) // %[width]
: [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride]
[dst_stride_a] "r"((ptrdiff_t)dst_stride_a), // %[dst_stride_a]
[dst_stride_b] "r"((ptrdiff_t)dst_stride_b) // %[dst_stride_b]
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
}
// Transpose 32 bit values (ARGB)
void Transpose4x4_32_NEON(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
const uint8_t* src1 = src + src_stride;
const uint8_t* src2 = src1 + src_stride;
const uint8_t* src3 = src2 + src_stride;
uint8_t* dst1 = dst + dst_stride;
uint8_t* dst2 = dst1 + dst_stride;
uint8_t* dst3 = dst2 + dst_stride;
asm volatile (
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
"ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
"ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"
"ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n"
"subs %w8, %w8, #4 \n" // w -= 4
"st1 {v0.4s}, [%4], 16 \n"
"st1 {v1.4s}, [%5], 16 \n"
"st1 {v2.4s}, [%6], 16 \n"
"st1 {v3.4s}, [%7], 16 \n"
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(dst1), // %5
"+r"(dst2), // %6
"+r"(dst3), // %7
"+r"(width) // %8
: "r"((ptrdiff_t)(src_stride * 4)) // %9
: "memory", "cc", "v0", "v1", "v2", "v3");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -1,174 +0,0 @@
/*
* Copyright 2024 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "rotate_row.h"
#include "row.h"
#include "basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__)
__arm_locally_streaming __arm_new("za") void TransposeWxH_SME(
const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width,
int height) {
int vl;
asm("cntb %x0" : "=r"(vl));
do {
const uint8_t* src2 = src;
uint8_t* dst2 = dst;
// Process up to VL rows per iteration of the outer loop.
int block_height = height > vl ? vl : height;
int width2 = width;
do {
const uint8_t* src3 = src2;
// Process up to VL elements per iteration of the inner loop.
int block_width = width2 > vl ? vl : width2;
asm volatile(
"mov w12, #0 \n"
// Create a predicate to handle loading partial rows.
"whilelt p0.b, wzr, %w[block_width] \n"
// Load H <= VL rows into ZA0.
"1: \n"
"ld1b {za0h.b[w12, 0]}, p0/z, [%[src3]] \n"
"add %[src3], %[src3], %[src_stride] \n"
"add w12, w12, #1 \n"
"cmp w12, %w[block_height] \n"
"b.ne 1b \n"
// Create a predicate to handle storing partial columns.
"whilelt p0.b, wzr, %w[block_height] \n"
"mov w12, #0 \n"
// Store W <= VL columns from ZA0.
"2: \n"
"st1b {za0v.b[w12, 0]}, p0, [%[dst2]] \n"
"add %[dst2], %[dst2], %[dst_stride] \n"
"add w12, w12, #1 \n"
"cmp w12, %w[block_width] \n"
"b.ne 2b \n"
: [src3] "+r"(src3), // %[src3]
[dst2] "+r"(dst2) // %[dst2]
: [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride]
[dst_stride] "r"((ptrdiff_t)dst_stride), // %[dst_stride]
[block_width] "r"(block_width), // %[block_width]
[block_height] "r"(block_height) // %[block_height]
: "cc", "memory", "p0", "w12", "za");
src2 += vl;
width2 -= vl;
} while (width2 > 0);
src += vl * src_stride;
dst += vl;
height -= vl;
} while (height > 0);
}
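// Illustrative scalar sketch (not part of the original file): the overall
// effect of TransposeWxH_SME above, with the ZA-tile blocking stripped away.
// Each source byte at (row y, column x) lands at (row x, column y) of the
// destination; strides are in bytes. Hypothetical reference only.
static void TransposeWxH_Ref(const uint8_t* src,
                             int src_stride,
                             uint8_t* dst,
                             int dst_stride,
                             int width,
                             int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}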
__arm_locally_streaming __arm_new("za") void TransposeUVWxH_SME(
const uint8_t* src,
int src_stride,
uint8_t* dst_a,
int dst_stride_a,
uint8_t* dst_b,
int dst_stride_b,
int width,
int height) {
int vl;
asm("cnth %x0" : "=r"(vl));
do {
const uint8_t* src2 = src;
uint8_t* dst2_a = dst_a;
uint8_t* dst2_b = dst_b;
// Process up to VL bytes per iteration of the outer loop.
int block_height = height > vl * 2 ? vl * 2 : height;
int width2 = width;
do {
const uint8_t* src3 = src2;
// Process up to VL 16-bit elements per iteration of the inner loop.
int block_width = width2 > vl ? vl : width2;
asm volatile(
"mov w12, #0 \n"
// Create a predicate to handle loading partial rows,
// %[block_width] is always a multiple of two here.
"whilelt p0.b, wzr, %w[block_width] \n"
// Load H <= VL rows into ZA0, such that U/V components exist in
// alternating columns.
"1: \n"
"ld1b {za0h.b[w12, 0]}, p0/z, [%[src]] \n"
"add %[src], %[src], %[src_stride] \n"
"add w12, w12, #1 \n"
"cmp w12, %w[block_height] \n"
"b.ne 1b \n"
// Create a predicate to handle storing partial columns.
"whilelt p0.b, wzr, %w[block_height] \n"
"mov w12, #0 \n"
// Store alternating UV data from pairs of ZA0 columns.
"2: \n"
"st1b {za0v.b[w12, 0]}, p0, [%[dst_a]] \n"
"st1b {za0v.b[w12, 1]}, p0, [%[dst_b]] \n"
"add %[dst_a], %[dst_a], %[dst_stride_a] \n"
"add %[dst_b], %[dst_b], %[dst_stride_b] \n"
"add w12, w12, #2 \n"
"cmp w12, %w[block_width] \n"
"b.ne 2b \n"
: [src] "+r"(src3), // %[src]
[dst_a] "+r"(dst2_a), // %[dst_a]
[dst_b] "+r"(dst2_b) // %[dst_b]
: [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride]
[dst_stride_a] "r"((ptrdiff_t)dst_stride_a), // %[dst_stride_a]
[dst_stride_b] "r"((ptrdiff_t)dst_stride_b), // %[dst_stride_b]
[block_width] "r"(block_width * 2), // %[block_width]
[block_height] "r"(block_height) // %[block_height]
: "cc", "memory", "p0", "w12", "za");
src2 += 2 * vl;
width2 -= vl;
} while (width2 > 0);
src += 2 * vl * src_stride;
dst_a += 2 * vl;
dst_b += 2 * vl;
height -= 2 * vl;
} while (height > 0);
}
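// Illustrative scalar sketch (not part of the original file): the effect of
// TransposeUVWxH_SME above. The source rows hold interleaved U/V byte pairs;
// the U of pair (x, y) is transposed into dst_a and the V into dst_b.
// Strides are in bytes and width counts UV pairs. Hypothetical reference only.
static void TransposeUVWxH_Ref(const uint8_t* src,
                               int src_stride,
                               uint8_t* dst_a,
                               int dst_stride_a,
                               uint8_t* dst_b,
                               int dst_stride_b,
                               int width,
                               int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      dst_a[x * dst_stride_a + y] = src[y * src_stride + 2 * x];
      dst_b[x * dst_stride_b + y] = src[y * src_stride + 2 * x + 1];
    }
  }
}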
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -1,253 +0,0 @@
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "rotate_row.h"
#include "row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
!defined(__clang__) && defined(_M_IX86)
__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
// Read in the data from the source pointer.
// First round of bit swap.
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm0, xmm1
movq xmm2, qword ptr [eax]
movdqa xmm1, xmm0
palignr xmm1, xmm1, 8
movq xmm3, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm2, xmm3
movdqa xmm3, xmm2
movq xmm4, qword ptr [eax]
palignr xmm3, xmm3, 8
movq xmm5, qword ptr [eax + edi]
punpcklbw xmm4, xmm5
lea eax, [eax + 2 * edi]
movdqa xmm5, xmm4
movq xmm6, qword ptr [eax]
palignr xmm5, xmm5, 8
movq xmm7, qword ptr [eax + edi]
punpcklbw xmm6, xmm7
mov eax, ebp
movdqa xmm7, xmm6
palignr xmm7, xmm7, 8
// Second round of bit swap.
punpcklwd xmm0, xmm2
punpcklwd xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm3, xmm1
palignr xmm2, xmm2, 8
palignr xmm3, xmm3, 8
punpcklwd xmm4, xmm6
punpcklwd xmm5, xmm7
movdqa xmm6, xmm4
movdqa xmm7, xmm5
palignr xmm6, xmm6, 8
palignr xmm7, xmm7, 8
// Third round of bit swap.
// Write to the destination pointer.
punpckldq xmm0, xmm4
movq qword ptr [edx], xmm0
movdqa xmm4, xmm0
palignr xmm4, xmm4, 8
movq qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
punpckldq xmm2, xmm6
movdqa xmm6, xmm2
palignr xmm6, xmm6, 8
movq qword ptr [edx], xmm2
punpckldq xmm1, xmm5
movq qword ptr [edx + esi], xmm6
lea edx, [edx + 2 * esi]
movdqa xmm5, xmm1
movq qword ptr [edx], xmm1
palignr xmm5, xmm5, 8
punpckldq xmm3, xmm7
movq qword ptr [edx + esi], xmm5
lea edx, [edx + 2 * esi]
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
jg convertloop
pop ebp
pop esi
pop edi
ret
}
}
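// Illustrative scalar sketch (not part of the original file): the result the
// SSSE3 routine above produces. For each of `width` source columns, the eight
// bytes of that column (one per source row) become one eight-byte destination
// row. Hypothetical reference only; strides are in bytes.
static void TransposeWx8_Ref(const uint8_t* src,
                             int src_stride,
                             uint8_t* dst,
                             int dst_stride,
                             int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}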
__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
int dst_stride_a,
uint8_t* dst_b,
int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
mov ebp, [esp + 16 + 24] // dst_stride_b
mov ecx, esp
sub esp, 4 + 16
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
align 4
// Read in the data from the source pointer.
// First round of bit swap.
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqu xmm2, [eax]
movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqu xmm4, [eax]
movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqu xmm6, [eax]
movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
movdqa xmm2, xmm5
movdqa xmm5, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm5, xmm3
movdqa xmm3, xmm5
movdqa xmm5, xmm4
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
punpckhdq xmm0, xmm6
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
punpckhdq xmm0, xmm5
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
jg convertloop
mov esp, [esp + 16]
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

File diffs suppressed for eight files because they are too large.

@@ -1,739 +0,0 @@
/*
* Copyright 2022 The LibYuv Project Authors. All rights reserved.
*
* Copyright (c) 2022 Loongson Technology Corporation Limited
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "scale_row.h"
#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
#include "loongson_intrinsics.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define LOAD_DATA(_src, _in, _out) \
{ \
int _tmp1, _tmp2, _tmp3, _tmp4; \
DUP4_ARG2(__lsx_vpickve2gr_w, _in, 0, _in, 1, _in, 2, _in, 3, _tmp1, \
_tmp2, _tmp3, _tmp4); \
_out = __lsx_vinsgr2vr_w(_out, _src[_tmp1], 0); \
_out = __lsx_vinsgr2vr_w(_out, _src[_tmp2], 1); \
_out = __lsx_vinsgr2vr_w(_out, _src[_tmp3], 2); \
_out = __lsx_vinsgr2vr_w(_out, _src[_tmp4], 3); \
}
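// Illustrative note (not part of the original file): LOAD_DATA is a 4-element
// gather. In scalar terms, for i in 0..3 it performs
//   out.w[i] = _src[in.w[i]];
// the four 32-bit lanes of the index vector select elements of _src (bytes in
// the luma paths below, whole 32-bit ARGB words in ScaleARGBCols_LSX), and the
// fetched values are packed into the lanes of the output vector.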
void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
int x;
int len = dst_width / 4;
(void)src_stride;
__m128i src0, src1, dst0;
for (x = 0; x < len; x++) {
DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
dst0 = __lsx_vpickod_w(src1, src0);
__lsx_vst(dst0, dst_argb, 0);
src_argb += 32;
dst_argb += 16;
}
}
void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
int x;
int len = dst_width / 4;
(void)src_stride;
__m128i src0, src1, tmp0, tmp1, dst0;
for (x = 0; x < len; x++) {
DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
tmp0 = __lsx_vpickev_w(src1, src0);
tmp1 = __lsx_vpickod_w(src1, src0);
dst0 = __lsx_vavgr_bu(tmp1, tmp0);
__lsx_vst(dst0, dst_argb, 0);
src_argb += 32;
dst_argb += 16;
}
}
void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
int x;
int len = dst_width / 4;
const uint8_t* s = src_argb;
const uint8_t* t = src_argb + src_stride;
__m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0;
__m128i reg0, reg1, reg2, reg3;
__m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08};
for (x = 0; x < len; x++) {
DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1);
DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3);
DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2,
shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3);
DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
tmp3, reg0, reg1, reg2, reg3);
DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1);
dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
__lsx_vst(dst0, dst_argb, 0);
s += 32;
t += 32;
dst_argb += 16;
}
}
void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_argb,
int dst_width) {
int x;
int len = dst_width / 4;
int32_t stepx = src_stepx << 2;
(void)src_stride;
__m128i dst0, dst1, dst2, dst3;
for (x = 0; x < len; x++) {
dst0 = __lsx_vldrepl_w(src_argb, 0);
src_argb += stepx;
dst1 = __lsx_vldrepl_w(src_argb, 0);
src_argb += stepx;
dst2 = __lsx_vldrepl_w(src_argb, 0);
src_argb += stepx;
dst3 = __lsx_vldrepl_w(src_argb, 0);
src_argb += stepx;
__lsx_vstelm_w(dst0, dst_argb, 0, 0);
__lsx_vstelm_w(dst1, dst_argb, 4, 0);
__lsx_vstelm_w(dst2, dst_argb, 8, 0);
__lsx_vstelm_w(dst3, dst_argb, 12, 0);
dst_argb += 16;
}
}
void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_argb,
int dst_width) {
int x;
int len = dst_width / 4;
int32_t stepx = src_stepx * 4;
const uint8_t* next_argb = src_argb + src_stride;
__m128i src0, src1, src2, src3;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m128i reg0, reg1, dst0;
for (x = 0; x < len; x++) {
tmp0 = __lsx_vldrepl_d(src_argb, 0);
src_argb += stepx;
tmp1 = __lsx_vldrepl_d(src_argb, 0);
src_argb += stepx;
tmp2 = __lsx_vldrepl_d(src_argb, 0);
src_argb += stepx;
tmp3 = __lsx_vldrepl_d(src_argb, 0);
src_argb += stepx;
tmp4 = __lsx_vldrepl_d(next_argb, 0);
next_argb += stepx;
tmp5 = __lsx_vldrepl_d(next_argb, 0);
next_argb += stepx;
tmp6 = __lsx_vldrepl_d(next_argb, 0);
next_argb += stepx;
tmp7 = __lsx_vldrepl_d(next_argb, 0);
next_argb += stepx;
DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
src0, src1, src2, src3);
DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1);
DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5);
DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1);
dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
dst0 = __lsx_vshuf4i_b(dst0, 0xD8);
__lsx_vst(dst0, dst_argb, 0);
dst_argb += 16;
}
}
void ScaleRowDown2_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
int len = dst_width / 32;
__m128i src0, src1, src2, src3, dst0, dst1;
(void)src_stride;
for (x = 0; x < len; x++) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
src0, src1, src2, src3);
DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst0, dst1);
__lsx_vst(dst0, dst, 0);
__lsx_vst(dst1, dst, 16);
src_ptr += 64;
dst += 32;
}
}
void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
int len = dst_width / 32;
__m128i src0, src1, src2, src3;
__m128i tmp0, tmp1, tmp2, tmp3, dst0, dst1;
(void)src_stride;
for (x = 0; x < len; x++) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
src0, src1, src2, src3);
DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp1, tmp2, tmp3, dst0, dst1);
__lsx_vst(dst0, dst, 0);
__lsx_vst(dst1, dst, 16);
src_ptr += 64;
dst += 32;
}
}
void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
int len = dst_width / 32;
const uint8_t* src_nex = src_ptr + src_stride;
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m128i dst0, dst1;
for (x = 0; x < len; x++) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
src0, src1, src2, src3);
DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
src4, src5, src6, src7);
DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
src7, tmp0, tmp2, tmp4, tmp6);
DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
src7, tmp1, tmp3, tmp5, tmp7);
DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
tmp0, tmp1, tmp2, tmp3);
DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1);
__lsx_vst(dst0, dst, 0);
__lsx_vst(dst1, dst, 16);
src_ptr += 64;
src_nex += 64;
dst += 32;
}
}
void ScaleRowDown4_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
int len = dst_width / 16;
__m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
(void)src_stride;
for (x = 0; x < len; x++) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
src0, src1, src2, src3);
DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp1);
dst0 = __lsx_vpickod_b(tmp1, tmp0);
__lsx_vst(dst0, dst, 0);
src_ptr += 64;
dst += 16;
}
}
void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
int len = dst_width / 16;
const uint8_t* ptr1 = src_ptr + src_stride;
const uint8_t* ptr2 = ptr1 + src_stride;
const uint8_t* ptr3 = ptr2 + src_stride;
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0;
for (x = 0; x < len; x++) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
src0, src1, src2, src3);
DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48, src4, src5,
src6, src7);
DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
src7, tmp0, tmp2, tmp4, tmp6);
DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
src7, tmp1, tmp3, tmp5, tmp7);
DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
reg0, reg1, reg2, reg3);
DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48, src0, src1,
src2, src3);
DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48, src4, src5,
src6, src7);
DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
src7, tmp0, tmp2, tmp4, tmp6);
DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
src7, tmp1, tmp3, tmp5, tmp7);
DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
reg4, reg5, reg6, reg7);
DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
reg0, reg1, reg2, reg3);
DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
reg3, reg0, reg1, reg2, reg3);
DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1);
dst0 = __lsx_vpickev_b(tmp1, tmp0);
__lsx_vst(dst0, dst, 0);
src_ptr += 64;
ptr1 += 64;
ptr2 += 64;
ptr3 += 64;
dst += 16;
}
}
void ScaleRowDown38_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x, len;
__m128i src0, src1, tmp0;
__m128i shuff = {0x13100E0B08060300, 0x000000001E1B1816};
assert(dst_width % 3 == 0);
len = dst_width / 12;
(void)src_stride;
for (x = 0; x < len; x++) {
DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
tmp0 = __lsx_vshuf_b(src1, src0, shuff);
__lsx_vstelm_d(tmp0, dst, 0, 0);
__lsx_vstelm_w(tmp0, dst, 8, 2);
src_ptr += 32;
dst += 12;
}
}
void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
int x, len;
const uint8_t* src_nex = src_ptr + src_stride;
__m128i src0, src1, src2, src3, dst0;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m128i reg0, reg1, reg2, reg3;
__m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
__m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA);
__m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000);
assert((dst_width % 3 == 0) && (dst_width > 0));
len = dst_width / 12;
for (x = 0; x < len; x++) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16, src0,
src1, src2, src3);
DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
tmp4 = __lsx_vpickev_w(reg3, reg2);
tmp5 = __lsx_vadd_h(reg0, reg1);
tmp6 = __lsx_vadd_h(tmp5, tmp4);
tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA);
tmp0 = __lsx_vpickod_w(reg3, reg2);
tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
tmp2 = __lsx_vmul_w(tmp1, const_0x4000);
dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
__lsx_vstelm_d(dst0, dst_ptr, 0, 0);
__lsx_vstelm_w(dst0, dst_ptr, 8, 2);
src_ptr += 32;
src_nex += 32;
dst_ptr += 12;
}
}
void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
int x, len;
const uint8_t* ptr1 = src_ptr + src_stride;
const uint8_t* ptr2 = ptr1 + src_stride;
__m128i src0, src1, src2, src3, src4, src5;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m128i reg0, reg1, reg2, reg3, dst0;
__m128i zero = __lsx_vldi(0);
__m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
__m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71);
__m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA);
assert((dst_width % 3 == 0) && (dst_width > 0));
len = dst_width / 12;
for (x = 0; x < len; x++) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16, src0, src1,
src2, src3);
DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5);
DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6);
DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7);
DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
tmp0, tmp1, tmp2, tmp3);
DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
tmp4 = __lsx_vpickev_w(reg3, reg2);
tmp5 = __lsx_vadd_h(reg0, reg1);
tmp6 = __lsx_vadd_h(tmp5, tmp4);
tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71);
tmp0 = __lsx_vpickod_w(reg3, reg2);
tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA);
dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
__lsx_vstelm_d(dst0, dst_ptr, 0, 0);
__lsx_vstelm_w(dst0, dst_ptr, 8, 2);
src_ptr += 32;
ptr1 += 32;
ptr2 += 32;
dst_ptr += 12;
}
}
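// Illustrative note (not part of the original file): the multiplier constants
// used by the two box filters above are 16.16 fixed-point reciprocals.
// Multiplying a box sum by the constant and keeping the upper 16 bits of the
// product divides it by the box size without an integer division:
//   0x2AAA = 10922 ~= 65536 / 6  (boxes of 6 pixels)
//   0x4000 = 16384  = 65536 / 4  (boxes of 4 pixels)
//   0x1C71 =  7281 ~= 65536 / 9  (boxes of 9 pixels)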
void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
int x;
int len = src_width / 16;
__m128i src0, tmp0, tmp1, dst0, dst1;
__m128i zero = __lsx_vldi(0);
assert(src_width > 0);
for (x = 0; x < len; x++) {
src0 = __lsx_vld(src_ptr, 0);
DUP2_ARG2(__lsx_vld, dst_ptr, 0, dst_ptr, 16, dst0, dst1);
tmp0 = __lsx_vilvl_b(zero, src0);
tmp1 = __lsx_vilvh_b(zero, src0);
DUP2_ARG2(__lsx_vadd_h, dst0, tmp0, dst1, tmp1, dst0, dst1);
__lsx_vst(dst0, dst_ptr, 0);
__lsx_vst(dst1, dst_ptr, 16);
src_ptr += 16;
dst_ptr += 16;
}
}
void ScaleFilterCols_LSX(uint8_t* dst_ptr,
const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
int j;
int len = dst_width / 16;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
__m128i vec0, vec1, dst0;
__m128i vec_x = __lsx_vreplgr2vr_w(x);
__m128i vec_dx = __lsx_vreplgr2vr_w(dx);
__m128i const1 = __lsx_vreplgr2vr_w(0xFFFF);
__m128i const2 = __lsx_vreplgr2vr_w(0x40);
__m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
vec0 = __lsx_vmul_w(vec_dx, const_tmp);
vec1 = __lsx_vslli_w(vec_dx, 2);
vec_x = __lsx_vadd_w(vec_x, vec0);
for (j = 0; j < len; j++) {
tmp0 = __lsx_vsrai_w(vec_x, 16);
tmp4 = __lsx_vand_v(vec_x, const1);
vec_x = __lsx_vadd_w(vec_x, vec1);
tmp1 = __lsx_vsrai_w(vec_x, 16);
tmp5 = __lsx_vand_v(vec_x, const1);
vec_x = __lsx_vadd_w(vec_x, vec1);
tmp2 = __lsx_vsrai_w(vec_x, 16);
tmp6 = __lsx_vand_v(vec_x, const1);
vec_x = __lsx_vadd_w(vec_x, vec1);
tmp3 = __lsx_vsrai_w(vec_x, 16);
tmp7 = __lsx_vand_v(vec_x, const1);
vec_x = __lsx_vadd_w(vec_x, vec1);
DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9, tmp4, tmp5,
tmp6, tmp7);
LOAD_DATA(src_ptr, tmp0, reg0);
LOAD_DATA(src_ptr, tmp1, reg1);
LOAD_DATA(src_ptr, tmp2, reg2);
LOAD_DATA(src_ptr, tmp3, reg3);
DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1, tmp0, tmp1,
tmp2, tmp3);
LOAD_DATA(src_ptr, tmp0, reg4);
LOAD_DATA(src_ptr, tmp1, reg5);
LOAD_DATA(src_ptr, tmp2, reg6);
LOAD_DATA(src_ptr, tmp3, reg7);
DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3,
reg4, reg5, reg6, reg7);
DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7,
reg4, reg5, reg6, reg7);
DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7,
const2, reg4, reg5, reg6, reg7);
DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7, reg4, reg5,
reg6, reg7);
DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
reg0, reg1, reg2, reg3);
DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1);
dst0 = __lsx_vpickev_b(tmp1, tmp0);
__lsx_vst(dst0, dst_ptr, 0);
dst_ptr += 16;
}
}
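// Illustrative scalar sketch (not part of the original file): the per-pixel
// arithmetic the vector loop above performs. `x` and `dx` are 16.16
// fixed-point source positions; the integer part selects a source pixel pair
// and the top 7 bits of the fraction blend them with rounding. Hypothetical
// reference only.
static void ScaleFilterCols_Ref(uint8_t* dst_ptr,
                                const uint8_t* src_ptr,
                                int dst_width,
                                int x,
                                int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;            // integer source index
    int f = (x & 0xFFFF) >> 9;   // 7-bit blend fraction
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)(a + (((b - a) * f + 64) >> 7));
    x += dx;
  }
}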
void ScaleARGBCols_LSX(uint8_t* dst_argb,
const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
const uint32_t* src = (const uint32_t*)src_argb;
uint32_t* dst = (uint32_t*)dst_argb;
int j;
int len = dst_width / 4;
__m128i tmp0, tmp1, tmp2, dst0;
__m128i vec_x = __lsx_vreplgr2vr_w(x);
__m128i vec_dx = __lsx_vreplgr2vr_w(dx);
__m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
tmp0 = __lsx_vmul_w(vec_dx, const_tmp);
tmp1 = __lsx_vslli_w(vec_dx, 2);
vec_x = __lsx_vadd_w(vec_x, tmp0);
for (j = 0; j < len; j++) {
tmp2 = __lsx_vsrai_w(vec_x, 16);
vec_x = __lsx_vadd_w(vec_x, tmp1);
LOAD_DATA(src, tmp2, dst0);
__lsx_vst(dst0, dst, 0);
dst += 4;
}
}
void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
const uint32_t* src = (const uint32_t*)src_argb;
int j;
int len = dst_width / 8;
__m128i src0, src1, src2, src3;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
__m128i vec0, vec1, dst0, dst1;
__m128i vec_x = __lsx_vreplgr2vr_w(x);
__m128i vec_dx = __lsx_vreplgr2vr_w(dx);
__m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
__m128i const_7f = __lsx_vldi(0x7F);
vec0 = __lsx_vmul_w(vec_dx, const_tmp);
vec1 = __lsx_vslli_w(vec_dx, 2);
vec_x = __lsx_vadd_w(vec_x, vec0);
for (j = 0; j < len; j++) {
tmp0 = __lsx_vsrai_w(vec_x, 16);
reg0 = __lsx_vsrai_w(vec_x, 9);
vec_x = __lsx_vadd_w(vec_x, vec1);
tmp1 = __lsx_vsrai_w(vec_x, 16);
reg1 = __lsx_vsrai_w(vec_x, 9);
vec_x = __lsx_vadd_w(vec_x, vec1);
DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1);
DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1);
DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3);
DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6);
DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7);
LOAD_DATA(src, tmp0, src0);
LOAD_DATA(src, tmp1, src1);
DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1);
LOAD_DATA(src, tmp0, src2);
LOAD_DATA(src, tmp1, src3);
DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6);
DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7);
DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7,
tmp0, tmp1, tmp2, tmp3);
DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1);
__lsx_vst(dst0, dst_argb, 0);
__lsx_vst(dst1, dst_argb, 16);
dst_argb += 32;
}
}
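// Illustrative note (not part of the original file): per ARGB pixel the loop
// above computes a 7-bit weighted blend of two neighbouring source pixels,
//   dst = (a * (127 - f) + b * f) >> 7
// applied to each of the four channels, where a = src[x >> 16],
// b = src[(x >> 16) + 1] and f is the top 7 bits of the 16.16 fraction
// ((x >> 9) & 0x7f), with x stepped by dx for every output pixel.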
void ScaleRowDown34_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
(void)src_stride;
__m128i src0, src1, src2, src3;
__m128i dst0, dst1, dst2;
__m128i shuff0 = {0x0908070504030100, 0x141311100F0D0C0B};
__m128i shuff1 = {0x0F0D0C0B09080705, 0x1918171514131110};
__m128i shuff2 = {0x141311100F0D0C0B, 0x1F1D1C1B19181715};
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 48) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
src0, src1, src2, src3);
DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0, src2, src1, shuff1, dst0,
dst1);
dst2 = __lsx_vshuf_b(src3, src2, shuff2);
__lsx_vst(dst0, dst, 0);
__lsx_vst(dst1, dst, 16);
__lsx_vst(dst2, dst, 32);
src_ptr += 64;
dst += 48;
}
}
void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* d,
int dst_width) {
const uint8_t* src_nex = src_ptr + src_stride;
int x;
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
__m128i tmp10, tmp11, dst0, dst1, dst2;
__m128i const0 = {0x0103030101010103, 0x0101010303010101};
__m128i const1 = {0x0301010101030301, 0x0103030101010103};
__m128i const2 = {0x0101010303010101, 0x0301010101030301};
__m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
__m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
__m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
__m128i shift0 = {0x0002000200010002, 0x0001000200020001};
__m128i shift1 = {0x0002000100020002, 0x0002000200010002};
__m128i shift2 = {0x0001000200020001, 0x0002000100020002};
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 48) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
src0, src1, src2, src3);
DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
src4, src5, src6, src7);
DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
const0, src0, src1, src2, src3);
DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
const1, src4, src5, src6, src7);
DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
const2, tmp0, tmp1, tmp2, tmp3);
DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
shift0, src0, src1, src2, src3);
DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
shift1, src4, src5, src6, src7);
DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
shift2, tmp0, tmp1, tmp2, tmp3);
DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1, tmp5, tmp6,
tmp7, tmp8);
DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10);
DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8,
src0, src1, src2, src3);
DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5);
DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
src0, src1, src2, src3);
DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1);
dst2 = __lsx_vsrarni_b_h(src5, src4, 2);
__lsx_vst(dst0, d, 0);
__lsx_vst(dst1, d, 16);
__lsx_vst(dst2, d, 32);
src_ptr += 64;
src_nex += 64;
d += 48;
}
}
void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* d,
int dst_width) {
const uint8_t* src_nex = src_ptr + src_stride;
int x;
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
__m128i tmp10, tmp11, dst0, dst1, dst2;
__m128i const0 = {0x0103030101010103, 0x0101010303010101};
__m128i const1 = {0x0301010101030301, 0x0103030101010103};
__m128i const2 = {0x0101010303010101, 0x0301010101030301};
__m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
__m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
__m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
__m128i shift0 = {0x0002000200010002, 0x0001000200020001};
__m128i shift1 = {0x0002000100020002, 0x0002000200010002};
__m128i shift2 = {0x0001000200020001, 0x0002000100020002};
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 48) {
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
src0, src1, src2, src3);
DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
src4, src5, src6, src7);
DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
const0, src0, src1, src2, src3);
DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
const1, src4, src5, src6, src7);
DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
const2, tmp0, tmp1, tmp2, tmp3);
DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
shift0, src0, src1, src2, src3);
DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
shift1, src4, src5, src6, src7);
DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
shift2, tmp0, tmp1, tmp2, tmp3);
DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
src0, src1, src2, src3);
DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1);
dst2 = __lsx_vsrarni_b_h(src5, src4, 1);
__lsx_vst(dst0, d, 0);
__lsx_vst(dst1, d, 16);
__lsx_vst(dst2, d, 32);
src_ptr += 64;
src_nex += 64;
d += 48;
}
}
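// Illustrative note (not part of the original file): the two 3/4 box filters
// above share the same structure. Horizontally, every 4 source pixels produce
// 3 outputs using the (3,1), (1,1), (1,3) weight pairs encoded in const0..2
// and the rounding shifts in shift0..2, roughly
//   d0 = (3 * s0 + s1 + 2) >> 2;
//   d1 = (s1 + s2 + 1) >> 1;
//   d2 = (s2 + 3 * s3 + 2) >> 2;
// Vertically, the _0_ variant weights the two source rows 3:1 before the
// final >> 2, while the _1_ variant averages them 1:1 with the final >> 1.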
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)


@@ -1,949 +0,0 @@
/*
* Copyright 2016 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "scale_row.h"
// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "macros_msa.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define LOAD_INDEXED_DATA(srcp, indx0, out0) \
{ \
out0[0] = srcp[indx0[0]]; \
out0[1] = srcp[indx0[1]]; \
out0[2] = srcp[indx0[2]]; \
out0[3] = srcp[indx0[3]]; \
}
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
int x;
v16u8 src0, src1, dst0;
(void)src_stride;
for (x = 0; x < dst_width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
ST_UB(dst0, dst_argb);
src_argb += 32;
dst_argb += 16;
}
}
void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
int x;
v16u8 src0, src1, vec0, vec1, dst0;
(void)src_stride;
for (x = 0; x < dst_width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
ST_UB(dst0, dst_argb);
src_argb += 32;
dst_argb += 16;
}
}
void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
int x;
const uint8_t* s = src_argb;
const uint8_t* t = src_argb + src_stride;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
v8u16 reg0, reg1, reg2, reg3;
v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
for (x = 0; x < dst_width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
reg0 = __msa_hadd_u_h(vec0, vec0);
reg1 = __msa_hadd_u_h(vec1, vec1);
reg2 = __msa_hadd_u_h(vec2, vec2);
reg3 = __msa_hadd_u_h(vec3, vec3);
reg0 += reg2;
reg1 += reg3;
reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
ST_UB(dst0, dst_argb);
s += 32;
t += 32;
dst_argb += 16;
}
}
void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_argb,
int dst_width) {
int x;
int32_t stepx = src_stepx * 4;
int32_t data0, data1, data2, data3;
(void)src_stride;
for (x = 0; x < dst_width; x += 4) {
data0 = LW(src_argb);
data1 = LW(src_argb + stepx);
data2 = LW(src_argb + stepx * 2);
data3 = LW(src_argb + stepx * 3);
SW(data0, dst_argb);
SW(data1, dst_argb + 4);
SW(data2, dst_argb + 8);
SW(data3, dst_argb + 12);
src_argb += stepx * 4;
dst_argb += 16;
}
}
void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_argb,
int dst_width) {
int x;
const uint8_t* nxt_argb = src_argb + src_stride;
int32_t stepx = src_stepx * 4;
int64_t data0, data1, data2, data3;
v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
v16u8 vec0, vec1, vec2, vec3;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 dst0;
for (x = 0; x < dst_width; x += 4) {
data0 = LD(src_argb);
data1 = LD(src_argb + stepx);
data2 = LD(src_argb + stepx * 2);
data3 = LD(src_argb + stepx * 3);
src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
data0 = LD(nxt_argb);
data1 = LD(nxt_argb + stepx);
data2 = LD(nxt_argb + stepx * 2);
data3 = LD(nxt_argb + stepx * 3);
src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
reg0 = __msa_hadd_u_h(vec0, vec0);
reg1 = __msa_hadd_u_h(vec1, vec1);
reg2 = __msa_hadd_u_h(vec2, vec2);
reg3 = __msa_hadd_u_h(vec3, vec3);
reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
reg4 += reg6;
reg5 += reg7;
reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
ST_UB(dst0, dst_argb);
src_argb += stepx * 4;
nxt_argb += stepx * 4;
dst_argb += 16;
}
}
void ScaleRowDown2_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
(void)src_stride;
for (x = 0; x < dst_width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst, 16);
src_ptr += 64;
dst += 32;
}
}
void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
(void)src_stride;
for (x = 0; x < dst_width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
dst0 = __msa_aver_u_b(vec1, vec0);
dst1 = __msa_aver_u_b(vec3, vec2);
ST_UB2(dst0, dst1, dst, 16);
src_ptr += 64;
dst += 32;
}
}
void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
v8u16 vec0, vec1, vec2, vec3;
for (x = 0; x < dst_width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
vec0 = __msa_hadd_u_h(src0, src0);
vec1 = __msa_hadd_u_h(src1, src1);
vec2 = __msa_hadd_u_h(src2, src2);
vec3 = __msa_hadd_u_h(src3, src3);
vec0 += __msa_hadd_u_h(src4, src4);
vec1 += __msa_hadd_u_h(src5, src5);
vec2 += __msa_hadd_u_h(src6, src6);
vec3 += __msa_hadd_u_h(src7, src7);
vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
ST_UB2(dst0, dst1, dst, 16);
s += 64;
t += 64;
dst += 32;
}
}
void ScaleRowDown4_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
(void)src_stride;
for (x = 0; x < dst_width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst);
src_ptr += 64;
dst += 16;
}
}
void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
const uint8_t* s = src_ptr;
const uint8_t* t0 = s + src_stride;
const uint8_t* t1 = s + src_stride * 2;
const uint8_t* t2 = s + src_stride * 3;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
v8u16 vec0, vec1, vec2, vec3;
v4u32 reg0, reg1, reg2, reg3;
for (x = 0; x < dst_width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
vec0 = __msa_hadd_u_h(src0, src0);
vec1 = __msa_hadd_u_h(src1, src1);
vec2 = __msa_hadd_u_h(src2, src2);
vec3 = __msa_hadd_u_h(src3, src3);
vec0 += __msa_hadd_u_h(src4, src4);
vec1 += __msa_hadd_u_h(src5, src5);
vec2 += __msa_hadd_u_h(src6, src6);
vec3 += __msa_hadd_u_h(src7, src7);
src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
vec0 += __msa_hadd_u_h(src0, src0);
vec1 += __msa_hadd_u_h(src1, src1);
vec2 += __msa_hadd_u_h(src2, src2);
vec3 += __msa_hadd_u_h(src3, src3);
vec0 += __msa_hadd_u_h(src4, src4);
vec1 += __msa_hadd_u_h(src5, src5);
vec2 += __msa_hadd_u_h(src6, src6);
vec3 += __msa_hadd_u_h(src7, src7);
reg0 = __msa_hadd_u_w(vec0, vec0);
reg1 = __msa_hadd_u_w(vec1, vec1);
reg2 = __msa_hadd_u_w(vec2, vec2);
reg3 = __msa_hadd_u_w(vec3, vec3);
reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst);
s += 64;
t0 += 64;
t1 += 64;
t2 += 64;
dst += 16;
}
}
void ScaleRowDown38_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x, width;
uint64_t dst0;
uint32_t dst1;
v16u8 src0, src1, vec0;
v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
(void)src_stride;
assert(dst_width % 3 == 0);
width = dst_width / 3;
for (x = 0; x < width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
dst0 = __msa_copy_u_d((v2i64)vec0, 0);
dst1 = __msa_copy_u_w((v4i32)vec0, 2);
SD(dst0, dst);
SW(dst1, dst + 8);
src_ptr += 32;
dst += 12;
}
}
void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
int x, width;
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
uint64_t dst0;
uint32_t dst1;
v16u8 src0, src1, src2, src3, out;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
v8i16 zero = {0};
v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);
assert((dst_width % 3 == 0) && (dst_width > 0));
width = dst_width / 3;
for (x = 0; x < width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
tmp0 = __msa_hadd_u_w(vec4, vec4);
tmp1 = __msa_hadd_u_w(vec5, vec5);
tmp2 = __msa_hadd_u_w(vec6, vec6);
tmp3 = __msa_hadd_u_w(vec7, vec7);
tmp4 = __msa_hadd_u_w(vec0, vec0);
vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
tmp0 = __msa_hadd_u_w(vec0, vec0);
tmp1 = __msa_hadd_u_w(vec1, vec1);
tmp0 *= const_0x2AAA;
tmp1 *= const_0x2AAA;
tmp4 *= const_0x4000;
tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
dst0 = __msa_copy_u_d((v2i64)out, 0);
dst1 = __msa_copy_u_w((v4i32)out, 2);
SD(dst0, dst_ptr);
SW(dst1, dst_ptr + 8);
s += 32;
t += 32;
dst_ptr += 12;
}
}
void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
int x, width;
const uint8_t* s = src_ptr;
const uint8_t* t0 = s + src_stride;
const uint8_t* t1 = s + src_stride * 2;
uint64_t dst0;
uint32_t dst1;
v16u8 src0, src1, src2, src3, src4, src5, out;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
v8u16 zero = {0};
v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
assert((dst_width % 3 == 0) && (dst_width > 0));
width = dst_width / 3;
for (x = 0; x < width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
tmp0 = __msa_hadd_u_w(vec4, vec4);
tmp1 = __msa_hadd_u_w(vec5, vec5);
tmp2 = __msa_hadd_u_w(vec6, vec6);
tmp3 = __msa_hadd_u_w(vec7, vec7);
tmp4 = __msa_hadd_u_w(vec0, vec0);
vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
tmp0 = __msa_hadd_u_w(vec0, vec0);
tmp1 = __msa_hadd_u_w(vec1, vec1);
tmp0 *= const_0x1C71;
tmp1 *= const_0x1C71;
tmp4 *= const_0x2AAA;
tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
dst0 = __msa_copy_u_d((v2i64)out, 0);
dst1 = __msa_copy_u_w((v4i32)out, 2);
SD(dst0, dst_ptr);
SW(dst1, dst_ptr + 8);
s += 32;
t0 += 32;
t1 += 32;
dst_ptr += 12;
}
}
void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
int x;
v16u8 src0;
v8u16 dst0, dst1;
v16i8 zero = {0};
assert(src_width > 0);
for (x = 0; x < src_width; x += 16) {
src0 = LD_UB(src_ptr);
dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
ST_UH2(dst0, dst1, dst_ptr, 8);
src_ptr += 16;
dst_ptr += 16;
}
}
void ScaleFilterCols_MSA(uint8_t* dst_ptr,
const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
int j;
v4i32 vec_x = __msa_fill_w(x);
v4i32 vec_dx = __msa_fill_w(dx);
v4i32 vec_const = {0, 1, 2, 3};
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
v8u16 reg0, reg1;
v16u8 dst0;
v4i32 const_0xFFFF = __msa_fill_w(0xFFFF);
v4i32 const_0x40 = __msa_fill_w(0x40);
vec0 = vec_dx * vec_const;
vec1 = vec_dx * 4;
vec_x += vec0;
for (j = 0; j < dst_width - 1; j += 16) {
vec2 = vec_x >> 16;
vec6 = vec_x & const_0xFFFF;
vec_x += vec1;
vec3 = vec_x >> 16;
vec7 = vec_x & const_0xFFFF;
vec_x += vec1;
vec4 = vec_x >> 16;
vec8 = vec_x & const_0xFFFF;
vec_x += vec1;
vec5 = vec_x >> 16;
vec9 = vec_x & const_0xFFFF;
vec_x += vec1;
vec6 >>= 9;
vec7 >>= 9;
vec8 >>= 9;
vec9 >>= 9;
LOAD_INDEXED_DATA(src_ptr, vec2, tmp0);
LOAD_INDEXED_DATA(src_ptr, vec3, tmp1);
LOAD_INDEXED_DATA(src_ptr, vec4, tmp2);
LOAD_INDEXED_DATA(src_ptr, vec5, tmp3);
vec2 += 1;
vec3 += 1;
vec4 += 1;
vec5 += 1;
LOAD_INDEXED_DATA(src_ptr, vec2, tmp4);
LOAD_INDEXED_DATA(src_ptr, vec3, tmp5);
LOAD_INDEXED_DATA(src_ptr, vec4, tmp6);
LOAD_INDEXED_DATA(src_ptr, vec5, tmp7);
tmp4 -= tmp0;
tmp5 -= tmp1;
tmp6 -= tmp2;
tmp7 -= tmp3;
tmp4 *= vec6;
tmp5 *= vec7;
tmp6 *= vec8;
tmp7 *= vec9;
tmp4 += const_0x40;
tmp5 += const_0x40;
tmp6 += const_0x40;
tmp7 += const_0x40;
tmp4 >>= 7;
tmp5 >>= 7;
tmp6 >>= 7;
tmp7 >>= 7;
tmp0 += tmp4;
tmp1 += tmp5;
tmp2 += tmp6;
tmp3 += tmp7;
reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
__msa_st_b(dst0, dst_ptr, 0);
dst_ptr += 16;
}
}
void ScaleARGBCols_MSA(uint8_t* dst_argb,
const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
const uint32_t* src = (const uint32_t*)(src_argb);
uint32_t* dst = (uint32_t*)(dst_argb);
int j;
v4i32 x_vec = __msa_fill_w(x);
v4i32 dx_vec = __msa_fill_w(dx);
v4i32 const_vec = {0, 1, 2, 3};
v4i32 vec0, vec1, vec2;
v4i32 dst0;
vec0 = dx_vec * const_vec;
vec1 = dx_vec * 4;
x_vec += vec0;
for (j = 0; j < dst_width; j += 4) {
vec2 = x_vec >> 16;
x_vec += vec1;
LOAD_INDEXED_DATA(src, vec2, dst0);
__msa_st_w(dst0, dst, 0);
dst += 4;
}
}
void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
const uint32_t* src = (const uint32_t*)(src_argb);
int j;
v4u32 src0, src1, src2, src3;
v4u32 vec0, vec1, vec2, vec3;
v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 mult0, mult1, mult2, mult3;
v8u16 tmp0, tmp1, tmp2, tmp3;
v16u8 dst0, dst1;
v4u32 vec_x = (v4u32)__msa_fill_w(x);
v4u32 vec_dx = (v4u32)__msa_fill_w(dx);
v4u32 vec_const = {0, 1, 2, 3};
v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f);
vec0 = vec_dx * vec_const;
vec1 = vec_dx * 4;
vec_x += vec0;
for (j = 0; j < dst_width - 1; j += 8) {
vec2 = vec_x >> 16;
reg0 = (v16u8)(vec_x >> 9);
vec_x += vec1;
vec3 = vec_x >> 16;
reg1 = (v16u8)(vec_x >> 9);
vec_x += vec1;
reg0 = reg0 & const_0x7f;
reg1 = reg1 & const_0x7f;
reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0);
reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0);
reg2 = reg0 ^ const_0x7f;
reg3 = reg1 ^ const_0x7f;
mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2);
mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2);
mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3);
mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3);
LOAD_INDEXED_DATA(src, vec2, src0);
LOAD_INDEXED_DATA(src, vec3, src1);
vec2 += 1;
vec3 += 1;
LOAD_INDEXED_DATA(src, vec2, src2);
LOAD_INDEXED_DATA(src, vec3, src3);
reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
tmp0 = __msa_dotp_u_h(reg4, mult0);
tmp1 = __msa_dotp_u_h(reg5, mult1);
tmp2 = __msa_dotp_u_h(reg6, mult2);
tmp3 = __msa_dotp_u_h(reg7, mult3);
tmp0 >>= 7;
tmp1 >>= 7;
tmp2 >>= 7;
tmp3 >>= 7;
dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
__msa_st_b(dst0, dst_argb, 0);
__msa_st_b(dst1, dst_argb, 16);
dst_argb += 32;
}
}
void ScaleRowDown34_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
(void)src_stride;
v16u8 src0, src1, src2, src3;
v16u8 vec0, vec1, vec2;
v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20};
v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25};
v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20,
21, 23, 24, 25, 27, 28, 29, 31};
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 48) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1);
vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2);
__msa_st_b((v16i8)vec0, dst, 0);
__msa_st_b((v16i8)vec1, dst, 16);
__msa_st_b((v16i8)vec2, dst, 32);
src_ptr += 64;
dst += 48;
}
}
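// 3/4 horizontal downscale with box filtering: adjacent source bytes are
// blended with position-dependent 3:1 or 1:1 weights, and the two source rows
// are then combined 3:1 in favour of the row pointed to by src_ptr.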
void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* d,
int dst_width) {
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
int x;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
16, 17, 17, 18, 18, 19, 20, 21};
v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 48) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
reg0 = __msa_srar_h(reg0, shft0);
reg1 = __msa_srar_h(reg1, shft1);
reg2 = __msa_srar_h(reg2, shft2);
reg3 = __msa_srar_h(reg3, shft0);
reg4 = __msa_srar_h(reg4, shft1);
reg5 = __msa_srar_h(reg5, shft2);
reg6 = __msa_srar_h(reg6, shft0);
reg7 = __msa_srar_h(reg7, shft1);
reg8 = __msa_srar_h(reg8, shft2);
reg9 = __msa_srar_h(reg9, shft0);
reg10 = __msa_srar_h(reg10, shft1);
reg11 = __msa_srar_h(reg11, shft2);
reg0 = reg0 * 3 + reg6;
reg1 = reg1 * 3 + reg7;
reg2 = reg2 * 3 + reg8;
reg3 = reg3 * 3 + reg9;
reg4 = reg4 * 3 + reg10;
reg5 = reg5 * 3 + reg11;
reg0 = __msa_srari_h(reg0, 2);
reg1 = __msa_srari_h(reg1, 2);
reg2 = __msa_srari_h(reg2, 2);
reg3 = __msa_srari_h(reg3, 2);
reg4 = __msa_srari_h(reg4, 2);
reg5 = __msa_srari_h(reg5, 2);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
__msa_st_b((v16i8)dst0, d, 0);
__msa_st_b((v16i8)dst1, d, 16);
__msa_st_b((v16i8)dst2, d, 32);
s += 64;
t += 64;
d += 48;
}
}
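// Same 3/4 horizontal downscale as above, but the two source rows are
// averaged with equal (1:1) weight.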
void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* d,
int dst_width) {
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
int x;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
16, 17, 17, 18, 18, 19, 20, 21};
v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 48) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
reg0 = __msa_srar_h(reg0, shft0);
reg1 = __msa_srar_h(reg1, shft1);
reg2 = __msa_srar_h(reg2, shft2);
reg3 = __msa_srar_h(reg3, shft0);
reg4 = __msa_srar_h(reg4, shft1);
reg5 = __msa_srar_h(reg5, shft2);
reg6 = __msa_srar_h(reg6, shft0);
reg7 = __msa_srar_h(reg7, shft1);
reg8 = __msa_srar_h(reg8, shft2);
reg9 = __msa_srar_h(reg9, shft0);
reg10 = __msa_srar_h(reg10, shft1);
reg11 = __msa_srar_h(reg11, shft2);
reg0 += reg6;
reg1 += reg7;
reg2 += reg8;
reg3 += reg9;
reg4 += reg10;
reg5 += reg11;
reg0 = __msa_srari_h(reg0, 1);
reg1 = __msa_srari_h(reg1, 1);
reg2 = __msa_srari_h(reg2, 1);
reg3 = __msa_srari_h(reg3, 1);
reg4 = __msa_srari_h(reg4, 1);
reg5 = __msa_srari_h(reg5, 1);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
__msa_st_b((v16i8)dst0, d, 0);
__msa_st_b((v16i8)dst1, d, 16);
__msa_st_b((v16i8)dst2, d, 32);
s += 64;
t += 64;
d += 48;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large