/*
 * Copyright 2024 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "rotate_row.h"
#include "row.h"

#include "basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
    defined(__aarch64__)

__arm_locally_streaming __arm_new("za") void TransposeWxH_SME(
    const uint8_t* src,
    int src_stride,
    uint8_t* dst,
    int dst_stride,
    int width,
    int height) {
  int vl;
  asm("cntb %x0" : "=r"(vl));

  do {
    const uint8_t* src2 = src;
    uint8_t* dst2 = dst;

    // Process up to VL elements per iteration of the inner loop.
    int block_height = height > vl ? vl : height;

    int width2 = width;
    do {
      const uint8_t* src3 = src2;

      // Process up to VL elements per iteration of the inner loop.
      int block_width = width2 > vl ? vl : width2;

      asm volatile(
          "mov     w12, #0                                   \n"

          // Create a predicate to handle loading partial rows.
          "whilelt p0.b, wzr, %w[block_width]                \n"

          // Load H <= VL rows into ZA0.
          "1:                                                \n"
          "ld1b    {za0h.b[w12, 0]}, p0/z, [%[src3]]         \n"
          "add     %[src3], %[src3], %[src_stride]           \n"
          "add     w12, w12, #1                              \n"
          "cmp     w12, %w[block_height]                     \n"
          "b.ne    1b                                        \n"

          // Create a predicate to handle storing partial columns.
          "whilelt p0.b, wzr, %w[block_height]               \n"
          "mov     w12, #0                                   \n"

          // Store W <= VL columns from ZA0.
          "2:                                                \n"
          "st1b    {za0v.b[w12, 0]}, p0, [%[dst2]]           \n"
          "add     %[dst2], %[dst2], %[dst_stride]           \n"
          "add     w12, w12, #1                              \n"
          "cmp     w12, %w[block_width]                      \n"
          "b.ne    2b                                        \n"
          : [src3] "+r"(src3),                        // %[src3]
            [dst2] "+r"(dst2)                         // %[dst2]
          : [src_stride] "r"((ptrdiff_t)src_stride),  // %[src_stride]
            [dst_stride] "r"((ptrdiff_t)dst_stride),  // %[dst_stride]
            [block_width] "r"(block_width),           // %[block_width]
            [block_height] "r"(block_height)          // %[block_height]
          : "cc", "memory", "p0", "w12", "za");

      src2 += vl;
      width2 -= vl;
    } while (width2 > 0);

    src += vl * src_stride;
    dst += vl;
    height -= vl;
  } while (height > 0);
}
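
// Reference semantics for the kernel above (illustrative sketch only, not part
// of the original file; the helper name is hypothetical): source byte (x, y)
// is written to destination byte (y, x), i.e. a plain byte transpose,
// processed in vl x vl tiles through ZA.
#if 0
// Hypothetical helper, for illustration only.
static void TransposeWxH_C_Sketch(const uint8_t* src,
                                  int src_stride,
                                  uint8_t* dst,
                                  int dst_stride,
                                  int width,
                                  int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}
#endif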
__arm_locally_streaming __arm_new("za") void TransposeUVWxH_SME(
    const uint8_t* src,
    int src_stride,
    uint8_t* dst_a,
    int dst_stride_a,
    uint8_t* dst_b,
    int dst_stride_b,
    int width,
    int height) {
  int vl;
  asm("cnth %x0" : "=r"(vl));

  do {
    const uint8_t* src2 = src;
    uint8_t* dst2_a = dst_a;
    uint8_t* dst2_b = dst_b;

    // Process up to VL bytes per iteration of the inner loop.
    int block_height = height > vl * 2 ? vl * 2 : height;

    int width2 = width;
    do {
      const uint8_t* src3 = src2;

      // Process up to VL 16-bit elements per iteration of the inner loop.
      int block_width = width2 > vl ? vl : width2;

      asm volatile(
          "mov     w12, #0                                   \n"

          // Create a predicate to handle loading partial rows,
          // %[block_width] is always a multiple of two here.
          "whilelt p0.b, wzr, %w[block_width]                \n"

          // Load H <= VL rows into ZA0, such that U/V components exist in
          // alternating columns.
          "1:                                                \n"
          "ld1b    {za0h.b[w12, 0]}, p0/z, [%[src]]          \n"
          "add     %[src], %[src], %[src_stride]             \n"
          "add     w12, w12, #1                              \n"
          "cmp     w12, %w[block_height]                     \n"
          "b.ne    1b                                        \n"

          // Create a predicate to handle storing partial columns.
          "whilelt p0.b, wzr, %w[block_height]               \n"
          "mov     w12, #0                                   \n"

          // Store alternating UV data from pairs of ZA0 columns.
          "2:                                                \n"
          "st1b    {za0v.b[w12, 0]}, p0, [%[dst_a]]          \n"
          "st1b    {za0v.b[w12, 1]}, p0, [%[dst_b]]          \n"
          "add     %[dst_a], %[dst_a], %[dst_stride_a]       \n"
          "add     %[dst_b], %[dst_b], %[dst_stride_b]       \n"
          "add     w12, w12, #2                              \n"
          "cmp     w12, %w[block_width]                      \n"
          "b.ne    2b                                        \n"
          : [src] "+r"(src3),                               // %[src]
            [dst_a] "+r"(dst2_a),                           // %[dst_a]
            [dst_b] "+r"(dst2_b)                            // %[dst_b]
          : [src_stride] "r"((ptrdiff_t)src_stride),        // %[src_stride]
            [dst_stride_a] "r"((ptrdiff_t)dst_stride_a),    // %[dst_stride_a]
            [dst_stride_b] "r"((ptrdiff_t)dst_stride_b),    // %[dst_stride_b]
            [block_width] "r"(block_width * 2),             // %[block_width]
            [block_height] "r"(block_height)                // %[block_height]
          : "cc", "memory", "p0", "w12", "za");

      src2 += 2 * vl;
      width2 -= vl;
    } while (width2 > 0);

    src += 2 * vl * src_stride;
    dst_a += 2 * vl;
    dst_b += 2 * vl;
    height -= 2 * vl;
  } while (height > 0);
}
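
// Reference semantics for the UV kernel above (illustrative sketch only, not
// part of the original file; the helper name is hypothetical): width is
// measured in interleaved U/V byte pairs, and the pair at (x, y) is split so
// that its U byte lands at (y, x) of dst_a and its V byte at (y, x) of dst_b.
#if 0
// Hypothetical helper, for illustration only.
static void TransposeUVWxH_C_Sketch(const uint8_t* src,
                                    int src_stride,
                                    uint8_t* dst_a,
                                    int dst_stride_a,
                                    uint8_t* dst_b,
                                    int dst_stride_b,
                                    int width,
                                    int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      dst_a[x * dst_stride_a + y] = src[y * src_stride + 2 * x + 0];  // U
      dst_b[x * dst_stride_b + y] = src[y * src_stride + 2 * x + 1];  // V
    }
  }
}
#endif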

#endif  // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
        // defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif