/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
|
#include "rotate_row.h"
|
|
#include "row.h"
|
|
|
|
#include "basic_types.h"
|
|
|
|
#ifdef __cplusplus
|
|
namespace libyuv {
|
|
extern "C" {
|
|
#endif
|
|
|
|
// This module is for GCC Neon armv8 64 bit.
|
|
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
|
|
|
// Transpose a (width x 16)-byte tile of 8-bit pixels.
// Reads 16 rows starting at `src` and writes them as 16-byte rows of the
// transposed image at `dst`; each loop iteration handles one 16x16 block
// and advances `src` 16 bytes to the right and `dst` 16 rows down.
//   src, src_stride : source pixels (16 rows deep from `src`)
//   dst, dst_stride : destination pixels (`width` rows of 16 bytes)
//   width           : columns to transpose; assumes a multiple of 16 —
//                     TODO(review): confirm the caller guarantees this.
void TransposeWx16_NEON(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst,
                        int dst_stride,
                        int width) {
  const uint8_t* src_temp;  // per-iteration row cursor for the strided loads
  asm volatile (
      "1: \n"
      "mov %[src_temp], %[src] \n"

      // Load one 16x16 block: 16 rows of 16 bytes into v16..v31,
      // stepping down one source row per load.
      "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n"
      "ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n"

      // Advance to the next 16 source columns for the following iteration.
      "add %[src], %[src], #16 \n"

      // Transpose bytes within each 2x2 block.
      "trn1 v0.16b, v16.16b, v17.16b \n"
      "trn2 v1.16b, v16.16b, v17.16b \n"
      "trn1 v2.16b, v18.16b, v19.16b \n"
      "trn2 v3.16b, v18.16b, v19.16b \n"
      "trn1 v4.16b, v20.16b, v21.16b \n"
      "trn2 v5.16b, v20.16b, v21.16b \n"
      "trn1 v6.16b, v22.16b, v23.16b \n"
      "trn2 v7.16b, v22.16b, v23.16b \n"
      "trn1 v8.16b, v24.16b, v25.16b \n"
      "trn2 v9.16b, v24.16b, v25.16b \n"
      "trn1 v10.16b, v26.16b, v27.16b \n"
      "trn2 v11.16b, v26.16b, v27.16b \n"
      "trn1 v12.16b, v28.16b, v29.16b \n"
      "trn2 v13.16b, v28.16b, v29.16b \n"
      "trn1 v14.16b, v30.16b, v31.16b \n"
      "trn2 v15.16b, v30.16b, v31.16b \n"

      // Transpose 2x2-byte blocks within each 4x4 block.
      "trn1 v16.8h, v0.8h, v2.8h \n"
      "trn1 v17.8h, v1.8h, v3.8h \n"
      "trn2 v18.8h, v0.8h, v2.8h \n"
      "trn2 v19.8h, v1.8h, v3.8h \n"
      "trn1 v20.8h, v4.8h, v6.8h \n"
      "trn1 v21.8h, v5.8h, v7.8h \n"
      "trn2 v22.8h, v4.8h, v6.8h \n"
      "trn2 v23.8h, v5.8h, v7.8h \n"
      "trn1 v24.8h, v8.8h, v10.8h \n"
      "trn1 v25.8h, v9.8h, v11.8h \n"
      "trn2 v26.8h, v8.8h, v10.8h \n"
      "trn2 v27.8h, v9.8h, v11.8h \n"
      "trn1 v28.8h, v12.8h, v14.8h \n"
      "trn1 v29.8h, v13.8h, v15.8h \n"
      "trn2 v30.8h, v12.8h, v14.8h \n"
      "trn2 v31.8h, v13.8h, v15.8h \n"

      // Decrement the column counter early so the flags are set well
      // before the b.gt at the bottom of the loop.
      "subs %w[width], %w[width], #16 \n"

      // Transpose 4x4-byte blocks within each 8x8 block.
      "trn1 v0.4s, v16.4s, v20.4s \n"
      "trn1 v2.4s, v17.4s, v21.4s \n"
      "trn1 v4.4s, v18.4s, v22.4s \n"
      "trn1 v6.4s, v19.4s, v23.4s \n"
      "trn2 v8.4s, v16.4s, v20.4s \n"
      "trn2 v10.4s, v17.4s, v21.4s \n"
      "trn2 v12.4s, v18.4s, v22.4s \n"
      "trn2 v14.4s, v19.4s, v23.4s \n"
      "trn1 v1.4s, v24.4s, v28.4s \n"
      "trn1 v3.4s, v25.4s, v29.4s \n"
      "trn1 v5.4s, v26.4s, v30.4s \n"
      "trn1 v7.4s, v27.4s, v31.4s \n"
      "trn2 v9.4s, v24.4s, v28.4s \n"
      "trn2 v11.4s, v25.4s, v29.4s \n"
      "trn2 v13.4s, v26.4s, v30.4s \n"
      "trn2 v15.4s, v27.4s, v31.4s \n"

      // Transpose 8x8-byte blocks and store.  Each st2 writes one 16-byte
      // destination row built from the low ([0]) or high ([1]) d-lane of a
      // register pair, stepping down one destination row per store.
      "st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n"
      "st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n"
      "st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n"
      "st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n"
      "st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n"
      "st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n"
      "st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n"
      "st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n"
      "st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n"
      "st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n"
      "st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n"
      "st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n"
      "st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n"
      "st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n"
      "st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n"
      "st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n"

      "b.gt 1b \n"
      : [src] "+r"(src),                            // %[src]
        [src_temp] "=&r"(src_temp),                 // %[src_temp]
        [dst] "+r"(dst),                            // %[dst]
        [width] "+r"(width)                         // %[width]
      : [src_stride] "r"((ptrdiff_t)src_stride),    // %[src_stride]
        [dst_stride] "r"((ptrdiff_t)dst_stride)     // %[dst_stride]
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
        "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
        "v29", "v30", "v31");
}
|
|
|
|
void TransposeUVWx8_NEON(const uint8_t* src,
|
|
int src_stride,
|
|
uint8_t* dst_a,
|
|
int dst_stride_a,
|
|
uint8_t* dst_b,
|
|
int dst_stride_b,
|
|
int width) {
|
|
const uint8_t* temp;
|
|
asm volatile (
|
|
// loops are on blocks of 8. loop will stop when
|
|
// counter gets to or below 0. starting the counter
|
|
// at w-8 allow for this
|
|
"sub %w[width], %w[width], #8 \n"
|
|
|
|
"1: \n"
|
|
"mov %[temp], %[src] \n"
|
|
"ld1 {v0.16b}, [%[temp]], %[src_stride] \n"
|
|
"ld1 {v1.16b}, [%[temp]], %[src_stride] \n"
|
|
"ld1 {v2.16b}, [%[temp]], %[src_stride] \n"
|
|
"ld1 {v3.16b}, [%[temp]], %[src_stride] \n"
|
|
"ld1 {v4.16b}, [%[temp]], %[src_stride] \n"
|
|
"ld1 {v5.16b}, [%[temp]], %[src_stride] \n"
|
|
"ld1 {v6.16b}, [%[temp]], %[src_stride] \n"
|
|
"ld1 {v7.16b}, [%[temp]] \n"
|
|
"add %[src], %[src], #16 \n"
|
|
|
|
"trn1 v16.16b, v0.16b, v1.16b \n"
|
|
"trn2 v17.16b, v0.16b, v1.16b \n"
|
|
"trn1 v18.16b, v2.16b, v3.16b \n"
|
|
"trn2 v19.16b, v2.16b, v3.16b \n"
|
|
"trn1 v20.16b, v4.16b, v5.16b \n"
|
|
"trn2 v21.16b, v4.16b, v5.16b \n"
|
|
"trn1 v22.16b, v6.16b, v7.16b \n"
|
|
"trn2 v23.16b, v6.16b, v7.16b \n"
|
|
|
|
"subs %w[width], %w[width], #8 \n"
|
|
|
|
"trn1 v0.8h, v16.8h, v18.8h \n"
|
|
"trn2 v1.8h, v16.8h, v18.8h \n"
|
|
"trn1 v2.8h, v20.8h, v22.8h \n"
|
|
"trn2 v3.8h, v20.8h, v22.8h \n"
|
|
"trn1 v4.8h, v17.8h, v19.8h \n"
|
|
"trn2 v5.8h, v17.8h, v19.8h \n"
|
|
"trn1 v6.8h, v21.8h, v23.8h \n"
|
|
"trn2 v7.8h, v21.8h, v23.8h \n"
|
|
|
|
"trn1 v16.4s, v0.4s, v2.4s \n"
|
|
"trn2 v17.4s, v0.4s, v2.4s \n"
|
|
"trn1 v18.4s, v1.4s, v3.4s \n"
|
|
"trn2 v19.4s, v1.4s, v3.4s \n"
|
|
"trn1 v20.4s, v4.4s, v6.4s \n"
|
|
"trn2 v21.4s, v4.4s, v6.4s \n"
|
|
"trn1 v22.4s, v5.4s, v7.4s \n"
|
|
"trn2 v23.4s, v5.4s, v7.4s \n"
|
|
|
|
"mov %[temp], %[dst_a] \n"
|
|
"st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n"
|
|
"st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n"
|
|
"st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n"
|
|
"st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n"
|
|
"st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n"
|
|
"st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n"
|
|
"st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n"
|
|
"st1 {v19.d}[1], [%[temp]] \n"
|
|
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
|
|
|
|
"mov %[temp], %[dst_b] \n"
|
|
"st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n"
|
|
"st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n"
|
|
"st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n"
|
|
"st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n"
|
|
"st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n"
|
|
"st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n"
|
|
"st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n"
|
|
"st1 {v23.d}[1], [%[temp]] \n"
|
|
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
|
|
|
|
"b.ge 1b \n"
|
|
: [temp] "=&r"(temp), // %[temp]
|
|
[src] "+r"(src), // %[src]
|
|
[dst_a] "+r"(dst_a), // %[dst_a]
|
|
[dst_b] "+r"(dst_b), // %[dst_b]
|
|
[width] "+r"(width) // %[width]
|
|
: [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride]
|
|
[dst_stride_a] "r"((ptrdiff_t)dst_stride_a), // %[dst_stride_a]
|
|
[dst_stride_b] "r"((ptrdiff_t)dst_stride_b) // %[dst_stride_b]
|
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
|
|
}
|
|
|
|
// Transpose 32 bit values (ARGB)
|
|
// Transpose 32 bit values (ARGB): treats the image as 32-bit pixels and
// transposes one 4x4-pixel tile per loop iteration by reading 4 pixels
// from each of 4 rows (deinterleaved into lanes) and writing 4 rows of 4
// pixels.
//   src, src_stride : source; three additional row pointers derived below
//   dst, dst_stride : destination; three additional row pointers derived
//   width           : loop counter, decremented by 4 per tile; assumes a
//                     multiple of 4 — TODO(review): confirm callers.
void Transpose4x4_32_NEON(const uint8_t* src,
                          int src_stride,
                          uint8_t* dst,
                          int dst_stride,
                          int width) {
  // Pointers to four consecutive source rows and destination rows.
  const uint8_t* src1 = src + src_stride;
  const uint8_t* src2 = src1 + src_stride;
  const uint8_t* src3 = src2 + src_stride;
  uint8_t* dst1 = dst + dst_stride;
  uint8_t* dst2 = dst1 + dst_stride;
  uint8_t* dst3 = dst2 + dst_stride;
  asm volatile (
      // Main loop transpose 4x4. Read a column, write a row.
      // Each ld4 pulls 4 pixels (16 bytes) from one source row,
      // deinterleaving pixel j into lane [row] of vj, so vj collects
      // source column j across the 4 rows.  The post-index %9
      // (= 4 * src_stride) then advances that row pointer to the next
      // 4-row band.
      "1: \n"
      "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
      "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
      "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"
      "ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n"
      "subs %w8, %w8, #4 \n" // w -= 4
      // Write each gathered column as a 4-pixel destination row,
      // advancing 16 bytes along the row.
      "st1 {v0.4s}, [%4], 16 \n"
      "st1 {v1.4s}, [%5], 16 \n"
      "st1 {v2.4s}, [%6], 16 \n"
      "st1 {v3.4s}, [%7], 16 \n"
      "b.gt 1b \n"
      : "+r"(src),   // %0
        "+r"(src1),  // %1
        "+r"(src2),  // %2
        "+r"(src3),  // %3
        "+r"(dst),   // %4
        "+r"(dst1),  // %5
        "+r"(dst2),  // %6
        "+r"(dst3),  // %7
        "+r"(width)  // %8
      : "r"((ptrdiff_t)(src_stride * 4))  // %9
      : "memory", "cc", "v0", "v1", "v2", "v3");
}
|
|
|
|
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
|
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
} // namespace libyuv
|
|
#endif
|