/* * Copyright 2024 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "rotate_row.h" #include "row.h" #include "basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \ defined(__aarch64__) __arm_locally_streaming __arm_new("za") void TransposeWxH_SME( const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height) { int vl; asm("cntb %x0" : "=r"(vl)); do { const uint8_t* src2 = src; uint8_t* dst2 = dst; // Process up to VL elements per iteration of the inner loop. int block_height = height > vl ? vl : height; int width2 = width; do { const uint8_t* src3 = src2; // Process up to VL elements per iteration of the inner loop. int block_width = width2 > vl ? vl : width2; asm volatile( "mov w12, #0 \n" // Create a predicate to handle loading partial rows. "whilelt p0.b, wzr, %w[block_width] \n" // Load H <= VL rows into ZA0. "1: \n" "ld1b {za0h.b[w12, 0]}, p0/z, [%[src3]] \n" "add %[src3], %[src3], %[src_stride] \n" "add w12, w12, #1 \n" "cmp w12, %w[block_height] \n" "b.ne 1b \n" // Create a predicate to handle storing partial columns. "whilelt p0.b, wzr, %w[block_height] \n" "mov w12, #0 \n" // Store W <= VL columns from ZA0. "2: \n" "st1b {za0v.b[w12, 0]}, p0, [%[dst2]] \n" "add %[dst2], %[dst2], %[dst_stride] \n" "add w12, w12, #1 \n" "cmp w12, %w[block_width] \n" "b.ne 2b \n" : [src3] "+r"(src3), // %[src3] [dst2] "+r"(dst2) // %[dst2] : [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride] [dst_stride] "r"((ptrdiff_t)dst_stride), // %[dst_stride] [block_width] "r"(block_width), // %[block_width] [block_height] "r"(block_height) // %[block_height] : "cc", "memory", "p0", "w12", "za"); src2 += vl; width2 -= vl; } while (width2 > 0); src += vl * src_stride; dst += vl; height -= vl; } while (height > 0); } __arm_locally_streaming __arm_new("za") void TransposeUVWxH_SME( const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height) { int vl; asm("cnth %x0" : "=r"(vl)); do { const uint8_t* src2 = src; uint8_t* dst2_a = dst_a; uint8_t* dst2_b = dst_b; // Process up to VL bytes per iteration of the inner loop. int block_height = height > vl * 2 ? vl * 2 : height; int width2 = width; do { const uint8_t* src3 = src2; // Process up to VL 16-bit elements per iteration of the inner loop. int block_width = width2 > vl ? vl : width2; asm volatile( "mov w12, #0 \n" // Create a predicate to handle loading partial rows, // %[block_width] is always a multiple of two here. "whilelt p0.b, wzr, %w[block_width] \n" // Load H <= VL rows into ZA0, such that U/V components exist in // alternating columns. "1: \n" "ld1b {za0h.b[w12, 0]}, p0/z, [%[src]] \n" "add %[src], %[src], %[src_stride] \n" "add w12, w12, #1 \n" "cmp w12, %w[block_height] \n" "b.ne 1b \n" // Create a predicate to handle storing partial columns. "whilelt p0.b, wzr, %w[block_height] \n" "mov w12, #0 \n" // Store alternating UV data from pairs of ZA0 columns. "2: \n" "st1b {za0v.b[w12, 0]}, p0, [%[dst_a]] \n" "st1b {za0v.b[w12, 1]}, p0, [%[dst_b]] \n" "add %[dst_a], %[dst_a], %[dst_stride_a] \n" "add %[dst_b], %[dst_b], %[dst_stride_b] \n" "add w12, w12, #2 \n" "cmp w12, %w[block_width] \n" "b.ne 2b \n" : [src] "+r"(src3), // %[src] [dst_a] "+r"(dst2_a), // %[dst_a] [dst_b] "+r"(dst2_b) // %[dst_b] : [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride] [dst_stride_a] "r"((ptrdiff_t)dst_stride_a), // %[dst_stride_a] [dst_stride_b] "r"((ptrdiff_t)dst_stride_b), // %[dst_stride_b] [block_width] "r"(block_width * 2), // %[block_width] [block_height] "r"(block_height) // %[block_height] : "cc", "memory", "p0", "w12", "za"); src2 += 2 * vl; width2 -= vl; } while (width2 > 0); src += 2 * vl * src_stride; dst_a += 2 * vl; dst_b += 2 * vl; height -= 2 * vl; } while (height > 0); } #endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && // defined(__aarch64__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif