looper/subprojects/mpg123/src/libmpg123/dct64_neon64.S

300 lines
9.1 KiB
ArmAsm
Raw Normal View History

2024-09-28 10:31:06 -07:00
/*
dct64_neon64: NEON optimized dct64 for AArch64
copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
see COPYING and AUTHORS files in distribution or http://mpg123.org
initially written by Taihei Monma
*/
#include "mangle.h"
/*
 * Multiplier table for the dct64 routine in this file.
 *
 * Every entry is the bit pattern of a single-precision float written as a
 * decimal integer (for example 1060439283 == 0x3F3504F3, about 0.70710677).
 * The routine walks the table front to back with vector loads, so the entries
 * are grouped four per line, one 16-byte vector each:
 *   words  0-15  first load  (64 bytes)
 *   words 16-23  second load (32 bytes)
 *   words 24-27  third load  (16 bytes)
 *   words 28-31  fourth load (16 bytes)
 */
#ifndef __APPLE__
	.section .rodata
#else
	.data
#endif
	ALIGN16
costab_neon_aarch64:
	/* words 0-15 */
	.word 1056974725, 1057056395, 1057223771, 1057485416
	.word 1057855544, 1058356026, 1059019886, 1059897405
	.word 1061067246, 1062657950, 1064892987, 1066774581
	.word 1069414683, 1073984175, 1079645762, 1092815430
	/* words 16-23 */
	.word 1057005197, 1057342072, 1058087743, 1059427869
	.word 1061799040, 1065862217, 1071413542, 1084439708
	/* words 24-27 */
	.word 1057128951, 1058664893, 1063675095, 1076102863
	/* words 28-31 */
	.word 1057655764, 1067924853, 1060439283, 1060439283
/*
 * INT123_dct64_neon64: dct64 butterfly network over 32 input floats,
 * producing 33 saturated signed 16-bit results.
 *
 * Register roles as used by the code below:
 *   x0  first output pointer:  17 halfwords are stored, one every 32 bytes
 *   x1  second output pointer: 16 halfwords are stored, one every 32 bytes
 *   x2  input pointer: 32 single-precision floats (128 bytes) are read
 * NOTE(review): presumably called from C as
 *   (short *out0, short *out1, real *samples) with real == float;
 *   confirm against the prototype, which is not visible in this file.
 *
 * Scratch registers written: x3, x4, v0-v7, v16-v29, v31.
 * v8-v15 are never written and the stack is not used.
 *
 * The "bs[...]" notes on individual lines name the intermediate-buffer
 * element held in each lane, lowest lane first.
 */
.text
ALIGN4
.globl ASM_NAME(INT123_dct64_neon64)
#ifdef __ELF__
.type ASM_NAME(INT123_dct64_neon64), %function
#endif
ASM_NAME(INT123_dct64_neon64):
/* x3 = second half of the input (floats 16..31); x4 = multiplier table. */
add x3, x2, #64
adrp x4, AARCH64_PCREL_HI(costab_neon_aarch64)
add x4, x4, AARCH64_PCREL_LO(costab_neon_aarch64)
/* v0-v3 = in[0..15], v16-v19 = in[16..31], v20-v23 = table words 0..15. */
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2]
ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x3]
ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x4], #64

/*
 * Stage 1: pair element i with element 31-i.
 * rev64 swaps the floats inside each 64-bit half and ext #8 swaps the two
 * halves, so together they reverse all four lanes: v4-v7 = in[31..16].
 */
rev64 v19.4s, v19.4s
rev64 v18.4s, v18.4s
rev64 v17.4s, v17.4s
rev64 v16.4s, v16.4s
ext v4.16b, v19.16b, v19.16b, #8
ext v5.16b, v18.16b, v18.16b, #8
ext v6.16b, v17.16b, v17.16b, #8
ext v7.16b, v16.16b, v16.16b, #8
/* Differences first (they overwrite v16-v19), then sums, then scale the differences. */
fsub v16.4s, v3.4s, v7.4s
fsub v17.4s, v2.4s, v6.4s
fsub v18.4s, v1.4s, v5.4s
fsub v19.4s, v0.4s, v4.4s
fadd v0.4s, v0.4s, v4.4s /* bs[0,1,2,3] */
fadd v1.4s, v1.4s, v5.4s /* bs[4,5,6,7] */
fadd v2.4s, v2.4s, v6.4s /* bs[8,9,10,11] */
fadd v3.4s, v3.4s, v7.4s /* bs[12,13,14,15] */
fmul v16.4s, v16.4s, v23.4s /* bs[19,18,17,16] */
fmul v17.4s, v17.4s, v22.4s /* bs[23,22,21,20] */
fmul v18.4s, v18.4s, v21.4s /* bs[27,26,25,24] */
fmul v19.4s, v19.4s, v20.4s /* bs[31,30,29,28] */

/*
 * Stage 2: within each group of 16, pair element i with element 15-i.
 * v20-v21 = table words 16..23, used for both groups.
 */
ld1 {v20.4s, v21.4s}, [x4], #32
rev64 v22.4s, v3.4s
rev64 v23.4s, v2.4s
rev64 v24.4s, v16.4s
rev64 v25.4s, v17.4s
ext v4.16b, v22.16b, v22.16b, #8 /* bs[15,14,13,12] */
ext v5.16b, v23.16b, v23.16b, #8 /* bs[11,10,9,8] */
ext v6.16b, v24.16b, v24.16b, #8 /* bs[16,17,18,19] */
ext v7.16b, v25.16b, v25.16b, #8 /* bs[20,21,22,23] */
/* Lower group subtracts ascending minus descending; the upper group is descending minus ascending. */
fsub v26.4s, v1.4s, v5.4s
fsub v27.4s, v0.4s, v4.4s
fsub v28.4s, v18.4s, v7.4s
fsub v29.4s, v19.4s, v6.4s
fadd v4.4s, v0.4s, v4.4s /* bs[32,33,34,35] */
fadd v5.4s, v1.4s, v5.4s /* bs[36,37,38,39] */
fadd v6.4s, v6.4s, v19.4s /* bs[48,49,50,51] */
fadd v7.4s, v7.4s, v18.4s /* bs[52,53,54,55] */
fmul v26.4s, v26.4s, v21.4s /* bs[43,42,41,40] */
fmul v27.4s, v27.4s, v20.4s /* bs[47,46,45,44] */
fmul v28.4s, v28.4s, v21.4s /* bs[59,58,57,56] */
fmul v29.4s, v29.4s, v20.4s /* bs[63,62,61,60] */

/*
 * Stage 3: within each group of 8, pair element i with element 7-i.
 * v20 = table words 24..27, shared by all four groups.
 */
ld1 {v20.4s}, [x4], #16
rev64 v16.4s, v5.4s
rev64 v17.4s, v26.4s
rev64 v18.4s, v7.4s
rev64 v19.4s, v28.4s
ext v0.16b, v16.16b, v16.16b, #8 /* bs[39,38,37,36] */
ext v1.16b, v17.16b, v17.16b, #8 /* bs[40,41,42,43] */
ext v2.16b, v18.16b, v18.16b, #8 /* bs[55,54,53,52] */
ext v3.16b, v19.16b, v19.16b, #8 /* bs[56,57,58,59] */
fsub v16.4s, v4.4s, v0.4s
fsub v17.4s, v27.4s, v1.4s
fsub v18.4s, v6.4s, v2.4s
fsub v19.4s, v29.4s, v3.4s
fadd v0.4s, v4.4s, v0.4s /* bs[0,1,2,3] */
fadd v1.4s, v1.4s, v27.4s /* bs[8,9,10,11] */
fadd v2.4s, v6.4s, v2.4s /* bs[16,17,18,19] */
fadd v3.4s, v3.4s, v29.4s /* bs[24,25,26,27] */
fmul v16.4s, v16.4s, v20.4s /* bs[7,6,5,4] */
fmul v17.4s, v17.4s, v20.4s /* bs[15,14,13,12] */
fmul v18.4s, v18.4s, v20.4s /* bs[23,22,21,20] */
fmul v19.4s, v19.4s, v20.4s /* bs[31,30,29,28] */

/*
 * Stage 4: within each group of 4, pair element i with element 3-i.
 * v28 = table words 28..31 (the post-increment is dropped: last table read).
 * The 64-bit zips put both partners of every butterfly in matching lanes.
 */
ld1 {v28.4s}, [x4]
zip1 v4.2d, v0.2d, v16.2d /* bs[0,1,7,6] */
zip2 v5.2d, v0.2d, v16.2d /* bs[2,3,5,4] */
zip1 v6.2d, v1.2d, v17.2d /* bs[8,9,15,14] */
zip2 v7.2d, v1.2d, v17.2d /* bs[10,11,13,12] */
zip1 v20.2d, v2.2d, v18.2d /* bs[16,17,23,22] */
zip2 v21.2d, v2.2d, v18.2d /* bs[18,19,21,20] */
zip1 v22.2d, v3.2d, v19.2d /* bs[24,25,31,30] */
zip2 v23.2d, v3.2d, v19.2d /* bs[26,27,29,28] */
rev64 v5.4s, v5.4s /* bs[3,2,4,5] */
rev64 v7.4s, v7.4s /* bs[11,10,12,13] */
rev64 v21.4s, v21.4s /* bs[19,18,20,21] */
rev64 v23.4s, v23.4s /* bs[27,26,28,29] */
/*
 * NOTE(review): the two macros are defined outside this file; presumably
 * v29 = 64-bit element 0 of v28 copied to both halves (words 28,29,28,29)
 * and v28 = 32-bit element 2 of v28 copied to all lanes (word 30).
 * v29 must be formed first because the second macro overwrites v28.
 */
AARCH64_DUP_2D(v29, v28, 0)
AARCH64_DUP_4S(v28, v28, 2)
fsub v16.4s, v4.4s, v5.4s
fsub v17.4s, v6.4s, v7.4s
fsub v18.4s, v20.4s, v21.4s
fsub v19.4s, v22.4s, v23.4s
fadd v0.4s, v4.4s, v5.4s /* bs[32,33,36,37] */
fadd v1.4s, v6.4s, v7.4s /* bs[40,41,44,45] */
fadd v2.4s, v20.4s, v21.4s /* bs[48,49,52,53] */
fadd v3.4s, v22.4s, v23.4s /* bs[56,57,60,61] */
fmul v16.4s, v16.4s, v29.4s /* bs[35,34,39,38] */
fmul v17.4s, v17.4s, v29.4s /* bs[43,42,47,46] */
fmul v18.4s, v18.4s, v29.4s /* bs[51,50,55,54] */
fmul v19.4s, v19.4s, v29.4s /* bs[59,58,63,62] */

/*
 * Stage 5: butterflies on adjacent pairs. uzp1/uzp2 split even and odd
 * lanes so that each pair's two members sit in the same lane of two registers.
 */
uzp1 v4.4s, v0.4s, v16.4s /* bs[32,36,35,39] */
uzp2 v5.4s, v0.4s, v16.4s /* bs[33,37,34,38] */
uzp1 v6.4s, v1.4s, v17.4s /* bs[40,44,43,47] */
uzp2 v7.4s, v1.4s, v17.4s /* bs[41,45,42,46] */
uzp1 v20.4s, v2.4s, v18.4s /* bs[48,52,51,55] */
uzp2 v21.4s, v2.4s, v18.4s /* bs[49,53,50,54] */
uzp1 v22.4s, v3.4s, v19.4s /* bs[56,60,59,63] */
uzp2 v23.4s, v3.4s, v19.4s /* bs[57,61,58,62] */
fsub v16.4s, v4.4s, v5.4s
fsub v17.4s, v6.4s, v7.4s
fsub v18.4s, v20.4s, v21.4s
fsub v19.4s, v22.4s, v23.4s
fadd v0.4s, v4.4s, v5.4s /* bs[0,4,2,6] */
fadd v1.4s, v6.4s, v7.4s /* bs[8,12,10,14] */
fadd v2.4s, v20.4s, v21.4s /* bs[16,20,18,22] */
fadd v3.4s, v22.4s, v23.4s /* bs[24,28,26,30] */
fmul v16.4s, v16.4s, v28.4s /* bs[1,5,3,7] */
fmul v17.4s, v17.4s, v28.4s /* bs[9,13,11,15] */
fmul v18.4s, v18.4s, v28.4s /* bs[17,21,19,23] */
fmul v19.4s, v19.4s, v28.4s /* bs[25,29,27,31] */

/*
 * Fold each scaled difference in the upper 64 bits (bs[3], bs[7], ...) into
 * the sum beside it (bs[2], bs[6], ...), then write the updated sums back
 * into the upper halves of v0-v3.
 */
zip2 v4.2d, v0.2d, v1.2d /* bs[2,6,10,14] */
zip2 v5.2d, v16.2d, v17.2d /* bs[3,7,11,15] */
zip2 v6.2d, v2.2d, v3.2d /* bs[18,22,26,30] */
zip2 v7.2d, v18.2d, v19.2d /* bs[19,23,27,31] */
fadd v4.4s, v4.4s, v5.4s /* bs[2,6,10,14] */
fadd v6.4s, v6.4s, v7.4s /* bs[18,22,26,30] */
ins v0.d[1], v4.d[0] /* bs[0,4,2,6] */
ins v1.d[1], v4.d[1] /* bs[8,12,10,14] */
ins v2.d[1], v6.d[0] /* bs[16,20,18,22] */
ins v3.d[1], v6.d[1] /* bs[24,28,26,30] */

/* v31 = 0; used below to clear lane 3 of the shuffled addends. */
eor v31.16b, v31.16b, v31.16b
/* Interleave sums and differences back into ascending element order. */
zip1 v4.4s, v0.4s, v16.4s /* bs[0,1,4,5] */
zip2 v5.4s, v0.4s, v16.4s /* bs[2,3,6,7] */
zip1 v6.4s, v1.4s, v17.4s /* bs[8,9,12,13] */
zip2 v7.4s, v1.4s, v17.4s /* bs[10,11,14,15] */
zip1 v20.4s, v2.4s, v18.4s /* bs[16,17,20,21] */
zip2 v21.4s, v2.4s, v18.4s /* bs[18,19,22,23] */
zip1 v22.4s, v3.4s, v19.4s /* bs[24,25,28,29] */
zip2 v23.4s, v3.4s, v19.4s /* bs[26,27,30,31] */
zip1 v0.2d, v4.2d, v5.2d /* bs[0,1,2,3] */
zip2 v1.2d, v4.2d, v5.2d /* bs[4,5,6,7] */
zip1 v2.2d, v6.2d, v7.2d /* bs[8,9,10,11] */
zip2 v3.2d, v6.2d, v7.2d /* bs[12,13,14,15] */
/*
 * Build the addend vectors for the final additions: each holds three
 * elements of one group in the order [k+2, k+3, k+1, x]; lane 3 is
 * cleared further down so the last element of each group is left unchanged.
 */
rev64 v16.4s, v4.4s
rev64 v17.4s, v6.4s
zip1 v24.2d, v7.2d, v17.2d
zip2 v16.2d, v5.2d, v16.2d
zip2 v17.2d, v7.2d, v17.2d
zip1 v4.2d, v20.2d, v21.2d /* bs[16,17,18,19] */
zip2 v5.2d, v20.2d, v21.2d /* bs[20,21,22,23] */
zip1 v6.2d, v22.2d, v23.2d /* bs[24,25,26,27] */
zip2 v7.2d, v22.2d, v23.2d /* bs[28,29,30,31] */
rev64 v18.4s, v20.4s
rev64 v19.4s, v22.4s
zip1 v25.2d, v23.2d, v19.2d
zip1 v26.2d, v21.2d, v18.2d
zip2 v18.2d, v21.2d, v18.2d
zip2 v19.2d, v23.2d, v19.2d
ins v16.s[3], v31.s[0] /* bs[6,7,5,-] */
ins v17.s[3], v31.s[0] /* bs[14,15,13,-] */
ins v18.s[3], v31.s[0] /* bs[22,23,21,-] */
ins v19.s[3], v31.s[0] /* bs[30,31,29,-] */
ins v24.s[3], v31.s[0] /* bs[10,11,9,-] */
ins v25.s[3], v31.s[0] /* bs[26,27,25,-] */
ins v26.s[3], v31.s[0] /* bs[18,19,17,-] */

/*
 * Final accumulation chain. The order is significant: each addition reads
 * registers that earlier additions in this list have already updated.
 */
fadd v1.4s, v1.4s, v16.4s
fadd v3.4s, v3.4s, v17.4s
fadd v5.4s, v5.4s, v18.4s
fadd v7.4s, v7.4s, v19.4s
fadd v2.4s, v2.4s, v3.4s
fadd v3.4s, v3.4s, v24.4s
fadd v6.4s, v6.4s, v7.4s
fadd v7.4s, v7.4s, v25.4s
fadd v4.4s, v4.4s, v6.4s
fadd v6.4s, v6.4s, v5.4s
fadd v5.4s, v5.4s, v7.4s
fadd v7.4s, v7.4s, v26.4s

/* Float to signed 32-bit integer, rounding to nearest with ties to even. */
fcvtns v0.4s, v0.4s
fcvtns v1.4s, v1.4s
fcvtns v2.4s, v2.4s
fcvtns v3.4s, v3.4s
fcvtns v4.4s, v4.4s
fcvtns v5.4s, v5.4s
fcvtns v6.4s, v6.4s
fcvtns v7.4s, v7.4s
/* Narrow to signed 16-bit with saturation; results land in halfword lanes 0..3. */
sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
sqxtn v2.4h, v2.4s
sqxtn v3.4h, v3.4s
sqxtn v4.4h, v4.4s
sqxtn v5.4h, v5.4s
sqxtn v6.4h, v6.4s
sqxtn v7.4h, v7.4s

/* x3 = output stride in bytes (16 halfwords between consecutive stores). */
mov x3, #32
/*
 * First output: 17 single-halfword stores walking x0 upward. Lane 1 of v0 is
 * written first and lane 0 of v0 last (no post-increment on the final store).
 */
st1 {v0.h}[1], [x0], x3
st1 {v7.h}[2], [x0], x3
st1 {v3.h}[2], [x0], x3
st1 {v5.h}[2], [x0], x3
st1 {v1.h}[2], [x0], x3
st1 {v6.h}[2], [x0], x3
st1 {v2.h}[2], [x0], x3
st1 {v4.h}[2], [x0], x3
st1 {v0.h}[2], [x0], x3
st1 {v7.h}[0], [x0], x3
st1 {v3.h}[0], [x0], x3
st1 {v5.h}[0], [x0], x3
st1 {v1.h}[0], [x0], x3
st1 {v6.h}[0], [x0], x3
st1 {v2.h}[0], [x0], x3
st1 {v4.h}[0], [x0], x3
st1 {v0.h}[0], [x0]
/*
 * Second output: 16 single-halfword stores walking x1 upward. Its first
 * value is the same lane (v0 lane 1) that was written first through x0.
 */
st1 {v0.h}[1], [x1], x3
st1 {v4.h}[1], [x1], x3
st1 {v2.h}[1], [x1], x3
st1 {v6.h}[1], [x1], x3
st1 {v1.h}[1], [x1], x3
st1 {v5.h}[1], [x1], x3
st1 {v3.h}[1], [x1], x3
st1 {v7.h}[1], [x1], x3
st1 {v0.h}[3], [x1], x3
st1 {v4.h}[3], [x1], x3
st1 {v2.h}[3], [x1], x3
st1 {v6.h}[3], [x1], x3
st1 {v1.h}[3], [x1], x3
st1 {v5.h}[3], [x1], x3
st1 {v3.h}[3], [x1], x3
st1 {v7.h}[3], [x1]
ret
/* NOTE(review): macro defined outside this file; presumably marks the stack as non-executable for the linker. */
NONEXEC_STACK