looper/subprojects/mpg123/src/libmpg123/dct36_avx.S

/*
dct36_avx: AVX optimized dct36 for x86-64
copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
see COPYING and AUTHORS files in distribution or http://mpg123.org
initially written by Taihei Monma
*/
#include "mangle.h"
#ifdef IS_MSABI
#define in %rcx
#define out1 %rdx
#define out2 %r8
#define w %r9
#define ts %r10
#define COS9_ %rax
#define tfcos36_ %r11
#else
#define in %rdi
#define out1 %rsi
#define out2 %rdx
#define w %rcx
#define ts %r8
#define COS9_ %rax
#define tfcos36_ %r9
#endif
/*
void dct36_avx(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf);
*/
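/*
36-point IMDCT for layer III long blocks: two 9-point DCTs over prefix sums of
the 18 inputs (even- and odd-indexed streams computed side by side in the SIMD
lanes), combined via the tfcos36 twiddles and windowed with wintab.  The
windowed sums go to o2 (overlap for the following granule); the windowed
differences are added to o1 and scattered into tsbuf with a stride of
SBLIMIT (32) floats.  See dct36.c for the reference C implementation.
*/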
#ifndef __APPLE__
.section .rodata
#else
.data
#endif
ALIGN16
dct36_avx_COS9:
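# COS9[k] = cos(k*pi/18), each coefficient stored twice so one vector feeds
# both 9-point DCTs; pair order is [3,3,6,6], [1,1,2,2], [5,5,8,8], [7,7,4,4]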
.long 0x3f5db3d7
.long 0x3f5db3d7
.long 0x3f000000
.long 0x3f000000
.long 0x3f7c1c5c
.long 0x3f7c1c5c
.long 0x3f708fb2
.long 0x3f708fb2
.long 0x3f248dbb
.long 0x3f248dbb
.long 0x3e31d0d4
.long 0x3e31d0d4
.long 0x3eaf1d44
.long 0x3eaf1d44
.long 0x3f441b7d
.long 0x3f441b7d
ALIGN16
dct36_avx_tfcos36:
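# tfcos36[i] = 0.5/cos((2*i+1)*pi/36), stored as [0,1,2,3], [8,7,6,5] plus a
# trailing scalar [4], matching the order the twiddle stage consumes them in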
.long 0x3f007d2b
.long 0x3f0483ee
.long 0x3f0d3b7d
.long 0x3f1c4257
.long 0x40b79454
.long 0x3ff746ea
.long 0x3f976fd9
.long 0x3f5f2944
.long 0x3f3504f3
ALIGN16
dct36_avx_sign:
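# sign-bit mask: xorps with this negates all four lanes (flips the -2a/-2b
# partial sums back to 2a/2b before the final interleave)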
.long 0x80000000,0x80000000,0x80000000,0x80000000
.text
ALIGN16
.globl ASM_NAME(INT123_dct36_avx)
ASM_NAME(INT123_dct36_avx):
#ifdef IS_MSABI
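/*
Win64: xmm6-xmm15 are callee-saved, so spill them (10 * 16 = 160 bytes);
the fifth argument (tsbuf) is passed on the stack and, once the frame
pointer is set up, sits at 48(%rbp), past the return address and the
32-byte shadow space.
*/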
push %rbp
mov %rsp, %rbp
sub $160, %rsp
movaps %xmm6, (%rsp)
movaps %xmm7, 16(%rsp)
movaps %xmm8, 32(%rsp)
movaps %xmm9, 48(%rsp)
movaps %xmm10, 64(%rsp)
movaps %xmm11, 80(%rsp)
movaps %xmm12, 96(%rsp)
movaps %xmm13, 112(%rsp)
movaps %xmm14, 128(%rsp)
movaps %xmm15, 144(%rsp)
movq 48(%rbp), ts
#endif
lea dct36_avx_COS9(%rip), COS9_
lea dct36_avx_tfcos36(%rip), tfcos36_
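# load in[0..17]: four unaligned 16-byte loads plus an 8-byte load of
# in[16..17] into the low half of xmm4 (upper half cleared)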
xorps %xmm4, %xmm4
movups (in), %xmm0
movups 16(in), %xmm1
movups 32(in), %xmm2
movups 48(in), %xmm3
movlps 64(in), %xmm4
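# first pass of the prefix sums: in[i] += in[i-1] for i = 17..1 (in[0] stays).
# Each vector is rotated by one lane and its low lane replaced with the last
# element of the preceding vector before the add.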
vshufps $0x93, %xmm0, %xmm0, %xmm5
vshufps $0x93, %xmm1, %xmm1, %xmm6
vshufps $0x93, %xmm2, %xmm2, %xmm7
vshufps $0x93, %xmm3, %xmm3, %xmm8
vshufps $0xe1, %xmm4, %xmm4, %xmm9
movss %xmm8, %xmm9 #[fg--]
addps %xmm9, %xmm4 #[gh--]
movss %xmm7, %xmm8
addps %xmm8, %xmm3 #[cdef]
movss %xmm6, %xmm7
addps %xmm7, %xmm2 #[89ab]
movss %xmm5, %xmm6
addps %xmm6, %xmm1 #[4567]
xorps %xmm6, %xmm6
movss %xmm6, %xmm5
addps %xmm5, %xmm0 #[0123]
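# second pass: in[i] += in[i-2] for odd i only.  shufps gathers each destination
# group across register boundaries; blendps zeroes the even lanes of the in[i-2]
# addend so even elements are left unchanged.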
vblendps $0x5, %xmm6, %xmm3, %xmm7
vshufps $0x4e, %xmm4, %xmm3, %xmm4
addps %xmm7, %xmm4
vblendps $0x5, %xmm6, %xmm2, %xmm7
vshufps $0x4e, %xmm3, %xmm2, %xmm3
addps %xmm7, %xmm3
vblendps $0x5, %xmm6, %xmm1, %xmm7
vshufps $0x4e, %xmm2, %xmm1, %xmm2
addps %xmm7, %xmm2
vblendps $0x5, %xmm6, %xmm0, %xmm7
vshufps $0x4e, %xmm1, %xmm0, %xmm1
addps %xmm7, %xmm1
vmovlhps %xmm0, %xmm6, %xmm0
/*
xmm0 in[-,-,0,1]
xmm1 in[2,3,4,5]
xmm2 in[6,7,8,9]
xmm3 in[10,11,12,13]
xmm4 in[14,15,16,17]
*/
vblendps $0xc, %xmm3, %xmm2, %xmm5
blendps $0xc, %xmm4, %xmm3
blendps $0xc, %xmm2, %xmm4
movaps %xmm5, %xmm2
/*
xmm2 in[6,7,12,13]
xmm3 in[10,11,16,17]
xmm4 in[14,15,8,9]
*/
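# two 9-point DCTs run in parallel: the even-indexed prefix sums ("a" stream,
# lanes 0/2) and the odd-indexed ones ("b" stream, lanes 1/3) are multiplied by
# the paired COS9 coefficients and accumulated into four partial-result vectors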
movaps (COS9_), %xmm15
movaps 16(COS9_), %xmm6
movaps 32(COS9_), %xmm7
movaps 48(COS9_), %xmm8
vmulps %xmm2, %xmm15, %xmm5
addps %xmm0, %xmm5
/*
xmm5 [ta33,tb33,ta66,tb66]
xmm6 COS9_[1,1,2,2]
xmm7 COS9_[5,5,8,8]
xmm8 COS9_[7,7,4,4]
xmm15 COS9_[3,3,6,6]
*/
vmulps %xmm1, %xmm6, %xmm9
vmulps %xmm3, %xmm7, %xmm12
vmulps %xmm4, %xmm8, %xmm13
addps %xmm5, %xmm9
addps %xmm13, %xmm12
addps %xmm9, %xmm12
vsubps %xmm3, %xmm1, %xmm13
vshufps $0xe0, %xmm2, %xmm0, %xmm14
vsubps %xmm14, %xmm0, %xmm14
subps %xmm4, %xmm13
mulps %xmm15, %xmm13
addps %xmm14, %xmm13
vmulps %xmm1, %xmm7, %xmm9
vmulps %xmm3, %xmm8, %xmm15
vmulps %xmm4, %xmm6, %xmm14
subps %xmm5, %xmm9
subps %xmm15, %xmm14
addps %xmm9, %xmm14
mulps %xmm1, %xmm8
mulps %xmm3, %xmm6
mulps %xmm4, %xmm7
subps %xmm5, %xmm8
subps %xmm7, %xmm6
vaddps %xmm6, %xmm8, %xmm15
movss 32(tfcos36_), %xmm5
subps %xmm1, %xmm0
subps %xmm2, %xmm4
addps %xmm3, %xmm0
addps %xmm4, %xmm0
shufps $0xaf, %xmm0, %xmm0
vmulss %xmm5, %xmm0, %xmm11
/*
xmm12 [1a-0,1b-0, 2a-0, 2b-0]
xmm13 [1a-1,1b-1, 2a-1, 2b-1]
xmm14 [1a-2,1b-2,-2a-2,-2b-2]
xmm15 [1a-3,1b-3,-2a-3,-2b-3]
*/
vunpckhps %xmm13, %xmm12, %xmm5
vunpcklps %xmm13, %xmm12, %xmm12
vunpckhps %xmm15, %xmm14, %xmm6
vunpcklps %xmm15, %xmm14, %xmm14
xorps dct36_avx_sign(%rip), %xmm6
/*
xmm12 [1a-0,1a-1,1b-0,1b-1]
xmm5 [2a-0,2a-1,2b-0,2b-1]
xmm14 [1a-2,1a-3,1b-2,1b-3]
xmm6 [2a-2,2a-3,2b-2,2b-3]
*/
vmovlhps %xmm14, %xmm12, %xmm0
movhlps %xmm12, %xmm14
vmovlhps %xmm6, %xmm5, %xmm1
vmovhlps %xmm5, %xmm6, %xmm15
/*
xmm0 tmp1a
xmm1 tmp2a
xmm14 tmp1b
xmm15 tmp2b
*/
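# butterfly: the "b" sums/differences are scaled by the tfcos36 twiddles to give
# tmp[17..14] and tmp[9..12]; the "a" sums/differences give tmp[0..3] and
# tmp[8..5] directly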
movaps (tfcos36_), %xmm6
movaps 16(tfcos36_), %xmm7
vsubps %xmm14, %xmm15, %xmm10
addps %xmm14, %xmm15
vsubps %xmm0, %xmm1, %xmm14
addps %xmm1, %xmm0
vmulps %xmm6, %xmm15, %xmm1
mulps %xmm10, %xmm7
/*
%xmm0 tmp[0,1,2,3]
%xmm1 tmp[17,16,15,14]
%xmm14 tmp[8,7,6,5]
%xmm7 tmp[9,10,11,12]
%xmm11 tmp[13,-,4,-]
*/
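# windowing for tmp[0..3]/tmp[17..14]: the sums scaled by w go to out2[9..12]
# and out2[5..8]; the differences scaled by w are added to out1 and scattered
# into ts (a displacement of 32*4k bytes addresses ts[SBLIMIT*k])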
movups 108(w), %xmm2
movups 92(w), %xmm3
shufps $0x1b, %xmm3, %xmm3
movups 36(w), %xmm4
movups 20(w), %xmm5
shufps $0x1b, %xmm5, %xmm5
vsubps %xmm1, %xmm0, %xmm6
addps %xmm1, %xmm0
mulps %xmm0, %xmm2
mulps %xmm3, %xmm0
mulps %xmm6, %xmm4
mulps %xmm5, %xmm6
movups 36(out1), %xmm1
movups 20(out1), %xmm3
shufps $0x1b, %xmm6, %xmm6
addps %xmm4, %xmm1
addps %xmm6, %xmm3
shufps $0x1b, %xmm0, %xmm0
movups %xmm2, 36(out2)
movups %xmm0, 20(out2)
movss %xmm1, 32*36(ts)
movss %xmm3, 32*20(ts)
movhlps %xmm1, %xmm2
movhlps %xmm3, %xmm4
movss %xmm2, 32*44(ts)
movss %xmm4, 32*28(ts)
shufps $0xb1, %xmm1, %xmm1
shufps $0xb1, %xmm3, %xmm3
movss %xmm1, 32*40(ts)
movss %xmm3, 32*24(ts)
movhlps %xmm1, %xmm2
movhlps %xmm3, %xmm4
movss %xmm2, 32*48(ts)
movss %xmm4, 32*32(ts)
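# the centre pair tmp[4]/tmp[13] (kept in xmm11) goes through the same
# sum/difference windowing with scalar ops, producing out2[13], out2[4],
# ts[SBLIMIT*13] and ts[SBLIMIT*4]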
movhlps %xmm11, %xmm0
movss 124(w), %xmm2
movss 88(w), %xmm3
movss 52(w), %xmm4
movss 16(w), %xmm5
movss %xmm0, %xmm6
addss %xmm11, %xmm0
subss %xmm11, %xmm6
mulss %xmm0, %xmm2
mulss %xmm3, %xmm0
mulss %xmm6, %xmm4
mulss %xmm5, %xmm6
addss 52(out1), %xmm4
addss 16(out1), %xmm6
movss %xmm2, 52(out2)
movss %xmm0, 16(out2)
movss %xmm4, 32*52(ts)
movss %xmm6, 32*16(ts)
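# same windowing for tmp[8..5]/tmp[9..12]: out2[14..17] and out2[0..3], plus
# ts[SBLIMIT*0..3] and ts[SBLIMIT*14..17]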
movaps %xmm14, %xmm0
movaps %xmm7, %xmm1
MOVUAPS 128(w), %xmm2
movups 72(w), %xmm3
shufps $0x1b, %xmm2, %xmm2
movlps 56(w), %xmm4
movhps 64(w), %xmm4
MOVUAPS (w), %xmm5
shufps $0x1b, %xmm4, %xmm4
vsubps %xmm1, %xmm0, %xmm6
addps %xmm1, %xmm0
mulps %xmm0, %xmm2
mulps %xmm3, %xmm0
mulps %xmm6, %xmm4
mulps %xmm5, %xmm6
movlps 56(out1), %xmm1
movhps 64(out1), %xmm1
movups (out1), %xmm3
shufps $0x1b, %xmm4, %xmm4
addps %xmm6, %xmm3
addps %xmm4, %xmm1
shufps $0x1b, %xmm2, %xmm2
movups %xmm0, (out2)
movlps %xmm2, 56(out2)
movhps %xmm2, 64(out2)
movss %xmm1, 32*56(ts)
movss %xmm3, (ts)
movhlps %xmm1, %xmm2
movhlps %xmm3, %xmm4
movss %xmm2, 32*64(ts)
movss %xmm4, 32*8(ts)
shufps $0xb1, %xmm1, %xmm1
shufps $0xb1, %xmm3, %xmm3
movss %xmm1, 32*60(ts)
movss %xmm3, 32*4(ts)
movhlps %xmm1, %xmm2
movhlps %xmm3, %xmm4
movss %xmm2, 32*68(ts)
movss %xmm4, 32*12(ts)
#ifdef IS_MSABI
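# Win64 epilogue: restore the callee-saved xmm registers and tear down the frame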
movaps (%rsp), %xmm6
movaps 16(%rsp), %xmm7
movaps 32(%rsp), %xmm8
movaps 48(%rsp), %xmm9
movaps 64(%rsp), %xmm10
movaps 80(%rsp), %xmm11
movaps 96(%rsp), %xmm12
movaps 112(%rsp), %xmm13
movaps 128(%rsp), %xmm14
movaps 144(%rsp), %xmm15
mov %rbp, %rsp
pop %rbp
#endif
ret
NONEXEC_STACK