/*
	dct36_avx: AVX optimized dct36 for x86-64

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

#ifdef IS_MSABI
#define in %rcx
#define out1 %rdx
#define out2 %r8
#define w %r9
#define ts %r10
#define COS9_ %rax
#define tfcos36_ %r11
#else
#define in %rdi
#define out1 %rsi
#define out2 %rdx
#define w %rcx
#define ts %r8
#define COS9_ %rax
#define tfcos36_ %r9
#endif

/*
	void dct36_avx(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf);
*/
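
/*
	Argument roles (mirroring the portable dct36(), as far as the access
	pattern below shows): inbuf = 18 input samples, o1/o2 = overlap buffers,
	wintab = window coefficients, tsbuf = output time samples written at a
	stride of 32 (SBLIMIT) values.
*/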

#ifndef __APPLE__
.section .rodata
#else
.data
#endif
ALIGN16
dct36_avx_COS9:
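/* Pairs of COS9[i] = cos(i*pi/18), stored in the order 3,3,6,6 / 1,1,2,2 /
   5,5,8,8 / 7,7,4,4 to match the vector layout noted in the code below. */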
.long 0x3f5db3d7
.long 0x3f5db3d7
.long 0x3f000000
.long 0x3f000000
.long 0x3f7c1c5c
.long 0x3f7c1c5c
.long 0x3f708fb2
.long 0x3f708fb2
.long 0x3f248dbb
.long 0x3f248dbb
.long 0x3e31d0d4
.long 0x3e31d0d4
.long 0x3eaf1d44
.long 0x3eaf1d44
.long 0x3f441b7d
.long 0x3f441b7d
ALIGN16
dct36_avx_tfcos36:
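/* tfcos36[i] = 0.5/cos((2*i+1)*pi/36), laid out as i = 0..3, 8..5, 4. */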
.long 0x3f007d2b
.long 0x3f0483ee
.long 0x3f0d3b7d
.long 0x3f1c4257
.long 0x40b79454
.long 0x3ff746ea
.long 0x3f976fd9
.long 0x3f5f2944
.long 0x3f3504f3
ALIGN16
dct36_avx_sign:
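/* Sign-bit mask; xorps with this negates a vector of packed floats. */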
.long 0x80000000,0x80000000,0x80000000,0x80000000
.text
ALIGN16
.globl ASM_NAME(INT123_dct36_avx)
ASM_NAME(INT123_dct36_avx):
#ifdef IS_MSABI
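	/* Windows x64 ABI: xmm6-xmm15 are callee-saved, so spill them to the
	   stack; the fifth argument (tsbuf) arrives on the stack, at 48(%rbp)
	   once the frame is set up. */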
	push %rbp
	mov %rsp, %rbp
	sub $160, %rsp
	movaps %xmm6, (%rsp)
	movaps %xmm7, 16(%rsp)
	movaps %xmm8, 32(%rsp)
	movaps %xmm9, 48(%rsp)
	movaps %xmm10, 64(%rsp)
	movaps %xmm11, 80(%rsp)
	movaps %xmm12, 96(%rsp)
	movaps %xmm13, 112(%rsp)
	movaps %xmm14, 128(%rsp)
	movaps %xmm15, 144(%rsp)
	movq 48(%rbp), ts
#endif
	lea dct36_avx_COS9(%rip), COS9_
	lea dct36_avx_tfcos36(%rip), tfcos36_
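
	/* Load the 18 input samples. The shuffle/add sequence below appears to
	   perform the two prefix-sum passes of the scalar dct36() (in[i] += in[i-1]
	   over all i, then in[i] += in[i-2] over odd i), leaving the results laid
	   out as documented in the register map further down. */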

	xorps %xmm4, %xmm4
	movups (in), %xmm0
	movups 16(in), %xmm1
	movups 32(in), %xmm2
	movups 48(in), %xmm3
	movlps 64(in), %xmm4
	vshufps $0x93, %xmm0, %xmm0, %xmm5
	vshufps $0x93, %xmm1, %xmm1, %xmm6
	vshufps $0x93, %xmm2, %xmm2, %xmm7
	vshufps $0x93, %xmm3, %xmm3, %xmm8
	vshufps $0xe1, %xmm4, %xmm4, %xmm9
	movss %xmm8, %xmm9 #[fg--]
	addps %xmm9, %xmm4 #[gh--]
	movss %xmm7, %xmm8
	addps %xmm8, %xmm3 #[cdef]
	movss %xmm6, %xmm7
	addps %xmm7, %xmm2 #[89ab]
	movss %xmm5, %xmm6
	addps %xmm6, %xmm1 #[4567]
	xorps %xmm6, %xmm6
	movss %xmm6, %xmm5
	addps %xmm5, %xmm0 #[0123]

	vblendps $0x5, %xmm6, %xmm3, %xmm7
	vshufps $0x4e, %xmm4, %xmm3, %xmm4
	addps %xmm7, %xmm4
	vblendps $0x5, %xmm6, %xmm2, %xmm7
	vshufps $0x4e, %xmm3, %xmm2, %xmm3
	addps %xmm7, %xmm3
	vblendps $0x5, %xmm6, %xmm1, %xmm7
	vshufps $0x4e, %xmm2, %xmm1, %xmm2
	addps %xmm7, %xmm2
	vblendps $0x5, %xmm6, %xmm0, %xmm7
	vshufps $0x4e, %xmm1, %xmm0, %xmm1
	addps %xmm7, %xmm1
	vmovlhps %xmm0, %xmm6, %xmm0

	/*
	xmm0 in[-,-,0,1]
	xmm1 in[2,3,4,5]
	xmm2 in[6,7,8,9]
	xmm3 in[10,11,12,13]
	xmm4 in[14,15,16,17]
	*/

	vblendps $0xc, %xmm3, %xmm2, %xmm5
	blendps $0xc, %xmm4, %xmm3
	blendps $0xc, %xmm2, %xmm4
	movaps %xmm5, %xmm2

	/*
	xmm2 in[6,7,12,13]
	xmm3 in[10,11,16,17]
	xmm4 in[14,15,8,9]
	*/
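
	/* This mirrors the 9-point DCT core of the scalar dct36(): products with
	   the paired COS9 constants are accumulated into the tmp1a/tmp2a/tmp1b/tmp2b
	   vectors named in the register comments below. */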

	movaps (COS9_), %xmm15
	movaps 16(COS9_), %xmm6
	movaps 32(COS9_), %xmm7
	movaps 48(COS9_), %xmm8
	vmulps %xmm2, %xmm15, %xmm5
	addps %xmm0, %xmm5

	/*
	xmm5 [ta33,tb33,ta66,tb66]
	xmm6 COS9_[1,1,2,2]
	xmm7 COS9_[5,5,8,8]
	xmm8 COS9_[7,7,4,4]
	xmm15 COS9_[3,3,6,6]
	*/

	vmulps %xmm1, %xmm6, %xmm9
	vmulps %xmm3, %xmm7, %xmm12
	vmulps %xmm4, %xmm8, %xmm13
	addps %xmm5, %xmm9
	addps %xmm13, %xmm12
	addps %xmm9, %xmm12

	vsubps %xmm3, %xmm1, %xmm13
	vshufps $0xe0, %xmm2, %xmm0, %xmm14
	vsubps %xmm14, %xmm0, %xmm14
	subps %xmm4, %xmm13
	mulps %xmm15, %xmm13
	addps %xmm14, %xmm13

	vmulps %xmm1, %xmm7, %xmm9
	vmulps %xmm3, %xmm8, %xmm15
	vmulps %xmm4, %xmm6, %xmm14
	subps %xmm5, %xmm9
	subps %xmm15, %xmm14
	addps %xmm9, %xmm14

	mulps %xmm1, %xmm8
	mulps %xmm3, %xmm6
	mulps %xmm4, %xmm7
	subps %xmm5, %xmm8
	subps %xmm7, %xmm6
	vaddps %xmm6, %xmm8, %xmm15

	movss 32(tfcos36_), %xmm5
	subps %xmm1, %xmm0
	subps %xmm2, %xmm4
	addps %xmm3, %xmm0
	addps %xmm4, %xmm0
	shufps $0xaf, %xmm0, %xmm0
	vmulss %xmm5, %xmm0, %xmm11

	/*
	xmm12 [1a-0,1b-0, 2a-0, 2b-0]
	xmm13 [1a-1,1b-1, 2a-1, 2b-1]
	xmm14 [1a-2,1b-2,-2a-2,-2b-2]
	xmm15 [1a-3,1b-3,-2a-3,-2b-3]
	*/
	vunpckhps %xmm13, %xmm12, %xmm5
	vunpcklps %xmm13, %xmm12, %xmm12
	vunpckhps %xmm15, %xmm14, %xmm6
	vunpcklps %xmm15, %xmm14, %xmm14
	xorps dct36_avx_sign(%rip), %xmm6

	/*
	xmm12 [1a-0,1a-1,1b-0,1b-1]
	xmm5 [2a-0,2a-1,2b-0,2b-1]
	xmm14 [1a-2,1a-3,1b-2,1b-3]
	xmm6 [2a-2,2a-3,2b-2,2b-3]
	*/

	vmovlhps %xmm14, %xmm12, %xmm0
	movhlps %xmm12, %xmm14
	vmovlhps %xmm6, %xmm5, %xmm1
	vmovhlps %xmm5, %xmm6, %xmm15

	/*
	xmm0 tmp1a
	xmm1 tmp2a
	xmm14 tmp1b
	xmm15 tmp2b
	*/

	movaps (tfcos36_), %xmm6
	movaps 16(tfcos36_), %xmm7
	vsubps %xmm14, %xmm15, %xmm10
	addps %xmm14, %xmm15
	vsubps %xmm0, %xmm1, %xmm14
	addps %xmm1, %xmm0
	vmulps %xmm6, %xmm15, %xmm1
	mulps %xmm10, %xmm7

	/*
	%xmm0 tmp[0,1,2,3]
	%xmm1 tmp[17,16,15,14]
	%xmm14 tmp[8,7,6,5]
	%xmm7 tmp[9,10,11,12]
	%xmm11 tmp[13,-,4,-]
	*/
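
	/* Output/windowing stage. Judging by the tmp[] layout above and the
	   w/out1/out2/ts accesses below, this follows the scalar dct36(): windowed
	   sums tmp[v]+tmp[17-v] go to out2 (overlap for the next block), while
	   windowed differences are added to out1 and stored to ts with a
	   32-sample (SBLIMIT) stride. */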

	movups 108(w), %xmm2
	movups 92(w), %xmm3
	shufps $0x1b, %xmm3, %xmm3
	movups 36(w), %xmm4
	movups 20(w), %xmm5
	shufps $0x1b, %xmm5, %xmm5
	vsubps %xmm1, %xmm0, %xmm6
	addps %xmm1, %xmm0
	mulps %xmm0, %xmm2
	mulps %xmm3, %xmm0
	mulps %xmm6, %xmm4
	mulps %xmm5, %xmm6
	movups 36(out1), %xmm1
	movups 20(out1), %xmm3
	shufps $0x1b, %xmm6, %xmm6
	addps %xmm4, %xmm1
	addps %xmm6, %xmm3
	shufps $0x1b, %xmm0, %xmm0
	movups %xmm2, 36(out2)
	movups %xmm0, 20(out2)
	movss %xmm1, 32*36(ts)
	movss %xmm3, 32*20(ts)
	movhlps %xmm1, %xmm2
	movhlps %xmm3, %xmm4
	movss %xmm2, 32*44(ts)
	movss %xmm4, 32*28(ts)
	shufps $0xb1, %xmm1, %xmm1
	shufps $0xb1, %xmm3, %xmm3
	movss %xmm1, 32*40(ts)
	movss %xmm3, 32*24(ts)
	movhlps %xmm1, %xmm2
	movhlps %xmm3, %xmm4
	movss %xmm2, 32*48(ts)
	movss %xmm4, 32*32(ts)
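
	/* The remaining middle pair, tmp[4] and tmp[13] (held in xmm11 per the
	   layout above), is handled with scalar operations. */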

	movhlps %xmm11, %xmm0
	movss 124(w), %xmm2
	movss 88(w), %xmm3
	movss 52(w), %xmm4
	movss 16(w), %xmm5
	movss %xmm0, %xmm6
	addss %xmm11, %xmm0
	subss %xmm11, %xmm6
	mulss %xmm0, %xmm2
	mulss %xmm3, %xmm0
	mulss %xmm6, %xmm4
	mulss %xmm5, %xmm6
	addss 52(out1), %xmm4
	addss 16(out1), %xmm6
	movss %xmm2, 52(out2)
	movss %xmm0, 16(out2)
	movss %xmm4, 32*52(ts)
	movss %xmm6, 32*16(ts)

	movaps %xmm14, %xmm0
	movaps %xmm7, %xmm1
	MOVUAPS 128(w), %xmm2
	movups 72(w), %xmm3
	shufps $0x1b, %xmm2, %xmm2
	movlps 56(w), %xmm4
	movhps 64(w), %xmm4
	MOVUAPS (w), %xmm5
	shufps $0x1b, %xmm4, %xmm4
	vsubps %xmm1, %xmm0, %xmm6
	addps %xmm1, %xmm0
	mulps %xmm0, %xmm2
	mulps %xmm3, %xmm0
	mulps %xmm6, %xmm4
	mulps %xmm5, %xmm6
	movlps 56(out1), %xmm1
	movhps 64(out1), %xmm1
	movups (out1), %xmm3
	shufps $0x1b, %xmm4, %xmm4
	addps %xmm6, %xmm3
	addps %xmm4, %xmm1
	shufps $0x1b, %xmm2, %xmm2
	movups %xmm0, (out2)
	movlps %xmm2, 56(out2)
	movhps %xmm2, 64(out2)
	movss %xmm1, 32*56(ts)
	movss %xmm3, (ts)
	movhlps %xmm1, %xmm2
	movhlps %xmm3, %xmm4
	movss %xmm2, 32*64(ts)
	movss %xmm4, 32*8(ts)
	shufps $0xb1, %xmm1, %xmm1
	shufps $0xb1, %xmm3, %xmm3
	movss %xmm1, 32*60(ts)
	movss %xmm3, 32*4(ts)
	movhlps %xmm1, %xmm2
	movhlps %xmm3, %xmm4
	movss %xmm2, 32*68(ts)
	movss %xmm4, 32*12(ts)

#ifdef IS_MSABI
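	/* Windows x64 ABI only: restore the callee-saved XMM registers spilled in
	   the prologue and tear down the stack frame. */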
	movaps (%rsp), %xmm6
	movaps 16(%rsp), %xmm7
	movaps 32(%rsp), %xmm8
	movaps 48(%rsp), %xmm9
	movaps 64(%rsp), %xmm10
	movaps 80(%rsp), %xmm11
	movaps 96(%rsp), %xmm12
	movaps 112(%rsp), %xmm13
	movaps 128(%rsp), %xmm14
	movaps 144(%rsp), %xmm15
	mov %rbp, %rsp
	pop %rbp
#endif
	ret

NONEXEC_STACK