/*
	dct64_sse: MMX/SSE optimized dct64

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Operand helpers (AT&T syntax, 32-bit code).

	ARG(n)       n-th 4-byte stack argument, addressed from the frame pointer:
	             8(%ebp) is argument 0 once the caller's %ebp has been pushed
	             and %ebp has been loaded from %esp.
	TEMP(n)      n-th 16-byte slot of the scratch area addressed from %esp.
	TEMP_BYTE(n) the same scratch area addressed by byte offset, so
	             TEMP_BYTE(16*k) names the same location as TEMP(k).

	The constant 4 in TEMP/TEMP_BYTE skips the 4 bytes that the routine in
	this file pushes (%ebx) below its scratch area.
*/
#define ARG(n) (8+n*4)(%ebp)
#define TEMP(n) (4+n*16)(%esp)
#define TEMP_BYTE(n) (4+n)(%esp)

/*
	void INT123_dct64_sse(short *out0, short *out1, real *samples);
*/

/*
	Constant vectors used as SSE memory operands by the routine below.
	Each one is preceded by ALIGN16 (macro from mangle.h; expected to give
	16-byte alignment, which packed SSE memory operands require).
*/
#if !defined (__APPLE__) && !defined (__OS2__)
	.section	.rodata
#else
	.data
#endif
	ALIGN16
/*
	pnpn: lanes 1 and 3 hold 0x80000000 (-2147483648), the sign bit of a
	single-precision float; lanes 0 and 2 are zero. XORing a vector with it
	negates lanes 1 and 3 and leaves lanes 0 and 2 untouched.
*/
pnpn:
	.long	0
	.long	-2147483648
	.long	0
	.long	-2147483648
	ALIGN16
/*
	mask: all bits set in lanes 0..2, zero in lane 3. ANDing a vector with
	it clears lane 3 and keeps the other three lanes.
*/
mask:
	.long	-1
	.long	-1
	.long	-1
	.long	0

/*
	INT123_dct64_sse -- MMX/SSE dct64 producing 16-bit output.

	C prototype (arguments on the stack, read through ARG()):
		void INT123_dct64_sse(short *out0, short *out1, real *samples);

	In:
		ARG(2) samples  32 single-precision floats are read (byte offsets
		                0..127); loads go through the MOVUAPS macro from
		                mangle.h.
		ARG(0) out0     receives 17 16-bit words at byte offsets
		                0, 32, ..., 512.
		ARG(1) out1     receives 16 16-bit words at byte offsets
		                0, 32, ..., 480.
	External data:
		INT123_costab_mmxsse  coefficient table, read at byte offsets
		                      0..127; pnpn and mask from this file.
	Clobbers:
		%eax, %ecx, %edx, flags, %xmm0-%xmm7, %mm0-%mm5.
		%ebx and %ebp are saved and restored.
	Stack:
		%esp is aligned down to 16 bytes and 128 bytes (8 slots of 16 bytes,
		TEMP(0)..TEMP(7)) are reserved for intermediate vectors.
	Notes:
		Float to integer conversion uses cvtps2pi (rounding as selected in
		MXCSR) followed by packssdw (signed saturation to 16 bits).
		NOTE(review): MMX registers are written and no emms is executed in
		this routine -- presumably the caller clears MMX state; confirm.
*/
	.text
	ALIGN16
.globl ASM_NAME(INT123_dct64_sse)
ASM_NAME(INT123_dct64_sse):
	pushl		%ebp
	movl		%esp, %ebp

	andl		$-16, %esp /* align the stack at 16 bytes */
	subl		$128, %esp /* reserve 8 x 16 bytes of temporary storage */
	pushl		%ebx
	/* %esp is now 12 mod 16, so TEMP(n) = 4+16n(%esp) is 16-byte aligned
	   as the movaps accesses below require. */

	/* Macro from mangle.h; presumably loads the PIC base into %ebx for
	   LOCAL_VAR/GLOBAL_VAR (hence the %ebx save above) -- confirm there. */
	GET_GOT

	movl		ARG(2), %eax /* %eax = samples */

	/*
		Pass 1: pair element i with element 31-i.
		shufps $0x1b reverses the four lanes of a register, so the upper
		half of the input is mirrored before the add/subtract.
	*/
	MOVUAPS		(%eax), %xmm7
	MOVUAPS		16(%eax), %xmm6
	MOVUAPS		112(%eax), %xmm0
	MOVUAPS		96(%eax), %xmm1
	shufps		$0x1b, %xmm0, %xmm0
	shufps		$0x1b, %xmm1, %xmm1
	movaps		%xmm7, %xmm4
	movaps		%xmm6, %xmm5
	addps		%xmm0, %xmm4
	addps		%xmm1, %xmm5
	subps		%xmm0, %xmm7
	subps		%xmm1, %xmm6
	movaps		%xmm4, TEMP(0) /* sums for i = 0..3 */
	movaps		%xmm5, TEMP(1) /* sums for i = 4..7 */

	MOVUAPS		32(%eax), %xmm2
	MOVUAPS		48(%eax), %xmm3
	MOVUAPS		80(%eax), %xmm0
	MOVUAPS		64(%eax), %xmm1
	shufps		$0x1b, %xmm0, %xmm0
	shufps		$0x1b, %xmm1, %xmm1
	movaps		%xmm2, %xmm5
	movaps		%xmm3, %xmm4
	addps		%xmm0, %xmm2 /* sums for i = 8..11 */
	addps		%xmm1, %xmm3 /* sums for i = 12..15 */
	subps		%xmm0, %xmm5
	subps		%xmm1, %xmm4

	/* %ecx = address of the coefficient table (pointer load when
	   PIC_GLOBAL_PTR is defined, direct address otherwise). */
#ifdef PIC_GLOBAL_PTR
	mov		GLOBAL_VAR_PTR(INT123_costab_mmxsse), %ecx
#else
	lea		GLOBAL_VAR(INT123_costab_mmxsse), %ecx
#endif
	/* Scale the four difference vectors by table bytes 0..63. */
	mulps		(%ecx), %xmm7
	mulps		16(%ecx), %xmm6
	mulps		32(%ecx), %xmm5
	mulps		48(%ecx), %xmm4

	/*
		Pass 2: same sum/difference pattern on the pass-1 results, again
		mirroring one operand with shufps $0x1b. Differences are scaled by
		table bytes 64..95.
	*/
	shufps		$0x1b, %xmm2, %xmm2
	shufps		$0x1b, %xmm3, %xmm3
	shufps		$0x1b, %xmm4, %xmm4
	shufps		$0x1b, %xmm5, %xmm5
	movaps		TEMP(0), %xmm0
	movaps		TEMP(1), %xmm1
	subps		%xmm3, %xmm0
	subps		%xmm2, %xmm1
	addps		TEMP(0), %xmm3
	addps		TEMP(1), %xmm2
	movaps		%xmm3, TEMP(0)
	movaps		%xmm2, TEMP(1)
	movaps		%xmm6, %xmm2
	movaps		%xmm7, %xmm3
	subps		%xmm5, %xmm6
	subps		%xmm4, %xmm7
	addps		%xmm3, %xmm4
	addps		%xmm2, %xmm5
	mulps		64(%ecx), %xmm0
	mulps		80(%ecx), %xmm1
	mulps		80(%ecx), %xmm6
	mulps		64(%ecx), %xmm7

	/*
		Pass 3: sum/difference between whole vectors (one of each pair
		mirrored). All four differences use table bytes 96..111.
	*/
	movaps		TEMP(0), %xmm2
	movaps		TEMP(1), %xmm3
	shufps		$0x1b, %xmm3, %xmm3
	shufps		$0x1b, %xmm5, %xmm5
	shufps		$0x1b, %xmm1, %xmm1
	shufps		$0x1b, %xmm6, %xmm6
	movaps		%xmm0, TEMP(1) /* keep a copy: %xmm0 is overwritten below */
	subps		%xmm3, %xmm2
	subps		%xmm1, %xmm0
	addps		TEMP(0), %xmm3
	addps		TEMP(1), %xmm1
	movaps		%xmm3, TEMP(0)
	movaps		%xmm1, TEMP(2)
	movaps		%xmm5, %xmm1
	movaps		%xmm4, %xmm5
	movaps		%xmm7, %xmm3
	subps		%xmm1, %xmm5
	subps		%xmm6, %xmm7
	addps		%xmm1, %xmm4
	addps		%xmm3, %xmm6
	mulps		96(%ecx), %xmm2
	mulps		96(%ecx), %xmm0
	mulps		96(%ecx), %xmm5
	mulps		96(%ecx), %xmm7
	movaps		%xmm2, TEMP(1)
	movaps		%xmm0, TEMP(3)

	/*
		Pass 4 (register half): sum/difference inside each vector.
		shufps $0x44 / $0xbb gather the low halves and the mirrored high
		halves of two registers; shufps $0x14 / $0xbe put the results back
		together afterwards. The multiplier is the low two floats at table
		byte 112, duplicated into the high half by movlhps and kept in
		TEMP(4) for the second half of this pass.
	*/
	movaps		%xmm4, %xmm2
	movaps		%xmm5, %xmm3
	shufps		$0x44, %xmm6, %xmm2
	shufps		$0xbb, %xmm7, %xmm5
	shufps		$0xbb, %xmm6, %xmm4
	shufps		$0x44, %xmm7, %xmm3
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	subps		%xmm4, %xmm2
	subps		%xmm5, %xmm3
	addps		%xmm6, %xmm4
	addps		%xmm7, %xmm5
	movaps		112(%ecx), %xmm0
	movlhps		%xmm0, %xmm0
	mulps		%xmm0, %xmm2
	mulps		%xmm0, %xmm3
	movaps		%xmm0, TEMP(4)
	movaps		%xmm4, %xmm6
	movaps		%xmm5, %xmm7
	shufps		$0x14, %xmm2, %xmm4
	shufps		$0xbe, %xmm2, %xmm6
	shufps		$0x14, %xmm3, %xmm5
	shufps		$0xbe, %xmm3, %xmm7
	movaps		%xmm5, TEMP(5)
	movaps		%xmm7, TEMP(7)

	/* Pass 4 (memory half): the same steps for the vectors parked in
	   TEMP(0)..TEMP(3). */
	movaps		TEMP(0), %xmm0
	movaps		TEMP(1), %xmm1
	movaps		%xmm0, %xmm2
	movaps		%xmm1, %xmm3
	shufps		$0x44, TEMP(2), %xmm2
	shufps		$0xbb, TEMP(3), %xmm1
	shufps		$0xbb, TEMP(2), %xmm0
	shufps		$0x44, TEMP(3), %xmm3
	movaps		%xmm2, %xmm5
	movaps		%xmm3, %xmm7
	subps		%xmm0, %xmm2
	subps		%xmm1, %xmm3
	addps		%xmm5, %xmm0
	addps		%xmm7, %xmm1
	mulps		TEMP(4), %xmm2
	mulps		TEMP(4), %xmm3
	movaps		%xmm0, %xmm5
	movaps		%xmm1, %xmm7
	shufps		$0x14, %xmm2, %xmm0
	shufps		$0xbe, %xmm2, %xmm5
	shufps		$0x14, %xmm3, %xmm1
	shufps		$0xbe, %xmm3, %xmm7

	movaps		%xmm0, TEMP(0)
	movaps		%xmm1, TEMP(1)
	movaps		%xmm5, TEMP(2)
	movaps		%xmm7, TEMP(3)

	/*
		Pass 5 multiplier: broadcast the float at table byte 120 to all
		lanes, then flip the sign of lanes 1 and 3 with pnpn.
	*/
	movss		120(%ecx), %xmm5
	shufps		$0x00, %xmm5, %xmm5
	xorps		LOCAL_VAR(pnpn), %xmm5

	/*
		Pass 5 (register half): unpcklps/unpckhps interleave the vectors so
		that the values to be paired end up in the same lane of two
		registers; after add/subtract/scale a second interleave merges the
		sums and the scaled differences again.
	*/
	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	unpcklps	TEMP(5), %xmm4
	unpckhps	TEMP(5), %xmm0
	unpcklps	TEMP(7), %xmm6
	unpckhps	TEMP(7), %xmm1
	movaps		%xmm4, %xmm2
	movaps		%xmm6, %xmm3
	unpcklps	%xmm0, %xmm4
	unpckhps	%xmm0, %xmm2
	unpcklps	%xmm1, %xmm6
	unpckhps	%xmm1, %xmm3
	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	subps		%xmm2, %xmm0
	subps		%xmm3, %xmm1
	addps		%xmm2, %xmm4
	addps		%xmm3, %xmm6
	mulps		%xmm5, %xmm0
	mulps		%xmm5, %xmm1
	movaps		%xmm5, TEMP(5) /* multiplier is reused from memory below */
	movaps		%xmm4, %xmm5
	movaps		%xmm6, %xmm7
	unpcklps	%xmm0, %xmm4
	unpckhps	%xmm0, %xmm5
	unpcklps	%xmm1, %xmm6
	unpckhps	%xmm1, %xmm7

	movaps		TEMP(0), %xmm0
	movaps		TEMP(2), %xmm2
	movaps		%xmm4, TEMP(4)
	movaps		%xmm6, TEMP(6)

	/* Pass 5 (memory half): the same steps for TEMP(0)..TEMP(3). */
	movaps		%xmm0, %xmm4
	movaps		%xmm2, %xmm6
	unpcklps	TEMP(1), %xmm0
	unpckhps	TEMP(1), %xmm4
	unpcklps	TEMP(3), %xmm2
	unpckhps	TEMP(3), %xmm6
	movaps		%xmm0, %xmm1
	movaps		%xmm2, %xmm3
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	unpcklps	%xmm6, %xmm2
	unpckhps	%xmm6, %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm2, %xmm6
	subps		%xmm1, %xmm4
	subps		%xmm3, %xmm6
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	mulps		TEMP(5), %xmm4
	mulps		TEMP(5), %xmm6
	movaps		%xmm0, %xmm1
	movaps		%xmm2, %xmm3
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	unpcklps	%xmm6, %xmm2
	unpckhps	%xmm6, %xmm3

	/* All eight result vectors now live in TEMP(0)..TEMP(7)
	   (TEMP(4) and TEMP(6) were stored above). */
	movaps		%xmm0, TEMP(0)
	movaps		%xmm1, TEMP(1)
	movaps		%xmm2, TEMP(2)
	movaps		%xmm3, TEMP(3)
	movaps		%xmm5, TEMP(5)
	movaps		%xmm7, TEMP(7)

	/*
		Recombination, scalar step: in every slot k = 0..7 add lane 3 into
		lane 2 (byte offsets 16k+12 and 16k+8 of the scratch area).
	*/
	movss		TEMP_BYTE(12), %xmm0
	movss		TEMP_BYTE(28), %xmm1
	movss		TEMP_BYTE(44), %xmm2
	movss		TEMP_BYTE(60), %xmm3
	addss		TEMP_BYTE(8), %xmm0
	addss		TEMP_BYTE(24), %xmm1
	addss		TEMP_BYTE(40), %xmm2
	addss		TEMP_BYTE(56), %xmm3
	movss		%xmm0, TEMP_BYTE(8)
	movss		%xmm1, TEMP_BYTE(24)
	movss		%xmm2, TEMP_BYTE(40)
	movss		%xmm3, TEMP_BYTE(56)
	movss		TEMP_BYTE(76), %xmm0
	movss		TEMP_BYTE(92), %xmm1
	movss		TEMP_BYTE(108), %xmm2
	movss		TEMP_BYTE(124), %xmm3
	addss		TEMP_BYTE(72), %xmm0
	addss		TEMP_BYTE(88), %xmm1
	addss		TEMP_BYTE(104), %xmm2
	addss		TEMP_BYTE(120), %xmm3
	movss		%xmm0, TEMP_BYTE(72)
	movss		%xmm1, TEMP_BYTE(88)
	movss		%xmm2, TEMP_BYTE(104)
	movss		%xmm3, TEMP_BYTE(120)

	/*
		Recombination, vector step for the odd slots TEMP(1), TEMP(3),
		TEMP(5), TEMP(7): shufps $0x1e followed by the mask turns
		{x0,x1,x2,x3} into {x2,x3,x1,0}, which is then added to the
		original vector.
	*/
	movaps		TEMP_BYTE(16), %xmm1
	movaps		TEMP_BYTE(48), %xmm3
	movaps		TEMP_BYTE(80), %xmm5
	movaps		TEMP_BYTE(112), %xmm7
	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm2
	movaps		%xmm5, %xmm4
	movaps		%xmm7, %xmm6
	shufps		$0x1e, %xmm0, %xmm0
	shufps		$0x1e, %xmm2, %xmm2
	shufps		$0x1e, %xmm4, %xmm4
	shufps		$0x1e, %xmm6, %xmm6
	/* The coefficient table is not read past this point, so %ecx is
	   reused to hold the address of mask. */
	leal		LOCAL_VAR(mask), %ecx
	andps		(%ecx), %xmm0
	andps		(%ecx), %xmm2
	andps		(%ecx), %xmm4
	andps		(%ecx), %xmm6
	addps		%xmm0, %xmm1
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm5
	addps		%xmm6, %xmm7

	/* Fold TEMP(2) and TEMP(6) together with the vectors just computed. */
	movaps		TEMP_BYTE(32), %xmm2
	movaps		TEMP_BYTE(96), %xmm6
	movaps		%xmm2, %xmm0
	movaps		%xmm6, %xmm4
	shufps		$0x1e, %xmm0, %xmm0
	shufps		$0x1e, %xmm4, %xmm4
	andps		(%ecx), %xmm0
	andps		(%ecx), %xmm4
	addps		%xmm3, %xmm2
	addps		%xmm0, %xmm3
	addps		%xmm7, %xmm6
	addps		%xmm4, %xmm7

	movaps		TEMP_BYTE(0), %xmm0
	movaps		TEMP_BYTE(64), %xmm4

	/*
		Convert %xmm0..%xmm3 to four 16-bit words each: cvtps2pi converts
		the low two floats, movhlps brings the high two down, packssdw
		merges both halves with signed saturation.
		Result: %mm0..%mm3 hold the words of %xmm0..%xmm3.
	*/
	cvtps2pi	%xmm0, %mm0
	cvtps2pi	%xmm1, %mm1
	movhlps		%xmm0, %xmm0
	movhlps		%xmm1, %xmm1
	cvtps2pi	%xmm0, %mm2
	cvtps2pi	%xmm1, %mm3
	packssdw	%mm2, %mm0
	packssdw	%mm3, %mm1

	cvtps2pi	%xmm2, %mm2
	cvtps2pi	%xmm3, %mm3
	movhlps		%xmm2, %xmm2
	movhlps		%xmm3, %xmm3
	cvtps2pi	%xmm2, %mm4
	cvtps2pi	%xmm3, %mm5
	packssdw	%mm4, %mm2
	packssdw	%mm5, %mm3

	/* Park the address of mask in the first dword of scratch slot 0 (its
	   vector has already been converted into %mm0); %ecx and %ebx become
	   the output pointers. %ebx is not used for addressing local data
	   after this point. */
	mov		%ecx, TEMP_BYTE(0)
	movl		ARG(0), %ecx /* %ecx = out0 */
	movl		ARG(1), %ebx /* %ebx = out1 */

	/* Scatter the 16-bit results. Word 0/2 of each MMX register goes to
	   out0, word 1/3 to out1; the very first word 1 is written to offset 0
	   of both buffers. */
	movd		%mm0, %eax
	movd		%mm1, %edx
	movw		%ax, 512(%ecx)
	movw		%dx, 384(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, (%ecx)
	movw		%ax, (%ebx)
	movw		%dx, 128(%ebx)

	movd		%mm2, %eax
	movd		%mm3, %edx
	movw		%ax, 448(%ecx)
	movw		%dx, 320(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 64(%ebx)
	movw		%dx, 192(%ebx)

	/* Upper two words of %mm0/%mm1. */
	psrlq		$32, %mm0
	psrlq		$32, %mm1
	movd		%mm0, %eax
	movd		%mm1, %edx
	movw		%ax, 256(%ecx)
	movw		%dx, 128(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 256(%ebx)
	movw		%dx, 384(%ebx)

	/* Upper two words of %mm2/%mm3. */
	psrlq		$32, %mm2
	psrlq		$32, %mm3
	movd		%mm2, %eax
	movd		%mm3, %edx
	movw		%ax, 192(%ecx)
	movw		%dx, 64(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 320(%ebx)
	movw		%dx, 448(%ebx)

	/*
		Second group of outputs: combine %xmm4..%xmm7 (TEMP(4) again gets
		the shufps $0x1e + mask treatment), then convert and scatter them
		to the remaining byte offsets (odd multiples of 32).
	*/
	mov		TEMP_BYTE(0), %eax /* %eax = address of mask saved above */
	movaps		%xmm4, %xmm0
	shufps		$0x1e, %xmm0, %xmm0
	movaps		%xmm5, %xmm1
	andps		(%eax), %xmm0

	addps		%xmm6, %xmm4
	addps		%xmm7, %xmm5
	addps		%xmm1, %xmm6
	addps		%xmm0, %xmm7

	cvtps2pi	%xmm4, %mm0
	cvtps2pi	%xmm5, %mm1
	movhlps		%xmm4, %xmm4
	movhlps		%xmm5, %xmm5
	cvtps2pi	%xmm4, %mm2
	cvtps2pi	%xmm5, %mm3
	packssdw	%mm2, %mm0
	packssdw	%mm3, %mm1

	cvtps2pi	%xmm6, %mm2
	cvtps2pi	%xmm7, %mm3
	movhlps		%xmm6, %xmm6
	movhlps		%xmm7, %xmm7
	cvtps2pi	%xmm6, %mm4
	cvtps2pi	%xmm7, %mm5
	packssdw	%mm4, %mm2
	packssdw	%mm5, %mm3

	movd		%mm0, %eax
	movd		%mm2, %edx
	movw		%ax, 480(%ecx)
	movw		%dx, 416(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 32(%ebx)
	movw		%dx, 96(%ebx)

	psrlq		$32, %mm0
	psrlq		$32, %mm2
	movd		%mm0, %eax
	movd		%mm2, %edx
	movw		%ax, 224(%ecx)
	movw		%dx, 160(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 288(%ebx)
	movw		%dx, 352(%ebx)

	movd		%mm1, %eax
	movd		%mm3, %edx
	movw		%ax, 352(%ecx)
	movw		%dx, 288(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 160(%ebx)
	movw		%dx, 224(%ebx)

	psrlq		$32, %mm1
	psrlq		$32, %mm3
	movd		%mm1, %eax
	movd		%mm3, %edx
	movw		%ax, 96(%ecx)
	movw		%dx, 32(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 416(%ebx)
	movw		%dx, 480(%ebx)

	/* Epilogue: %esp still points at the saved %ebx; restoring %esp from
	   %ebp afterwards undoes both the alignment and the 128-byte
	   reservation. */
	popl		%ebx
	movl		%ebp, %esp
	popl		%ebp
	ret

#if defined(PIC) && defined(__APPLE__)
/*
	Non-lazy symbol pointer entry for the external table
	INT123_costab_mmxsse, emitted only for PIC builds on Apple targets.
	NOTE(review): presumably this is the slot that
	GLOBAL_VAR_PTR(INT123_costab_mmxsse) reads in the routine above --
	confirm against mangle.h.
*/
	.section __IMPORT,__pointers,non_lazy_symbol_pointers
L_INT123_costab_mmxsse:
	.indirect_symbol ASM_NAME(INT123_costab_mmxsse)
	.long	0
#endif

/* Macro from mangle.h; by its name it marks the object as not needing an
   executable stack -- confirm the exact expansion there. */
NONEXEC_STACK