Commit 60f7c47d authored by Holger Lubitz, committed by Fiona Glaser

Refactor asm macros part 1: DCT

parent 63b84fa4
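
For reference, the per-file helper macros removed below are consolidated into x86util.asm (whose new contents are not shown in this section). The following is a rough sketch of what the shared LOAD_DIFF and STORE_DIFF look like, reconstructed from the definitions this commit deletes; the %ifidn dispatch on a "none" zero-register argument is an assumption inferred from the call sites that pass none:

; LOAD_DIFF dst, tmp, zero|none, [enc pixels], [dec pixels]
; Widens two rows of 8-bit pixels to words and computes their difference.
; When callers pass "none" (no spare zero register, as in the MMX paths),
; the old 4-argument trick of unpacking %2 against itself is used instead.
%macro LOAD_DIFF 5
%ifidn %3, none
    movh       %1, %4
    movh       %2, %5
    punpcklbw  %1, %2
    punpcklbw  %2, %2
    psubw      %1, %2
%else
    movh       %1, %4
    punpcklbw  %1, %3
    movh       %2, %5
    punpcklbw  %2, %3
    psubw      %1, %2
%endif
%endmacro

; STORE_DIFF coeffs, tmp, zero, [dst pixels]
; Rounds the inverse-transformed residual (>>6), adds it to the predicted
; pixels and stores the result with unsigned saturation.
%macro STORE_DIFF 4
    psraw      %1, 6
    movh       %2, %4
    punpcklbw  %2, %3
    paddsw     %1, %2
    packuswb   %1, %1
    movh       %4, %1
%endmacro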
@@ -23,7 +23,7 @@ endif
 ifneq ($(AS),)
 X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
           pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
-          cpu-32.asm dct-32.asm
+          cpu-32.asm dct-32.asm x86util.asm
 X86SRC = $(X86SRC0:%=common/x86/%)
 ifeq ($(ARCH),X86)
@@ -24,6 +24,7 @@
 ;*****************************************************************************
 %include "x86inc.asm"
+%include "x86util.asm"
 SECTION_RODATA
@@ -31,12 +32,6 @@ pw_32: times 8 dw 32
 SECTION .text
-%macro SUMSUB_BA 2
-    paddw   %1, %2
-    paddw   %2, %2
-    psubw   %2, %1
-%endmacro
 %macro SBUTTERFLY 4
     mova       m%4, m%2
     punpckl%1  m%2, m%3
@@ -52,23 +47,6 @@ SECTION .text
     SWAP %2, %3
 %endmacro
-%macro LOAD_DIFF_8P 4
-    movh       %1, %3
-    movh       %2, %4
-    punpcklbw  %1, %2
-    punpcklbw  %2, %2
-    psubw      %1, %2
-%endmacro
-%macro STORE_DIFF_8P 4
-    psraw      %1, 6
-    movh       %3, %2
-    punpcklbw  %3, %4
-    paddsw     %1, %3
-    packuswb   %1, %1
-    movh       %2, %1
-%endmacro
 ; in: m0..m7
 ; out: 0,4,6 in mem, rest in regs
 %macro DCT8_1D 9
@@ -175,15 +153,15 @@ SECTION .text
 INIT_MMX
 ALIGN 16
 load_diff_4x8_mmx:
-    LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
-    LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
-    LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+    LOAD_DIFF  m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+    LOAD_DIFF  m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+    LOAD_DIFF  m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+    LOAD_DIFF  m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+    LOAD_DIFF  m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+    LOAD_DIFF  m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
     movq  [r0], m0
-    LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
-    LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+    LOAD_DIFF  m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+    LOAD_DIFF  m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
     movq  m0, [r0]
     ret
@@ -412,15 +390,15 @@ INIT_XMM
 cglobal x264_sub8x8_dct8_sse2, 3,3
 global x264_sub8x8_dct8_sse2 %+ .skip_prologue
 .skip_prologue:
-    LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
-    LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
-    LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+    LOAD_DIFF  m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+    LOAD_DIFF  m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+    LOAD_DIFF  m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+    LOAD_DIFF  m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+    LOAD_DIFF  m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+    LOAD_DIFF  m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
     SPILL r0, 0
-    LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
-    LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+    LOAD_DIFF  m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+    LOAD_DIFF  m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
     UNSPILL r0, 0
     DCT8_1D 0,1,2,3,4,5,6,7,r0
     UNSPILL r0, 0,4
@@ -446,14 +424,14 @@ global x264_add8x8_idct8_sse2 %+ .skip_prologue
     IDCT8_1D 0,1,2,3,4,5,6,7,r1
     SPILL r1, 6,7
     pxor  m7, m7
-    STORE_DIFF_8P m0, [r0+FDEC_STRIDE*0], m6, m7
-    STORE_DIFF_8P m1, [r0+FDEC_STRIDE*1], m6, m7
-    STORE_DIFF_8P m2, [r0+FDEC_STRIDE*2], m6, m7
-    STORE_DIFF_8P m3, [r0+FDEC_STRIDE*3], m6, m7
-    STORE_DIFF_8P m4, [r0+FDEC_STRIDE*4], m6, m7
-    STORE_DIFF_8P m5, [r0+FDEC_STRIDE*5], m6, m7
+    STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*0]
+    STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*1]
+    STORE_DIFF m2, m6, m7, [r0+FDEC_STRIDE*2]
+    STORE_DIFF m3, m6, m7, [r0+FDEC_STRIDE*3]
+    STORE_DIFF m4, m6, m7, [r0+FDEC_STRIDE*4]
+    STORE_DIFF m5, m6, m7, [r0+FDEC_STRIDE*5]
     UNSPILL_SHUFFLE r1, 0,1, 6,7
-    STORE_DIFF_8P m0, [r0+FDEC_STRIDE*6], m6, m7
-    STORE_DIFF_8P m1, [r0+FDEC_STRIDE*7], m6, m7
+    STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*6]
+    STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*7]
     ret
@@ -23,6 +23,7 @@
 ;*****************************************************************************
 %include "x86inc.asm"
+%include "x86util.asm"
 SECTION_RODATA
 pw_32: times 8 dw 32
@@ -31,20 +32,6 @@ SECTION .text
 INIT_XMM
-%macro LOAD_DIFF_8P 5
-    movq       %1, %4
-    punpcklbw  %1, %3
-    movq       %2, %5
-    punpcklbw  %2, %3
-    psubw      %1, %2
-%endmacro
-%macro SUMSUB_BA 2
-    paddw   %1, %2
-    paddw   %2, %2
-    psubw   %2, %1
-%endmacro
 %macro SBUTTERFLY 4
     mova       m%4, m%2
     punpckl%1  m%2, m%3
@@ -69,15 +56,6 @@ INIT_XMM
     SWAP %4, %7
 %endmacro
-%macro STORE_DIFF_8P 4
-    psraw      %1, 6
-    movq       %2, %4
-    punpcklbw  %2, %3
-    paddsw     %1, %2
-    packuswb   %1, %1
-    movq       %4, %1
-%endmacro
 SECTION .text
 %macro DCT8_1D 10
@@ -136,14 +114,14 @@ SECTION .text
 ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 cglobal x264_sub8x8_dct8_sse2
-    LOAD_DIFF_8P m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF_8P m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    LOAD_DIFF_8P m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF_8P m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
-    LOAD_DIFF_8P m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
-    LOAD_DIFF_8P m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
-    LOAD_DIFF_8P m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
-    LOAD_DIFF_8P m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+    LOAD_DIFF  m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+    LOAD_DIFF  m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+    LOAD_DIFF  m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+    LOAD_DIFF  m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+    LOAD_DIFF  m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+    LOAD_DIFF  m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+    LOAD_DIFF  m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+    LOAD_DIFF  m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
     DCT8_1D 0,1,2,3,4,5,6,7,8,9
     TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
@@ -232,14 +210,14 @@ cglobal x264_add8x8_idct8_sse2
     IDCT8_1D 0,1,2,3,4,5,6,7,8,9
     pxor  m9, m9
-    STORE_DIFF_8P m0, m8, m9, [r0+0*FDEC_STRIDE]
-    STORE_DIFF_8P m1, m8, m9, [r0+1*FDEC_STRIDE]
-    STORE_DIFF_8P m2, m8, m9, [r0+2*FDEC_STRIDE]
-    STORE_DIFF_8P m3, m8, m9, [r0+3*FDEC_STRIDE]
-    STORE_DIFF_8P m4, m8, m9, [r0+4*FDEC_STRIDE]
-    STORE_DIFF_8P m5, m8, m9, [r0+5*FDEC_STRIDE]
-    STORE_DIFF_8P m6, m8, m9, [r0+6*FDEC_STRIDE]
-    STORE_DIFF_8P m7, m8, m9, [r0+7*FDEC_STRIDE]
+    STORE_DIFF m0, m8, m9, [r0+0*FDEC_STRIDE]
+    STORE_DIFF m1, m8, m9, [r0+1*FDEC_STRIDE]
+    STORE_DIFF m2, m8, m9, [r0+2*FDEC_STRIDE]
+    STORE_DIFF m3, m8, m9, [r0+3*FDEC_STRIDE]
+    STORE_DIFF m4, m8, m9, [r0+4*FDEC_STRIDE]
+    STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
+    STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
+    STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
     ret
@@ -23,6 +23,7 @@
 ;*****************************************************************************
 %include "x86inc.asm"
+%include "x86util.asm"
 SECTION_RODATA
 pw_1: times 8 dw 1
@@ -31,46 +32,6 @@ pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
 SECTION .text
-%macro LOAD_DIFF_4P 5
-    movh       %1, %4
-    punpcklbw  %1, %3
-    movh       %2, %5
-    punpcklbw  %2, %3
-    psubw      %1, %2
-%endmacro
-%macro SUMSUB_BA 2
-    paddw   %1, %2
-    paddw   %2, %2
-    psubw   %2, %1
-%endmacro
-%macro SUMSUB_BADC 4
-    paddw   %1, %2
-    paddw   %3, %4
-    paddw   %2, %2
-    paddw   %4, %4
-    psubw   %2, %1
-    psubw   %4, %3
-%endmacro
-%macro SUMSUB2_AB 3
-    mova    %3, %1
-    paddw   %1, %1
-    paddw   %1, %2
-    psubw   %3, %2
-    psubw   %3, %2
-%endmacro
-%macro SUMSUBD2_AB 4
-    mova    %4, %1
-    mova    %3, %2
-    psraw   %2, 1
-    psraw   %4, 1
-    paddw   %1, %2
-    psubw   %4, %3
-%endmacro
 %macro SBUTTERFLY 4
     mova       m%4, m%2
     punpckl%1  m%2, m%3
@@ -95,15 +56,6 @@ SECTION .text
     SBUTTERFLY qdq, %3, %4, %5
 %endmacro
-%macro STORE_DIFF_4P 4
-    psraw      %1, 6
-    movh       %2, %4
-    punpcklbw  %2, %3
-    paddsw     %1, %2
-    packuswb   %1, %1
-    movh       %4, %1
-%endmacro
 %macro HADAMARD4_1D 4
     SUMSUB_BADC m%2, m%1, m%4, m%3
     SUMSUB_BADC m%4, m%2, m%3, m%1
@@ -173,10 +125,10 @@ cglobal x264_idct4x4dc_mmx, 1,1
 cglobal x264_sub4x4_dct_mmx, 3,3
 .skip_prologue:
 %macro SUB_DCT4 1
-    LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+    LOAD_DIFF  m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+    LOAD_DIFF  m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+    LOAD_DIFF  m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+    LOAD_DIFF  m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
     DCT4_1D 0,1,2,3,4
     TRANSPOSE%1 0,1,2,3,4
     DCT4_1D 0,1,2,3,4
@@ -203,10 +155,10 @@ cglobal x264_add4x4_idct_mmx, 2,2,1
     paddw m0, [pw_32 GLOBAL]
     IDCT4_1D 0,1,2,3,4,5
     pxor  m7, m7
-    STORE_DIFF_4P m0, m4, m7, [r0+0*FDEC_STRIDE]
-    STORE_DIFF_4P m1, m4, m7, [r0+1*FDEC_STRIDE]
-    STORE_DIFF_4P m2, m4, m7, [r0+2*FDEC_STRIDE]
-    STORE_DIFF_4P m3, m4, m7, [r0+3*FDEC_STRIDE]
+    STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
+    STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
+    STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
+    STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
 %endmacro
 ADD_IDCT4 4x4W
 RET