Commit 29899d84 authored by Loren Merritt

more mmx/xmm macros (mova, movu, movh)

parent 937b7925
......@@ -138,8 +138,8 @@ SECTION .text
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
movq %5, %2
movq %4, %1
mova %5, %2
mova %4, %1
psubusb %5, %1
psubusb %4, %2
por %4, %5
......@@ -149,8 +149,8 @@ SECTION .text
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
movq %5, %2
movq %4, %1
mova %5, %2
mova %4, %1
psubusb %5, %1
psubusb %4, %2
psubusb %5, %3
......@@ -190,7 +190,7 @@ SECTION .text
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
movq m5, m1
mova m5, m1
pxor m5, m2 ; p0^q0
pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
pcmpeqb m4, m4
......@@ -201,7 +201,7 @@ SECTION .text
pavgb m4, m2 ; (q0 - p0 + 256)>>1
pavgb m3, m5
paddusb m3, m4 ; d+128+33
movq m6, [pb_a1 GLOBAL]
mova m6, [pb_a1 GLOBAL]
psubusb m6, m3
psubusb m3, [pb_a1 GLOBAL]
pminub m6, m7
......@@ -217,18 +217,18 @@ SECTION .text
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
; LUMA_Q1 q1, q2, [q2], [q1], tc0, tmp
; Operand roles, as used by the body below:
;   1st = q1, only read
;   2nd = q2 in a register; overwritten and ends up holding the clipped result
;   3rd = second copy of q2, xored in for the rounding fix (the callers visible
;         here pass the memory location the 2nd operand was loaded from)
;   4th = destination that receives the result
;   5th = tc0; overwritten with q1+tc0 (saturating)
;   6th = scratch
; Also reads m1 and m2 (p0 and q0 per the callers' load comments).
; The macro is invoked for both sides of the edge in the code visible here, which is
; why the inline comments say "p2" while the header above says "q2".
; NOTE(review): every movq below is directly followed by a mova with the same
; operands. This looks like the removed and added lines of a diff shown together
; without +/- markers -- confirm against the real file before assembling; only one
; of each pair is presumably meant to exist.
%macro LUMA_Q1 6
movq %6, m1
mova %6, m1 ; tmp = p0
pavgb %6, m2 ; tmp = (p0+q0+1)>>1
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
movq %6, %1
mova %6, %1 ; tmp = q1
psubusb %6, %5 ; tmp = q1-tc0 (saturating) = lower clip bound
paddusb %5, %1 ; tc0 = q1+tc0 (saturating) = upper clip bound
pmaxub %2, %6 ; clamp from below
pminub %2, %5 ; clamp from above
movq %4, %2
mova %4, %2 ; write the clipped value to the destination
%endmacro
;-----------------------------------------------------------------------------
......@@ -244,10 +244,10 @@ cglobal x264_deblock_v_luma_sse2
dec r3d ; beta-1
add r4, r0 ; pix-3*stride
movdqa m0, [r4+r1] ; p1
movdqa m1, [r4+2*r1] ; p0
movdqa m2, [r0] ; q0
movdqa m3, [r0+r1] ; q1
mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LOAD_MASK r2d, r3d
punpcklbw m8, m8
......@@ -260,7 +260,7 @@ cglobal x264_deblock_v_luma_sse2
movdqa m3, [r4] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m9
movdqa m7, m8
mova m7, m8
psubb m7, m6
pand m6, m8
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
......@@ -270,12 +270,12 @@ cglobal x264_deblock_v_luma_sse2
pand m6, m9
pand m8, m6
psubb m7, m6
movdqa m3, [r0+r1]
mova m3, [r0+r1]
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
DEBLOCK_P0_Q0
movdqa [r4+2*r1], m1
movdqa [r0], m2
mova [r4+2*r1], m1
mova [r0], m2
ret
;-----------------------------------------------------------------------------
......@@ -338,10 +338,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5,1
dec r3 ; beta-1
add r4, r0 ; pix-3*stride
movq m0, [r4+r1] ; p1
movq m1, [r4+2*r1] ; p0
movq m2, [r0] ; q0
movq m3, [r0+r1] ; q1
mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LOAD_MASK r2, r3
mov r3, r4m
......@@ -356,34 +356,34 @@ cglobal x264_deblock_%2_luma_%1, 5,5,1
movd m4, [r3] ; tc0
punpcklbw m4, m4
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
movq [esp+%3], m4 ; tc
mova [esp+%3], m4 ; tc
pcmpeqb m3, m3
pcmpgtb m4, m3
pand m4, m7
movq [esp], m4 ; mask
mova [esp], m4 ; mask
movq m3, [r4] ; p2
mova m3, [r4] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m4
pand m4, [esp+%3] ; tc
movq m7, m4
mova m7, m4
psubb m7, m6
pand m6, m4
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
movq m4, [r0+2*r1] ; q2
mova m4, [r0+2*r1] ; q2
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
movq m5, [esp] ; mask
mova m5, [esp] ; mask
pand m6, m5
movq m5, [esp+%3] ; tc
mova m5, [esp+%3] ; tc
pand m5, m6
psubb m7, m6
movq m3, [r0+r1]
mova m3, [r0+r1]
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
DEBLOCK_P0_Q0
movq [r4+2*r1], m1
movq [r0], m2
mova [r4+2*r1], m1
mova [r0], m2
%if %3 == 16
mov esp, r2
......
......@@ -75,7 +75,7 @@ SECTION .text
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
movq m0, %1 ; load dct coeffs
mova m0, %1 ; load dct coeffs
pxor m1, m1
pcmpgtw m1, m0 ; sign(coeff)
pxor m0, m1
......@@ -84,16 +84,16 @@ SECTION .text
pmulhuw m0, %2 ; divide
pxor m0, m1 ; restore sign
psubw m0, m1
movq %1, m0 ; store
mova %1, m0 ; store
%endmacro
; QUANT_SSSE3 coeffs, mf, bias
; Quantizes one register's worth of 16-bit dct coefficients in place:
;   1st = coefficients, loaded at the start and stored back at the end
;   2nd = multiplier used by the "divide" step (high half of an unsigned multiply)
;   3rd = value added by the "round" step (unsigned saturating add)
; Clobbers m0 and m1.
; NOTE(review): the 2nd/3rd operands presumably correspond to the mf/bias operands
; documented for the MMX variant of this macro -- confirm against the callers.
; NOTE(review): the load and the store each appear twice (movq then mova with the
; same operands). This looks like the removed and added lines of a diff shown
; together without +/- markers -- confirm against the real file before assembling.
%macro QUANT_SSSE3 3
movq m1, %1 ; load dct coeffs
mova m1, %1 ; load dct coeffs
pabsw m0, m1 ; m0 = |coeff|; m1 keeps the signed original for psignw
paddusw m0, %3 ; round
pmulhuw m0, %2 ; divide
psignw m0, m1 ; restore sign
movq %1, m0 ; store
mova %1, m0 ; store
%endmacro
INIT_MMX
......@@ -162,11 +162,11 @@ QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m5 i_qbits
movq m0, %2
mova m0, %2
packssdw m0, %3
pmullw m0, %1
psllw m0, m5
movq %1, m0
mova %1, m0
%endmacro
%macro DEQUANT32_R 3
......@@ -176,8 +176,8 @@ QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16
;;; m6 f
;;; m7 0
movq m0, %1
movq m1, m0
mova m0, %1
mova m1, m0
punpcklwd m0, m7
punpckhwd m1, m7
pmaddwd m0, %2
......@@ -187,7 +187,7 @@ QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16
psrad m0, m5
psrad m1, m5
packssdw m0, m1
movq %1, m0
mova %1, m0
%endmacro
%macro DEQUANT_LOOP 3
......@@ -207,17 +207,17 @@ QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16
%endmacro
%macro DEQUANT16_FLAT 2-8
movq m0, %1
mova m0, %1
%assign i %0-2
%rep %0-1
%if i
movq m %+ i, [r0+%2]
mova m %+ i, [r0+%2]
pmullw m %+ i, m0
%else
pmullw m0, [r0+%2]
%endif
psllw m %+ i, m7
movq [r0+%2], m %+ i
mova [r0+%2], m %+ i
%assign i i-1
%rotate 1
%endrep
......@@ -268,7 +268,7 @@ cglobal x264_dequant_%2x%2_%1, 0,3
neg t0d
movd m5, t0d
picgetgot t0d
movq m6, [pd_1 GLOBAL]
mova m6, [pd_1 GLOBAL]
pxor m7, m7
pslld m6, m5
psrld m6, 1
......
......@@ -331,7 +331,9 @@ SECTION ".note.GNU-stack" noalloc noexec nowrite progbits
%assign FDEC_STRIDE 32
%macro INIT_MMX 0
%undef movq
%define mova movq
%define movu movq
%define movh movd
%define m0 mm0
%define m1 mm1
%define m2 mm2
......@@ -345,7 +347,9 @@ SECTION ".note.GNU-stack" noalloc noexec nowrite progbits
%endmacro
%macro INIT_XMM 0
%define movq movdqa
%define mova movdqa
%define movu movdqu
%define movh movq
%define m0 xmm0
%define m1 xmm1
%define m2 xmm2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment