Commit 1921c682 authored by Loren Merritt, committed by Fiona Glaser

asm cosmetics part 2

These changes were split out of the cpuflags commit because they change the output executable.
parent f85be1cd
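Two cosmetic patterns account for most of the hunks below. First, XMM clobber counts written as N*(mmsize/16), which collapsed to zero for MMX instantiations of a shared macro, become a plain N; the count now applies to the MMX builds as well, which is presumably part of why the output executable changes. Second, per-cpu helper labels such as deblock_inter_body_%1 are replaced by cglobal-declared bodies, so DEBLOCK_CHROMA and friends no longer take a cpu-suffix argument: cglobal picks up the suffix set by the preceding INIT_MMX/INIT_XMM line, and calls to the body resolve to the same suffixed name. The mc_copy macros are consolidated along similar lines (and the redundant SSE3 copy dropped). A minimal sketch of the helper-body pattern, using made-up names (my_deblock, my_deblock_body) rather than anything from this commit:

    %macro MY_DEBLOCK 0
    cglobal my_deblock_body          ; name gets the cpu suffix: _sse2, _avx, ...
        RESET_MM_PERMUTATION
        paddw   m0, m1               ; placeholder arithmetic, stands in for the real filter
        ret

    cglobal my_deblock, 4,6,8        ; plain XMM clobber count, no *(mmsize/16)
        mova    m0, [r0]
        mova    m1, [r0+r1]
        call    my_deblock_body      ; resolves to the suffixed body declared above
        mova    [r0], m0
        RET
    %endmacro

    INIT_XMM sse2
    MY_DEBLOCK                       ; emits my_deblock_sse2 + my_deblock_body_sse2
    INIT_XMM avx
    MY_DEBLOCK                       ; emits my_deblock_avx + my_deblock_body_avx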
@@ -359,7 +359,7 @@ INIT_MMX
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1, 3,3,11*(mmsize/16)
cglobal %1, 3,3,11
%ifndef HIGH_BIT_DEPTH
%if mmsize == 8
pxor m7, m7
@@ -398,9 +398,9 @@ cglobal %1, 3,3,11*(mmsize/16)
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
%ifdef HIGH_BIT_DEPTH
cglobal %1, 2,2,6*(mmsize/16)
cglobal %1, 2,2,6
%else
cglobal %1, 2,2,11*(mmsize/16)
cglobal %1, 2,2,11
pxor m7, m7
%endif
%if mmsize==16
@@ -661,6 +661,7 @@ cglobal add16x16_idct_dc_mmx, 2,3
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
INIT_XMM
cglobal add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
@@ -939,7 +940,7 @@ SCAN_8x8
; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8_FRAME 5
cglobal zigzag_scan_8x8_frame, 2,2,8*(mmsize/16)
cglobal zigzag_scan_8x8_frame, 2,2,8
mova m0, [r1]
mova m1, [r1+ 8*SIZEOF_DCTCOEF]
movu m2, [r1+14*SIZEOF_DCTCOEF]
@@ -1149,7 +1150,7 @@ cglobal zigzag_scan_4x4_field_mmx2, 2,3
; 54 55 58 59 60 61 62 63
%undef SCAN_8x8
%macro SCAN_8x8 5
cglobal zigzag_scan_8x8_field, 2,3,8*(mmsize/16)
cglobal zigzag_scan_8x8_field, 2,3,8
mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
@@ -1330,7 +1331,7 @@ ZIGZAG_SUB_4x4 ac, field
%endmacro
%macro ZIGZAG_8x8_CAVLC 1
cglobal zigzag_interleave_8x8_cavlc, 3,3,8*(mmsize/16)
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
INTERLEAVE 0, %1
INTERLEAVE 8, %1
INTERLEAVE 16, %1
@@ -162,7 +162,7 @@ cextern pw_pixel_max
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma, 5,5,8*(mmsize/16)
cglobal deblock_v_luma, 5,5,8
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
@@ -216,7 +216,7 @@ cglobal deblock_v_luma, 5,5,8*(mmsize/16)
ADD rsp, pad
RET
cglobal deblock_h_luma, 5,6,8*(mmsize/16)
cglobal deblock_h_luma, 5,6,8
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
@@ -724,7 +724,7 @@ DEBLOCK_LUMA_INTRA_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra, 4,7,8*(mmsize/16)
cglobal deblock_v_luma_intra, 4,7,8
LUMA_INTRA_INIT 3
lea r4, [r1*4]
lea r5, [r1*3]
@@ -750,7 +750,7 @@ cglobal deblock_v_luma_intra, 4,7,8*(mmsize/16)
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,8*(mmsize/16)
cglobal deblock_h_luma_intra, 4,7,8
LUMA_INTRA_INIT 8
%if mmsize == 8
lea r4, [r1*3]
@@ -1673,11 +1673,22 @@ DEBLOCK_LUMA_INTRA v8
mova [r0+2*r1], m2
%endmacro
%macro DEBLOCK_CHROMA 1
%macro DEBLOCK_CHROMA 0
cglobal deblock_inter_body
RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
LOAD_TC m6, r4
pmaxsw m6, m4
pand m7, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
ret
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 7,7,8*(mmsize/16)
cglobal deblock_v_chroma, 7,7,8
FIX_STRIDES r1
mov r5, r0
sub r0, r1
@@ -1685,7 +1696,7 @@ cglobal deblock_v_chroma, 7,7,8*(mmsize/16)
mov r6, 32/mmsize
.loop:
CHROMA_V_LOAD r5
call deblock_inter_body_%1
call deblock_inter_body
CHROMA_V_STORE
add r0, mmsize
add r5, mmsize
@@ -1697,7 +1708,7 @@ cglobal deblock_v_chroma, 7,7,8*(mmsize/16)
;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8*(mmsize/16)
cglobal deblock_h_chroma, 5,7,8
add r1, r1
mov r5, 32/mmsize
%if mmsize == 16
@@ -1705,7 +1716,7 @@ cglobal deblock_h_chroma, 5,7,8*(mmsize/16)
%endif
.loop:
CHROMA_H_LOAD r6
call deblock_inter_body_%1
call deblock_inter_body
CHROMA_H_STORE r6
lea r0, [r0+r1*(mmsize/4)]
add r4, mmsize/8
@@ -1713,21 +1724,18 @@ cglobal deblock_h_chroma, 5,7,8*(mmsize/16)
jg .loop
REP_RET
deblock_inter_body_%1:
cglobal deblock_intra_body
RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
LOAD_TC m6, r4
pmaxsw m6, m4
pand m7, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
ret
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,6,8*(mmsize/16)
cglobal deblock_v_chroma_intra, 4,6,8
add r1, r1
mov r5, 32/mmsize
movd m5, r3
@@ -1737,7 +1745,7 @@ cglobal deblock_v_chroma_intra, 4,6,8*(mmsize/16)
SPLATW m5, m5
.loop:
CHROMA_V_LOAD r4
call deblock_intra_body_%1
call deblock_intra_body
CHROMA_V_STORE
add r0, mmsize
add r4, mmsize
@@ -1748,7 +1756,7 @@ cglobal deblock_v_chroma_intra, 4,6,8*(mmsize/16)
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8*(mmsize/16)
cglobal deblock_h_chroma_intra, 4,6,8
add r1, r1
mov r4, 32/mmsize
%if mmsize == 16
@@ -1756,29 +1764,22 @@ cglobal deblock_h_chroma_intra, 4,6,8*(mmsize/16)
%endif
.loop:
CHROMA_H_LOAD r5
call deblock_intra_body_%1
call deblock_intra_body
CHROMA_H_STORE r5
lea r0, [r0+r1*(mmsize/4)]
dec r4
jg .loop
REP_RET
deblock_intra_body_%1:
RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
ret
%endmacro
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_CHROMA mmx2
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA avx
DEBLOCK_CHROMA
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
@@ -1834,7 +1835,16 @@ DEBLOCK_CHROMA avx
%define t5 r5
%define t6 r6
%macro DEBLOCK_CHROMA 1
%macro DEBLOCK_CHROMA 0
cglobal chroma_inter_body
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
ret
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
@@ -1844,7 +1854,7 @@ cglobal deblock_v_chroma, 5,6,8
mova m1, [t5+r1]
mova m2, [r0]
mova m3, [r0+r1]
call chroma_inter_body_%1
call chroma_inter_body
mova [t5+r1], m1
mova [r0], m2
CHROMA_V_LOOP 1
@@ -1856,30 +1866,19 @@ cglobal deblock_v_chroma, 5,6,8
cglobal deblock_h_chroma, 5,7,8
CHROMA_H_START
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_inter_body_%1
call chroma_inter_body
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
CHROMA_H_LOOP 1
RET
ALIGN 16
RESET_MM_PERMUTATION
chroma_inter_body_%1:
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
ret
%endmacro ; DEBLOCK_CHROMA
INIT_XMM sse2
DEBLOCK_CHROMA sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA avx
DEBLOCK_CHROMA
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_CHROMA mmx2
DEBLOCK_CHROMA
%endif
@@ -1896,7 +1895,21 @@ DEBLOCK_CHROMA mmx2
%define t5 r4
%define t6 r5
%macro DEBLOCK_CHROMA_INTRA 1
%macro DEBLOCK_CHROMA_INTRA 0
cglobal chroma_intra_body
LOAD_MASK r2d, r3d
mova m5, m1
mova m6, m2
CHROMA_INTRA_P0 m1, m0, m3
CHROMA_INTRA_P0 m2, m3, m0
psubb m1, m5
psubb m2, m6
pand m1, m7
pand m2, m7
paddb m1, m5
paddb m2, m6
ret
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
@@ -1906,7 +1919,7 @@ cglobal deblock_v_chroma_intra, 4,5,8
mova m1, [t5+r1]
mova m2, [r0]
mova m3, [r0+r1]
call chroma_intra_body_%1
call chroma_intra_body
mova [t5+r1], m1
mova [r0], m2
CHROMA_V_LOOP 0
@@ -1918,35 +1931,19 @@ cglobal deblock_h_chroma_intra, 4,6,8
cglobal deblock_h_chroma_intra, 4,6,8
CHROMA_H_START
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_intra_body_%1
call chroma_intra_body
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
CHROMA_H_LOOP 0
RET
ALIGN 16
RESET_MM_PERMUTATION
chroma_intra_body_%1:
LOAD_MASK r2d, r3d
mova m5, m1
mova m6, m2
CHROMA_INTRA_P0 m1, m0, m3
CHROMA_INTRA_P0 m2, m3, m0
psubb m1, m5
psubb m2, m6
pand m1, m7
pand m2, m7
paddb m1, m5
paddb m2, m6
ret
%endmacro ; DEBLOCK_CHROMA_INTRA
INIT_XMM sse2
DEBLOCK_CHROMA_INTRA sse2
DEBLOCK_CHROMA_INTRA
INIT_XMM avx
DEBLOCK_CHROMA_INTRA avx
DEBLOCK_CHROMA_INTRA
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_CHROMA_INTRA mmx2
DEBLOCK_CHROMA_INTRA
%endif
%endif ; !HIGH_BIT_DEPTH
@@ -430,7 +430,7 @@ AVG_WEIGHT 16, 7
%endif
%macro WEIGHTER 1
cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS*(mmsize/16)
cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS
FIX_STRIDES r1, r3
WEIGHT_START %1
LOAD_HEIGHT
@@ -695,7 +695,7 @@ AVGH 4, 2
; uint16_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W_ONE 1
cglobal pixel_avg2_w%1, 6,7,4*(mmsize/16)
cglobal pixel_avg2_w%1, 6,7,4
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
@@ -720,7 +720,7 @@ cglobal pixel_avg2_w%1, 6,7,4*(mmsize/16)
%endmacro
%macro AVG2_W_TWO 3
cglobal pixel_avg2_w%1, 6,7,8*(mmsize/16)
cglobal pixel_avg2_w%1, 6,7,8
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
@@ -1203,7 +1203,7 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k
; pixel copy
;=============================================================================
%macro COPY4 2-*
%macro COPY1 2
movu m0, [r2]
movu m1, [r2+r3]
movu m2, [r2+r3*2]
@@ -1214,27 +1214,28 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k
mova [r0+%1], m3
%endmacro
%macro COPY_ONE 4
COPY4 %1, %2
%macro COPY2 2-4 0, 1
movu m0, [r2+%3*mmsize]
movu m1, [r2+%4*mmsize]
movu m2, [r2+r3+%3*mmsize]
movu m3, [r2+r3+%4*mmsize]
movu m4, [r2+r3*2+%3*mmsize]
movu m5, [r2+r3*2+%4*mmsize]
movu m6, [r2+%2+%3*mmsize]
movu m7, [r2+%2+%4*mmsize]
mova [r0+%3*mmsize], m0
mova [r0+%4*mmsize], m1
mova [r0+r1+%3*mmsize], m2
mova [r0+r1+%4*mmsize], m3
mova [r0+r1*2+%3*mmsize], m4
mova [r0+r1*2+%4*mmsize], m5
mova [r0+%1+%3*mmsize], m6
mova [r0+%1+%4*mmsize], m7
%endmacro
%macro COPY_TWO 4
movu m0, [r2+%3]
movu m1, [r2+%4]
movu m2, [r2+r3+%3]
movu m3, [r2+r3+%4]
movu m4, [r2+r3*2+%3]
movu m5, [r2+r3*2+%4]
movu m6, [r2+%2+%3]
movu m7, [r2+%2+%4]
mova [r0+%3], m0
mova [r0+%4], m1
mova [r0+r1+%3], m2
mova [r0+r1+%4], m3
mova [r0+r1*2+%3], m4
mova [r0+r1*2+%4], m5
mova [r0+%1+%3], m6
mova [r0+%1+%4], m7
%macro COPY4 2
COPY2 %1, %2, 0, 1
COPY2 %1, %2, 2, 3
%endmacro
;-----------------------------------------------------------------------------
@@ -1252,76 +1253,38 @@ cglobal mc_copy_w4_mmx, 4,6
%define mova movd
%define movu movd
%endif
COPY4 r4, r5
COPY1 r4, r5
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
.end:
COPY4 r4, r5
COPY1 r4, r5
RET
%ifdef HIGH_BIT_DEPTH
cglobal mc_copy_w16_mmx, 5,7
%macro MC_COPY 1
%assign %%w %1*SIZEOF_PIXEL/mmsize
%if %%w > 0
cglobal mc_copy_w%1, 5,7,8*(%%w/2)
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
COPY_TWO r5, r6, mmsize*0, mmsize*1
COPY_TWO r5, r6, mmsize*2, mmsize*3
sub r4d, 4
COPY %+ %%w r5, r6
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
jg .height_loop
REP_RET
%macro MC_COPY 2
cglobal mc_copy_w%2, 5,7,%2-8
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
COPY_%1 r5, r6, 0, mmsize
sub r4d, 4
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
jg .height_loop
REP_RET
%endif
%endmacro
INIT_MMX mmx
MC_COPY TWO, 8
INIT_XMM sse2
MC_COPY ONE, 8
MC_COPY TWO, 16
INIT_XMM aligned, sse2
MC_COPY TWO, 16
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
%macro MC_COPY 2
cglobal mc_copy_w%2, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
%1 r5, r6, 0, mmsize
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
sub r4d, 4
jg .height_loop
REP_RET
%endmacro
INIT_MMX mmx
MC_COPY COPY4, 8
MC_COPY COPY_TWO, 16
MC_COPY 8
MC_COPY 16
INIT_XMM sse2
MC_COPY COPY4, 16
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
INIT_XMM sse3
MC_COPY COPY4, 16
MC_COPY 8
MC_COPY 16
INIT_XMM aligned, sse2
MC_COPY COPY4, 16
%endif ; !HIGH_BIT_DEPTH
MC_COPY 16
@@ -151,7 +151,7 @@ cextern pd_ffff
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
%macro HPEL_FILTER 0
cglobal hpel_filter_v, 5,6,11*(mmsize/16)
cglobal hpel_filter_v, 5,6,11
FIX_STRIDES r3d, r4d
%ifdef WIN64
movsxd r4, r4d
@@ -211,7 +211,7 @@ cglobal hpel_filter_v, 5,6,11*(mmsize/16)
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10*(mmsize/16)
cglobal hpel_filter_c, 3,3,10
add r2, r2
add r0, r2
lea r1, [r1+r2]
@@ -260,7 +260,7 @@ cglobal hpel_filter_c, 3,3,10*(mmsize/16)
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8*(mmsize/16)
cglobal hpel_filter_h, 3,4,8
%define src r1+r2
add r2, r2
add r0, r2
@@ -370,6 +370,7 @@ cglobal hpel_filter_v, 5,6,%1
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
INIT_MMX
cglobal hpel_filter_c_mmx2, 3,3
add r0, r2
lea r1, [r1+r2*2]
@@ -1480,7 +1481,7 @@ cglobal integral_init4v_ssse3, 3,5
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
%ifdef HIGH_BIT_DEPTH
shl dword r6m, 1
FIX_STRIDES r5d
@@ -85,7 +85,6 @@ void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w8_aligned_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
void x264_prefetch_fenc_mmx2( uint8_t *, int, uint8_t *, int, int );
void x264_prefetch_ref_mmx2( uint8_t *, int, int );
@@ -75,7 +75,7 @@ cextern hsub_mul
; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
%macro SSD_ONE 2
cglobal pixel_ssd_%1x%2, 4,5,6*(mmsize/16)
cglobal pixel_ssd_%1x%2, 4,5,6
mov r4, %1*%2/mmsize
pxor m0, m0
.loop
@@ -306,11 +306,7 @@ cglobal pixel_ssd_%1x%2, 0,0,0
.startloop:
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3
%if cpuflag(ssse3) ; FIXME wrong, but correcting this modifies the binary
PROLOGUE 0,0,8
%else
PROLOGUE 0,0,8*(mmsize/16)
%endif
%else
PROLOGUE 0,5
DECLARE_REG_TMP 1,2,3,4
@@ -402,7 +398,7 @@ SSD 4, 8
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
%macro SSD_NV12 0
cglobal pixel_ssd_nv12_core, 6,7,7*(mmsize/16)
cglobal pixel_ssd_nv12_core, 6,7,7
shl r4d, 2
FIX_STRIDES r1, r3
add r0, r4
@@ -1575,7 +1571,7 @@ cglobal intra_sa8d_x3_8x8_core, 3,3,16
ABS2 m10, m11, m12, m13
paddusw m8, m10
paddusw m9, m11
%ifidn cpuname, ssse3
%if cpuflag(ssse3)
pabsw m10, m6
pabsw m11, m7
pabsw m15, m1
@@ -260,7 +260,7 @@ cglobal predict_4x4_ddr, 1,1
%endrep
RET
cglobal predict_4x4_vr, 1,1,6*(mmsize/16)
cglobal predict_4x4_vr, 1,1,6
movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
mova m5, m0
%ifdef HIGH_BIT_DEPTH
@@ -296,7 +296,7 @@ cglobal predict_4x4_vr, 1,1,6*(mmsize/16)