Commit 4cf27285 authored by Loren Merritt, committed by Fiona Glaser

x86inc: activate REP_RET automatically

Now RET checks whether it immediately follows a branch, so the programmer doesn't have to keep track of that condition.
REP_RET is still needed manually when it's a branch target, but that's much rarer.
The implementation involves lots of spurious labels, but that's ok because we strip them.
parent b073e870
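
For reference, the mechanism is roughly this (a condensed sketch of the new x86inc.asm macros, assuming NASM/yasm semantics; the macro names match the real file, but the real bodies also gate the rep prefix on cpuflags, since rep ret only works around the branch misprediction that pre-SSSE3 AMD CPUs suffer on a plain 1-byte ret reached via a branch):

; Shadow every conditional-branch mnemonic with a macro that emits the real
; branch, plants a spurious local label right after it, and records that
; label's address.
%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1                   ; the actual jcc (macros don't recurse)
            %%branch_instr:         ; spurious label, stripped from the symtab
            %xdefine last_branch_adr %%branch_instr
        %endmacro
        %rotate 1
    %endrep
%endmacro
BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%define last_branch_adr $$          ; sentinel: no branch seen yet

; RET ends with this when no register/stack epilogue is needed. The times
; count ((last_branch_adr-$)>>31)+1 evaluates to 1 when the recorded address
; equals the current one and to 0 when it lies below it, so a rep prefix is
; emitted exactly when the ret immediately follows a conditional branch.
%macro AUTO_REP_RET 0
    times ((last_branch_adr-$)>>31)+1 rep
    ret
%endmacro

A ret that is only the target of a branch never directly follows one in program order, so this check cannot catch it; that is why the .end case in the deblock_%1_luma_intra hunk below still switches to a manual REP_RET.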
@@ -139,13 +139,13 @@ cglobal cabac_encode_terminal_asm, 0,3
; can only be 0 or 1 and is zero over 99% of the time.
test dword [t0+cb.range], 0x100
je .renorm
REP_RET
RET
.renorm:
shl dword [t0+cb.low], 1
shl dword [t0+cb.range], 1
inc dword [t0+cb.queue]
jge .putbyte
REP_RET
RET
.putbyte:
PROLOGUE 0,7
mov t3d, [t0+cb.queue]
@@ -555,7 +555,7 @@ cglobal add16x16_idct_dc, 2,3,8
add r0, 4*FDEC_STRIDEB
dec r2
jg .loop
REP_RET
RET
%endmacro ; ADD_IDCT_DC
INIT_XMM sse2
@@ -664,7 +664,7 @@ cglobal add16x16_idct_dc, 2,3
add r0, FDEC_STRIDE*4
dec r2
jg .loop
REP_RET
RET
INIT_XMM sse2
cglobal add16x16_idct_dc, 2,2,8
@@ -378,7 +378,7 @@ cglobal deblock_v_luma, 5,5,15
add r4, 2
dec r3
jg .loop
REP_RET
RET
cglobal deblock_h_luma, 5,7,15
add r1, r1
@@ -416,7 +416,7 @@ cglobal deblock_h_luma, 5,7,15
lea r5, [r5+r1*8]
dec r6
jg .loop
REP_RET
RET
%endmacro
INIT_XMM sse2
@@ -650,7 +650,7 @@ cglobal deblock_v_luma_intra, 4,7,16
add r4, mmsize
dec r6
jg .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1497,7 +1497,7 @@ cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
RET
REP_RET
INIT_MMX cpuname
%if ARCH_X86_64
@@ -1687,7 +1687,7 @@ cglobal deblock_v_chroma, 5,7,8
add r4, mmsize/8
dec r6
jg .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1706,7 +1706,7 @@ cglobal deblock_h_chroma, 5,7,8
add r4, mmsize/8
dec r5
jg .loop
REP_RET
RET
cglobal deblock_intra_body
@@ -1734,7 +1734,7 @@ cglobal deblock_v_chroma_intra, 4,6,8
add r4, mmsize
dec r5
jg .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1752,7 +1752,7 @@ cglobal deblock_h_chroma_intra, 4,6,8
lea r0, [r0+r1*(mmsize/4)]
dec r4
jg .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1775,7 +1775,7 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
dec r4
jg .loop
%endif
REP_RET
RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1803,7 +1803,7 @@ cglobal deblock_h_chroma_mbaff, 5,7,8
dec r5
jg .loop
%endif
REP_RET
RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1821,7 +1821,7 @@ cglobal deblock_h_chroma_422_intra, 4,6,8
lea r0, [r0+r1*(mmsize/4)]
dec r4
jg .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1852,7 +1852,7 @@ cglobal deblock_h_chroma_422, 5,7,8
%endif
dec r5
jg .loop
REP_RET
RET
%endmacro ; DEBLOCK_CHROMA
%if ARCH_X86_64 == 0
@@ -2020,7 +2020,7 @@ cglobal deblock_h_chroma_422, 5,8,8
add r4, mmsize/8
dec cntr
jg .loop
REP_RET
RET
%endmacro
INIT_MMX mmx2
@@ -2101,7 +2101,7 @@ cglobal deblock_h_chroma_422_intra, 4,7,8
lea t5, [t5+r1*(mmsize/2)]
dec r6d
jg .loop
REP_RET
RET
%endmacro ; DEBLOCK_CHROMA_INTRA
INIT_XMM sse2
@@ -87,7 +87,7 @@ cextern pd_32
lea t0, [t0+t1*2*SIZEOF_PIXEL]
sub eax, 2
jg .height_loop
REP_RET
RET
%endmacro
%if HIGH_BIT_DEPTH
@@ -415,7 +415,7 @@ cglobal mc_weight_w%1, 6,6,8
lea r2, [r2+r3*2]
sub r5d, 2
jg .loop
REP_RET
RET
%endmacro
INIT_MMX mmx2
@@ -495,7 +495,7 @@ cglobal mc_offset%2_w%1, 6,6
lea r2, [r2+r3*2]
sub r5d, 2
jg .loop
REP_RET
RET
%endmacro
%macro OFFSETPN 1
@@ -672,7 +672,7 @@ cglobal pixel_avg2_w%1, 6,7,4
lea r0, [r0+r1*4]
sub r5d, 2
jg .height_loop
REP_RET
RET
%endmacro
%macro AVG2_W_TWO 3
@@ -707,7 +707,7 @@ cglobal pixel_avg2_w%1, 6,7,8
lea r0, [r0+r1*4]
sub r5d, 2
jg .height_loop
REP_RET
RET
%endmacro
INIT_MMX mmx2
@@ -745,7 +745,7 @@ cglobal pixel_avg2_w10_mmx2, 6,7
lea r0, [r0+r1*2*2]
sub r5d, 2
jg .height_loop
REP_RET
RET
cglobal pixel_avg2_w16_mmx2, 6,7
sub r4, r2
@@ -779,7 +779,7 @@ cglobal pixel_avg2_w16_mmx2, 6,7
lea r0, [r0+r1*2*2]
sub r5d, 2
jg .height_loop
REP_RET
RET
cglobal pixel_avg2_w18_mmx2, 6,7
sub r4, r2
@@ -803,7 +803,7 @@ cglobal pixel_avg2_w18_mmx2, 6,7
lea r0, [r0+r1*2]
dec r5d
jg .height_loop
REP_RET
RET
INIT_XMM
cglobal pixel_avg2_w18_sse2, 6,7,6
@@ -825,7 +825,7 @@ cglobal pixel_avg2_w18_sse2, 6,7,6
lea r0, [r0+r1*2]
dec r5d
jg .height_loop
REP_RET
RET
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
@@ -849,7 +849,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
REP_RET
RET
%endmacro
INIT_MMX
@@ -877,7 +877,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
REP_RET
RET
%endmacro
AVG2_W16 12, movd
@@ -909,7 +909,7 @@ cglobal pixel_avg2_w20_mmx2, 6,7
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
REP_RET
RET
cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
@@ -927,7 +927,7 @@ cglobal pixel_avg2_w16_sse2, 6,7
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
REP_RET
RET
%macro AVG2_W20 1
cglobal pixel_avg2_w20_%1, 6,7
@@ -959,7 +959,7 @@ cglobal pixel_avg2_w20_%1, 6,7
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
REP_RET
RET
%endmacro
AVG2_W20 sse2
@@ -1022,7 +1022,7 @@ pixel_avg2_w%1_cache_mmx2:
add r0, r1
dec r5d
jg .height_loop
REP_RET
RET
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
@@ -1226,7 +1226,7 @@ cglobal mc_copy_w%1, 5,7,8*(%%w/2)
lea r0, [r0+r1*4]
sub r4d, 4
jg .height_loop
REP_RET
RET
%endif
%endmacro
@@ -1506,7 +1506,7 @@ ALIGN 4
add r1, r2
dec r5d
jg .loop2
REP_RET
RET
%if mmsize==8
.width4:
@@ -1626,11 +1626,11 @@ ALIGN 4
dec r5d
jg .loop4
%if mmsize!=8
REP_RET
RET
%else
sub dword r7m, 4
jg .width8
REP_RET
RET
.width8:
%if ARCH_X86_64
lea r3, [t2+8*SIZEOF_PIXEL]
@@ -1766,7 +1766,7 @@ ALIGN 4
add r1, r2
dec r5d
jg .loop1d_w4
REP_RET
RET
.mc1d_w8:
sub r2, 4*SIZEOF_PIXEL
sub r4, 8*SIZEOF_PIXEL
@@ -1848,7 +1848,7 @@ cglobal mc_chroma
lea r1, [r1+r2*2]
sub r5d, 2
jg .loop4
REP_RET
RET
.width8:
movu m0, [r3]
@@ -1909,7 +1909,7 @@ cglobal mc_chroma
lea r1, [r1+r2*2]
sub r5d, 2
jg .loop8
REP_RET
RET
%endmacro
%if HIGH_BIT_DEPTH
@@ -210,7 +210,7 @@ cglobal hpel_filter_v, 5,6,11
mova [r0+r4+mmsize], m4
add r4, 2*mmsize
jl .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
@@ -259,7 +259,7 @@ cglobal hpel_filter_c, 3,3,10
mova [r0+r2], m1
add r2, mmsize
jl .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
@@ -302,7 +302,7 @@ cglobal hpel_filter_h, 3,4,8
mova [r0+r2+mmsize], m4
add r2, mmsize*2
jl .loop
REP_RET
RET
%endmacro ; HPEL_FILTER
INIT_MMX mmx2
@@ -365,7 +365,7 @@ cglobal hpel_filter_v, 5,6,%1
add r5, mmsize
add r4, mmsize
jl .loop
REP_RET
RET
%endmacro
;-----------------------------------------------------------------------------
@@ -396,7 +396,7 @@ cglobal hpel_filter_c_mmx2, 3,3
movntq [r0+r2], m1
add r2, 8
jl .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
@@ -440,7 +440,7 @@ cglobal hpel_filter_h_mmx2, 3,3
movntq [r0+r2], m1
add r2, 8
jl .loop
REP_RET
RET
INIT_XMM
@@ -510,7 +510,7 @@ cglobal hpel_filter_c, 3,3,9
movntps [r0+r2], m4
add r2, 16
jl .loop
REP_RET
RET
%endmacro
;-----------------------------------------------------------------------------
@@ -559,7 +559,7 @@ cglobal hpel_filter_h_sse2, 3,3,8
movntps [r0+r2], m1
add r2, 16
jl .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
@@ -600,7 +600,7 @@ cglobal hpel_filter_h, 3,3
movntps [r0+r2], m3
add r2, 16
jl .loop
REP_RET
RET
%endmacro
INIT_MMX mmx2
@@ -1026,7 +1026,7 @@ cglobal store_interleave_chroma, 5,5
lea r0, [r0+r1*2]
sub r4d, 2
jg .loop
REP_RET
RET
%endmacro ; PLANE_INTERLEAVE
%macro DEINTERLEAVE_START 0
@@ -1068,7 +1068,7 @@ cglobal plane_copy_deinterleave, 6,7
add r4, r5
dec dword r7m
jg .loopy
REP_RET
RET
;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
@@ -1083,7 +1083,7 @@ cglobal load_deinterleave_chroma_fenc, 4,4
lea r1, [r1+r2*2]
sub r3d, 2
jg .loop
REP_RET
RET
;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
@@ -1098,7 +1098,7 @@ cglobal load_deinterleave_chroma_fdec, 4,4
lea r1, [r1+r2*2]
sub r3d, 2
jg .loop
REP_RET
RET
%endmacro ; PLANE_DEINTERLEAVE
%if HIGH_BIT_DEPTH
@@ -1155,7 +1155,7 @@ cglobal memcpy_aligned_mmx, 3,3
sub r2d, 32
jg .copy32
.ret:
REP_RET
RET
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
@@ -1207,7 +1207,7 @@ cglobal memzero_aligned, 2,2
%endrep
add r1, mmsize*8
jl .loop
REP_RET
RET
%endmacro
INIT_MMX mmx
@@ -1239,7 +1239,7 @@ cglobal integral_init4h_sse4, 3,4
movdqa [r3+r2*2+16], m1
add r2, 16
jl .loop
REP_RET
RET
%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
@@ -1263,7 +1263,7 @@ cglobal integral_init8h, 3,4
movdqa [r3+r2*2+16], m1
add r2, 16
jl .loop
REP_RET
RET
%endmacro
INIT_XMM sse4
@@ -1290,7 +1290,7 @@ cglobal integral_init8v, 3,3
mova [r0+r1+mmsize], m1
add r1, 2*mmsize
jl .loop
REP_RET
RET
%endmacro
INIT_MMX mmx
@@ -1321,7 +1321,7 @@ cglobal integral_init4v_mmx, 3,5
mova [r1+r2-8], m3
sub r2, 8
jge .loop
REP_RET
RET
INIT_XMM
cglobal integral_init4v_sse2, 3,5
@@ -1347,7 +1347,7 @@ cglobal integral_init4v_sse2, 3,5
mova [r1+r2], m3
add r2, 16
jl .loop
REP_RET
RET
cglobal integral_init4v_ssse3, 3,5
shl r2, 1
@@ -1372,7 +1372,7 @@ cglobal integral_init4v_ssse3, 3,5
mova [r1+r2], m3
add r2, 16
jl .loop
REP_RET
RET
%macro FILT8x4 7
mova %3, [r0+%7]
@@ -1732,7 +1732,7 @@ cglobal mbtree_propagate_cost, 7,7,7
movdqa [r0+r6*2], xmm0
add r6, 8
jl .loop
REP_RET
RET
%endmacro
INIT_XMM sse2
@@ -1786,4 +1786,4 @@ cglobal mbtree_propagate_cost, 7,7,8
vmovdqu [r0+r6*2], ymm1
add r6, 16
jl .loop
REP_RET
RET
@@ -807,7 +807,7 @@ cglobal predict_8x8_dc, 2,2
psrlw m0, 4
SPLATW m0, m0
STORE8x8 m0, m0
REP_RET
RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
@@ -1103,7 +1103,7 @@ ALIGN 4
add r0, FDEC_STRIDE
dec r1d
jg .loop
REP_RET
RET
%endmacro ; PREDICT_CHROMA_P_MMX
INIT_MMX mmx2
@@ -1140,7 +1140,7 @@ cglobal predict_8x%1c_p_core, 1,2,7
add r0, FDEC_STRIDEB
dec r1d
jg .loop
REP_RET
RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2
movd m0, r1m
@@ -1225,7 +1225,7 @@ ALIGN 4
add r0, FDEC_STRIDE
dec r1d
jg .loop
REP_RET
RET
%endif ; !ARCH_X86_64
%macro PREDICT_16x16_P 0
@@ -1282,7 +1282,7 @@ ALIGN 4
dec r1d
jg .loop
%endif ; !HIGH_BIT_DEPTH
REP_RET
RET
%endmacro ; PREDICT_16x16_P
INIT_XMM sse2
@@ -1996,20 +1996,20 @@ cglobal predict_16x16_v_mmx2, 1,2
mova m2, [r0 - FDEC_STRIDEB+16]
mova m3, [r0 - FDEC_STRIDEB+24]
STORE16x16 m0, m1, m2, m3
REP_RET
RET
INIT_XMM
cglobal predict_16x16_v_sse2, 2,2
mova m0, [r0 - FDEC_STRIDEB+ 0]
mova m1, [r0 - FDEC_STRIDEB+16]
STORE16x16_SSE2 m0, m1
REP_RET
RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_16x16_v_mmx2, 1,2
movq m0, [r0 - FDEC_STRIDE + 0]
movq m1, [r0 - FDEC_STRIDE + 8]
STORE16x16 m0, m1
REP_RET
RET
INIT_XMM
cglobal predict_16x16_v_sse2, 1,1
movdqa xmm0, [r0 - FDEC_STRIDE]
@@ -2055,7 +2055,7 @@ cglobal predict_16x16_h, 1,2
%endif ; HIGH_BIT_DEPTH
sub r1, 4*FDEC_STRIDEB
jge .vloop
REP_RET
RET
%endmacro
INIT_MMX mmx2
@@ -2106,12 +2106,12 @@ cglobal predict_16x16_dc_core, 1,2
%else
PRED16x16_DC r1m, 5
%endif
REP_RET
RET
INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC [pw_8], 4
REP_RET
RET
INIT_MMX mmx2
%if HIGH_BIT_DEPTH
@@ -2119,14 +2119,14 @@ cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16x16 m0, m0, m0, m0
REP_RET
RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
pshufw m0, m0, 0
packuswb m0, m0
STORE16x16 m0, m0
REP_RET
RET
%endif
;-----------------------------------------------------------------------------
@@ -2159,11 +2159,11 @@ INIT_XMM sse2
cglobal predict_16x16_dc_core, 2,2,4
movd m3, r1m
PRED16x16_DC_SSE2 m3, 5
REP_RET
RET
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC_SSE2 [pw_8], 4
REP_RET
RET
INIT_XMM sse2
%if HIGH_BIT_DEPTH
@@ -2171,7 +2171,7 @@ cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16x16_SSE2 m0, m0
REP_RET
RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1