Commit c6e72b86 authored by Anton Mitrofanov, committed by Fiona Glaser

Windows 64-bit support

A "make distclean" is probably required after updating to this revision.
parent ef48e51d
......@@ -74,8 +74,8 @@ DEP = depend
default: $(DEP) x264$(EXE)
libx264.a: .depend $(OBJS) $(OBJASM)
ar rc libx264.a $(OBJS) $(OBJASM)
ranlib libx264.a
$(AR) rc libx264.a $(OBJS) $(OBJASM)
$(RANLIB) libx264.a
$(SONAME): .depend $(OBJS) $(OBJASM)
$(CC) -shared -o $@ $(OBJS) $(OBJASM) $(SOFLAGS) $(LDFLAGS)
......@@ -89,7 +89,7 @@ checkasm: tools/checkasm.o libx264.a
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
# delete local/anonymous symbols, so they don't show up in oprofile
-@ strip -x $@
-@ $(STRIP) -x $@
.depend: config.mak
rm -f .depend
......@@ -135,7 +135,7 @@ endif
clean:
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS
rm -f checkasm checkasm.exe tools/checkasm.o
rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno)
- sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak
......@@ -150,7 +150,7 @@ install: x264$(EXE) $(SONAME)
install -m 644 libx264.a $(DESTDIR)$(libdir)
install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
install x264$(EXE) $(DESTDIR)$(bindir)
ranlib $(DESTDIR)$(libdir)/libx264.a
$(RANLIB) $(DESTDIR)$(libdir)/libx264.a
ifeq ($(SYS),MINGW)
$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir))
else
......
......@@ -710,7 +710,7 @@ void *x264_malloc( int i_size )
buf = (uint8_t *) malloc( i_size + 15 + sizeof( void ** ) +
sizeof( int ) );
align_buf = buf + 15 + sizeof( void ** ) + sizeof( int );
align_buf -= (long) align_buf & 15;
align_buf -= (intptr_t) align_buf & 15;
*( (void **) ( align_buf - sizeof( void ** ) ) ) = buf;
*( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size;
return align_buf;
......
......@@ -99,7 +99,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
{
int64_t i_ssd = 0;
int x, y;
int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15);
int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);
#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
pix2 + y*i_pix2 + x, i_pix2 );
......
......@@ -32,7 +32,10 @@ cextern x264_cabac_transition
cextern x264_cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
%ifdef ARCH_X86_64
%ifdef WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6,10
%define pointer resq
%elifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,10
%define pointer resq
%else
......@@ -67,10 +70,10 @@ endstruc
%endmacro
cglobal x264_cabac_encode_decision_asm, 0,7
movifnidn t0d, r0m
movifnidn t0, r0mp
movifnidn t1d, r1m
mov t5d, [r0+cb.range]
movzx t3d, byte [r0+cb.state+t1]
mov t5d, [t0+cb.range]
movzx t3d, byte [t0+cb.state+t1]
mov t4d, t5d
shr t5d, 6
and t5d, 3
......@@ -80,23 +83,23 @@ cglobal x264_cabac_encode_decision_asm, 0,7
shr t6d, 6
movifnidn t2d, r2m
cmp t6d, t2d
mov t6d, [r0+cb.low]
mov t6d, [t0+cb.low]
lea t7, [t6+t4]
cmovne t4d, t5d
cmovne t6d, t7d
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
movifnidn t1d, r1m
mov [r0+cb.state+t1], t3b
mov [t0+cb.state+t1], t3b
.renorm:
mov t3d, t4d
shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [r0+cb.queue]
mov [r0+cb.range], t4d
mov [r0+cb.low], t6d
mov [r0+cb.queue], t3d
add t3d, [t0+cb.queue]
mov [t0+cb.range], t4d
mov [t0+cb.low], t6d
mov [t0+cb.queue], t3d
cmp t3d, 8
jge .putbyte
REP_RET
......@@ -111,12 +114,12 @@ cglobal x264_cabac_encode_decision_asm, 0,7
sub t3d, 10
and t6d, t1d
cmp t2b, 0xff ; FIXME is a 32bit op faster?
mov [r0+cb.queue], t3d
mov [r0+cb.low], t6d
mov [t0+cb.queue], t3d
mov [t0+cb.low], t6d
mov t1d, t2d
mov t4, [r0+cb.p]
mov t4, [t0+cb.p]
je .postpone
mov t5d, [r0+cb.bytes_outstanding]
mov t5d, [t0+cb.bytes_outstanding]
shr t1d, 8 ; carry
add [t4-1], t1b
test t5d, t5d
......@@ -130,10 +133,10 @@ cglobal x264_cabac_encode_decision_asm, 0,7
.no_outstanding:
mov [t4], t2b
inc t4
mov [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
mov [r0+cb.p], t4
mov [t0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
mov [t0+cb.p], t4
RET
.postpone:
inc dword [r0+cb.bytes_outstanding]
inc dword [t0+cb.bytes_outstanding]
RET
......@@ -27,22 +27,24 @@
SECTION .text
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid
cglobal x264_cpu_cpuid, 5,7
push rbx
mov r10, r3
mov r11, r2
mov r9, r1
mov r11, r1
mov r10, r2
movifnidn r9, r3
movifnidn r8, r4
mov eax, r0d
cpuid
mov [r9], eax
mov [r11], ebx
mov [r10], ecx
mov [r11], eax
mov [r10], ebx
mov [r9], ecx
mov [r8], edx
pop rbx
ret
RET
%else
......@@ -102,6 +104,7 @@ cglobal x264_stack_align
call ecx
leave
ret
%endif
;-----------------------------------------------------------------------------
......
......@@ -189,7 +189,7 @@ dct8_mmx:
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3
global x264_sub8x8_dct8_mmx %+ .skip_prologue
global x264_sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
call load_diff_4x8_mmx
......@@ -255,7 +255,7 @@ idct8_mmx:
; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_mmx, 2,2
global x264_add8x8_idct8_mmx %+ .skip_prologue
global x264_add8x8_idct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
add word [r1], 32
......@@ -348,7 +348,7 @@ INIT_XMM
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2, 3,3
global x264_sub8x8_dct8_sse2 %+ .skip_prologue
global x264_sub8x8_dct8_sse2.skip_prologue
.skip_prologue:
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
......@@ -372,7 +372,7 @@ global x264_sub8x8_dct8_sse2 %+ .skip_prologue
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2
global x264_add8x8_idct8_sse2 %+ .skip_prologue
global x264_add8x8_idct8_sse2.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
......
......@@ -86,7 +86,7 @@ INIT_XMM
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2
cglobal x264_sub8x8_dct8_sse2, 3,3,10
LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
......@@ -108,7 +108,7 @@ cglobal x264_sub8x8_dct8_sse2
movdqa [r0+0x50], m5
movdqa [r0+0x60], m6
movdqa [r0+0x70], m7
ret
RET
%macro IDCT8_1D 10
......@@ -167,7 +167,7 @@ cglobal x264_sub8x8_dct8_sse2
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2
cglobal x264_add8x8_idct8_sse2, 2,2,10
movdqa m0, [r1+0x00]
movdqa m1, [r1+0x10]
movdqa m2, [r1+0x20]
......@@ -191,6 +191,6 @@ cglobal x264_add8x8_idct8_sse2
STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
ret
RET
......@@ -155,12 +155,16 @@ cglobal x264_add4x4_idct_mmx, 2,2
INIT_XMM
cglobal x264_sub8x8_dct_sse2, 3,3
cglobal x264_sub8x8_dct_sse2, 3,3,8
.skip_prologue:
call .8x4
add r0, 64
add r1, 4*FENC_STRIDE
add r2, 4*FDEC_STRIDE
%ifdef WIN64
call .8x4
RET
%endif
.8x4:
SUB_DCT4 2x4x4W
movhps [r0+32], m0
......@@ -169,11 +173,15 @@ cglobal x264_sub8x8_dct_sse2, 3,3
movhps [r0+56], m3
ret
cglobal x264_add8x8_idct_sse2, 2,2
cglobal x264_add8x8_idct_sse2, 2,2,8
.skip_prologue:
call .8x4
add r1, 64
add r0, 4*FDEC_STRIDE
%ifdef WIN64
call .8x4
RET
%endif
.8x4:
movq m0, [r1+ 0]
movq m1, [r1+ 8]
......@@ -192,6 +200,9 @@ cglobal x264_add8x8_idct_sse2, 2,2
%macro SUB_NxN_DCT 6
cglobal %1, 3,3
.skip_prologue:
%ifdef WIN64
sub rsp, 8
%endif
call %2
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
......@@ -204,6 +215,9 @@ cglobal %1, 3,3
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
add rsp, 8
%endif
jmp %2
%endmacro
......@@ -213,6 +227,9 @@ cglobal %1, 3,3
%macro ADD_NxN_IDCT 6
cglobal %1, 2,2
.skip_prologue:
%ifdef WIN64
sub rsp, 8
%endif
call %2
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
......@@ -222,25 +239,30 @@ cglobal %1, 2,2
call %2
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
%ifdef WIN64
add rsp, 8
%endif
jmp %2
%endmacro
%ifndef ARCH_X86_64
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
%define x264_sub8x8_dct_sse2 x264_sub8x8_dct_sse2.skip_prologue
%define x264_add8x8_idct_sse2 x264_add8x8_idct_sse2.skip_prologue
%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif
SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2, 64, 8, 0, 4
cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
......@@ -286,7 +308,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
punpcklbw mm1, mm1
ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
ADD_DC mm2, mm3, r0
ret
RET
cglobal x264_add8x8_idct_dc_ssse3, 2,2
movq xmm0, [r1]
......@@ -324,7 +346,7 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
movhps [r0+FDEC_STRIDE* 1], xmm3
movhps [r0+FDEC_STRIDE* 2], xmm4
movhps [r0+FDEC_STRIDE* 3], xmm5
ret
RET
cglobal x264_add16x16_idct_dc_mmx, 2,3
mov r2, 4
......@@ -348,7 +370,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
add r0, FDEC_STRIDE*4
dec r2
jg .loop
ret
REP_RET
%macro IDCT_DC_STORE 3
movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
......@@ -369,9 +391,13 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
cglobal x264_add16x16_idct_dc_sse2, 2,2
cglobal x264_add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
call .loop
RET
%endif
.loop:
add r0, FDEC_STRIDE*4
movq xmm0, [r1+0]
......@@ -399,9 +425,13 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2
IDCT_DC_STORE 0, xmm2, xmm3
ret
cglobal x264_add16x16_idct_dc_ssse3, 2,2
cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
call .loop
RET
%endif
.loop:
add r0, FDEC_STRIDE*4
movdqa xmm0, [r1]
......@@ -428,7 +458,7 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 1
cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdq2q mm0, xmm0
......@@ -703,7 +733,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
movd xmm2, [r1+2*FENC_STRIDE]
......
......@@ -278,7 +278,7 @@ SECTION .text
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2
cglobal x264_deblock_v_luma_sse2, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
......@@ -318,54 +318,66 @@ cglobal x264_deblock_v_luma_sse2
DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
ret
RET
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2
movsxd r10, esi
cglobal x264_deblock_h_luma_sse2, 5,7
movsxd r10, r1d
lea r11, [r10+r10*2]
lea rax, [r0-4]
lea r9, [r0-4+r11]
lea r6, [r0-4]
lea r5, [r0-4+r11]
%ifdef WIN64
sub rsp, 0x98
%define pix_tmp rsp+0x30
%else
sub rsp, 0x68
%define pix_tmp rsp
%endif
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
lea rax, [rax+r10*8]
lea r9, [r9 +r10*8]
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
lea r6, [r6+r10*8]
lea r5, [r5+r10*8]
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov esi, 0x10
mov r1d, 0x10
%ifdef WIN64
mov [rsp+0x20], r4
%endif
call x264_deblock_v_luma_sse2
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add rax, 2
add r9, 2
add r6, 2
add r5, 2
movq m0, [pix_tmp+0x18]
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
sub rax, r10
sub r9, r10
sub r6, r10
sub r5, r10
shr r10, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
%ifdef WIN64
add rsp, 0x98
%else
add rsp, 0x68
ret
%endif
RET
%else
......@@ -388,7 +400,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
mova m3, [r0+r1] ; q1
LOAD_MASK r2, r3
mov r3, r4m
mov r3, r4mp
movd m4, [r3] ; tc0
punpcklbw m4, m4
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
......@@ -428,7 +440,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
mov r0, r0m
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
sub r0, 4
......@@ -459,7 +471,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
ADD esp, 20
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
mov r0, r0m
mov r0, r0mp
sub r0, 2
lea r1, [r0+r4]
......@@ -609,7 +621,7 @@ DEBLOCK_LUMA sse2, v, 16
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
......@@ -671,34 +683,34 @@ INIT_MMX
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1
cglobal x264_deblock_h_luma_intra_%1, 4,7
movsxd r10, r1d
lea r11, [r10*3]
lea rax, [r0-4]
lea r9, [r0-4+r11]
lea r6, [r0-4]
lea r5, [r0-4+r11]
sub rsp, 0x88
%define pix_tmp rsp
; transpose 8x16 -> tmp space
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea rax, [rax+r10*8]
lea r9, [r9+r10*8]
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea r6, [r6+r10*8]
lea r5, [r5+r10*8]
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
lea r0, [pix_tmp+0x40]
mov r1, 0x10
call x264_deblock_v_luma_intra_%1
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea r9, [rax+r11]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
lea r5, [r6+r11]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
sub rax, r10
sub r9, r10
sub r6, r10
sub r5, r10
shr r10, 3
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
add rsp, 0x88
ret
RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
lea r3, [r1*3]
......@@ -727,7 +739,7 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
ADD esp, 16
mov r1, r1m
mov r0, r0m
mov r0, r0mp
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
......
......@@ -42,14 +42,17 @@ SECTION .text
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,10,11
%macro AVG_START 0
PROLOGUE 6,7
%macro AVG_START 0-1 0
PROLOGUE 6,7,%1
%ifdef WIN64
movsxd r5, r5d
%endif
.height_loop:
%endmacro
%else
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%macro AVG_START 0
PROLOGUE 0,7
%macro AVG_START 0-1 0
PROLOGUE 0,7,%1
mov t0, r0m
mov t1, r1m
mov t2, r2m
......@@ -72,30 +75,30 @@ SECTION .text
%macro BIWEIGHT_MMX 2
movh m0, %1
movh m1, %2
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4
pmullw m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, m2
pmullw m1, m3
paddw m0, m1
paddw m0, m6
paddw m0, m4
psraw m0, 6
%endmacro
%macro BIWEIGHT_START_MMX 0
movd m4, r6m
SPLATW m4, m4 ; weight_dst
mova m5, [pw_64 GLOBAL]
psubw m5, m4 ; weight_src
mova m6, [pw_32 GLOBAL] ; rounding
pxor m7, m7
movd m2, r6m
SPLATW m2, m2 ; weight_dst
mova m3, [pw_64 GLOBAL]
psubw m3, m2 ; weight_src
mova m4, [pw_32 GLOBAL] ; rounding
pxor m5, m5
%endmacro
%macro BIWEIGHT_SSSE3 2
movh m0, %1
movh m1, %2
punpcklbw m0, m1
pmaddubsw m0, m5
paddw m0, m6
pmaddubsw m0, m3
paddw m0, m4
psraw m0, 6
%endmacro
......@@ -105,9 +108,9 @@ SECTION .text
sub t7d, t6d
shl t7d, 8
add t6d, t7d
movd m5, t6d
mova m6, [pw_32 GLOBAL]
SPLATW m5, m5 ; weight_dst,src
movd m3, t6d
mova m4, [pw_32 GLOBAL]
SPLATW m3, m3 ; weight_dst,src
%endmacro
%macro BIWEIGHT_ROW 4
......@@ -116,27 +119,27 @@ SECTION .text
packuswb m0, m0
movh [%1], m0
%else
SWAP 0, 2
SWAP 0, 6
BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
packuswb m2, m0
mova [%1], m2
packuswb m6, m0
mova [%1], m6
%endif
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 2
cglobal x264_pixel_avg_weight_w%2_%1, 0,0
%macro AVG_WEIGHT 2-3 0
cglobal x264_pixel_avg_weight_w%2_%1
BIWEIGHT_START
AVG_START
AVG_START %3
%if %2==8 && mmsize==16
BIWEIGHT [t2], [t4]
SWAP 0, 2
SWAP 0, 6
BIWEIGHT [t2+t3], [t4+t5]
packuswb m2, m0
movlps [t0], m2
movhps [t0+t1], m2
packuswb m6, m0
movlps [t0], m6
movhps [t0+t1], m6
%else
%assign x 0
%rep 1+%2/(mmsize*2)
......@@ -161,15 +164,15 @@ AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
%define x264_pixel_avg_weight_w4_sse2