Commit c6e72b86 authored by Anton Mitrofanov, committed by Fiona Glaser

Windows 64-bit support

A "make distclean" is probably required after updating to this revision.
parent ef48e51d
......@@ -74,8 +74,8 @@ DEP = depend
default: $(DEP) x264$(EXE)
libx264.a: .depend $(OBJS) $(OBJASM)
ar rc libx264.a $(OBJS) $(OBJASM)
ranlib libx264.a
$(AR) rc libx264.a $(OBJS) $(OBJASM)
$(RANLIB) libx264.a
$(SONAME): .depend $(OBJS) $(OBJASM)
$(CC) -shared -o $@ $(OBJS) $(OBJASM) $(SOFLAGS) $(LDFLAGS)
......@@ -89,7 +89,7 @@ checkasm: tools/checkasm.o libx264.a
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
# delete local/anonymous symbols, so they don't show up in oprofile
-@ strip -x $@
-@ $(STRIP) -x $@
.depend: config.mak
rm -f .depend
......@@ -135,7 +135,7 @@ endif
clean:
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS
rm -f checkasm checkasm.exe tools/checkasm.o
rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno)
- sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak
......@@ -150,7 +150,7 @@ install: x264$(EXE) $(SONAME)
install -m 644 libx264.a $(DESTDIR)$(libdir)
install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
install x264$(EXE) $(DESTDIR)$(bindir)
ranlib $(DESTDIR)$(libdir)/libx264.a
$(RANLIB) $(DESTDIR)$(libdir)/libx264.a
ifeq ($(SYS),MINGW)
$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir))
else
......
......@@ -710,7 +710,7 @@ void *x264_malloc( int i_size )
buf = (uint8_t *) malloc( i_size + 15 + sizeof( void ** ) +
sizeof( int ) );
align_buf = buf + 15 + sizeof( void ** ) + sizeof( int );
align_buf -= (long) align_buf & 15;
align_buf -= (intptr_t) align_buf & 15;
*( (void **) ( align_buf - sizeof( void ** ) ) ) = buf;
*( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size;
return align_buf;
......
......@@ -99,7 +99,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
{
int64_t i_ssd = 0;
int x, y;
int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15);
int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);
#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
pix2 + y*i_pix2 + x, i_pix2 );
......
......@@ -32,7 +32,10 @@ cextern x264_cabac_transition
cextern x264_cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
%ifdef ARCH_X86_64
%ifdef WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6,10
%define pointer resq
%elifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,10
%define pointer resq
%else
......@@ -67,10 +70,10 @@ endstruc
%endmacro
cglobal x264_cabac_encode_decision_asm, 0,7
movifnidn t0d, r0m
movifnidn t0, r0mp
movifnidn t1d, r1m
mov t5d, [r0+cb.range]
movzx t3d, byte [r0+cb.state+t1]
mov t5d, [t0+cb.range]
movzx t3d, byte [t0+cb.state+t1]
mov t4d, t5d
shr t5d, 6
and t5d, 3
......@@ -80,23 +83,23 @@ cglobal x264_cabac_encode_decision_asm, 0,7
shr t6d, 6
movifnidn t2d, r2m
cmp t6d, t2d
mov t6d, [r0+cb.low]
mov t6d, [t0+cb.low]
lea t7, [t6+t4]
cmovne t4d, t5d
cmovne t6d, t7d
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
movifnidn t1d, r1m
mov [r0+cb.state+t1], t3b
mov [t0+cb.state+t1], t3b
.renorm:
mov t3d, t4d
shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [r0+cb.queue]
mov [r0+cb.range], t4d
mov [r0+cb.low], t6d
mov [r0+cb.queue], t3d
add t3d, [t0+cb.queue]
mov [t0+cb.range], t4d
mov [t0+cb.low], t6d
mov [t0+cb.queue], t3d
cmp t3d, 8
jge .putbyte
REP_RET
......@@ -111,12 +114,12 @@ cglobal x264_cabac_encode_decision_asm, 0,7
sub t3d, 10
and t6d, t1d
cmp t2b, 0xff ; FIXME is a 32bit op faster?
mov [r0+cb.queue], t3d
mov [r0+cb.low], t6d
mov [t0+cb.queue], t3d
mov [t0+cb.low], t6d
mov t1d, t2d
mov t4, [r0+cb.p]
mov t4, [t0+cb.p]
je .postpone
mov t5d, [r0+cb.bytes_outstanding]
mov t5d, [t0+cb.bytes_outstanding]
shr t1d, 8 ; carry
add [t4-1], t1b
test t5d, t5d
......@@ -130,10 +133,10 @@ cglobal x264_cabac_encode_decision_asm, 0,7
.no_outstanding:
mov [t4], t2b
inc t4
mov [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
mov [r0+cb.p], t4
mov [t0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
mov [t0+cb.p], t4
RET
.postpone:
inc dword [r0+cb.bytes_outstanding]
inc dword [t0+cb.bytes_outstanding]
RET
......@@ -27,22 +27,24 @@
SECTION .text
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid
cglobal x264_cpu_cpuid, 5,7
push rbx
mov r10, r3
mov r11, r2
mov r9, r1
mov r11, r1
mov r10, r2
movifnidn r9, r3
movifnidn r8, r4
mov eax, r0d
cpuid
mov [r9], eax
mov [r11], ebx
mov [r10], ecx
mov [r11], eax
mov [r10], ebx
mov [r9], ecx
mov [r8], edx
pop rbx
ret
RET
%else
......@@ -102,6 +104,7 @@ cglobal x264_stack_align
call ecx
leave
ret
%endif
;-----------------------------------------------------------------------------
......
......@@ -189,7 +189,7 @@ dct8_mmx:
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3
global x264_sub8x8_dct8_mmx %+ .skip_prologue
global x264_sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
call load_diff_4x8_mmx
......@@ -255,7 +255,7 @@ idct8_mmx:
; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_mmx, 2,2
global x264_add8x8_idct8_mmx %+ .skip_prologue
global x264_add8x8_idct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
add word [r1], 32
......@@ -348,7 +348,7 @@ INIT_XMM
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2, 3,3
global x264_sub8x8_dct8_sse2 %+ .skip_prologue
global x264_sub8x8_dct8_sse2.skip_prologue
.skip_prologue:
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
......@@ -372,7 +372,7 @@ global x264_sub8x8_dct8_sse2 %+ .skip_prologue
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2
global x264_add8x8_idct8_sse2 %+ .skip_prologue
global x264_add8x8_idct8_sse2.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
......
......@@ -86,7 +86,7 @@ INIT_XMM
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2
cglobal x264_sub8x8_dct8_sse2, 3,3,10
LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
......@@ -108,7 +108,7 @@ cglobal x264_sub8x8_dct8_sse2
movdqa [r0+0x50], m5
movdqa [r0+0x60], m6
movdqa [r0+0x70], m7
ret
RET
%macro IDCT8_1D 10
......@@ -167,7 +167,7 @@ cglobal x264_sub8x8_dct8_sse2
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2
cglobal x264_add8x8_idct8_sse2, 2,2,10
movdqa m0, [r1+0x00]
movdqa m1, [r1+0x10]
movdqa m2, [r1+0x20]
......@@ -191,6 +191,6 @@ cglobal x264_add8x8_idct8_sse2
STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
ret
RET
......@@ -155,12 +155,16 @@ cglobal x264_add4x4_idct_mmx, 2,2
INIT_XMM
cglobal x264_sub8x8_dct_sse2, 3,3
cglobal x264_sub8x8_dct_sse2, 3,3,8
.skip_prologue:
call .8x4
add r0, 64
add r1, 4*FENC_STRIDE
add r2, 4*FDEC_STRIDE
%ifdef WIN64
call .8x4
RET
%endif
.8x4:
SUB_DCT4 2x4x4W
movhps [r0+32], m0
......@@ -169,11 +173,15 @@ cglobal x264_sub8x8_dct_sse2, 3,3
movhps [r0+56], m3
ret
cglobal x264_add8x8_idct_sse2, 2,2
cglobal x264_add8x8_idct_sse2, 2,2,8
.skip_prologue:
call .8x4
add r1, 64
add r0, 4*FDEC_STRIDE
%ifdef WIN64
call .8x4
RET
%endif
.8x4:
movq m0, [r1+ 0]
movq m1, [r1+ 8]
......@@ -192,6 +200,9 @@ cglobal x264_add8x8_idct_sse2, 2,2
%macro SUB_NxN_DCT 6
cglobal %1, 3,3
.skip_prologue:
%ifdef WIN64
sub rsp, 8
%endif
call %2
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
......@@ -204,6 +215,9 @@ cglobal %1, 3,3
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
add rsp, 8
%endif
jmp %2
%endmacro
......@@ -213,6 +227,9 @@ cglobal %1, 3,3
%macro ADD_NxN_IDCT 6
cglobal %1, 2,2
.skip_prologue:
%ifdef WIN64
sub rsp, 8
%endif
call %2
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
......@@ -222,25 +239,30 @@ cglobal %1, 2,2
call %2
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
%ifdef WIN64
add rsp, 8
%endif
jmp %2
%endmacro
%ifndef ARCH_X86_64
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
%define x264_sub8x8_dct_sse2 x264_sub8x8_dct_sse2.skip_prologue
%define x264_add8x8_idct_sse2 x264_add8x8_idct_sse2.skip_prologue
%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif
SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2, 64, 8, 0, 4
cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
......@@ -286,7 +308,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
punpcklbw mm1, mm1
ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
ADD_DC mm2, mm3, r0
ret
RET
cglobal x264_add8x8_idct_dc_ssse3, 2,2
movq xmm0, [r1]
......@@ -324,7 +346,7 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
movhps [r0+FDEC_STRIDE* 1], xmm3
movhps [r0+FDEC_STRIDE* 2], xmm4
movhps [r0+FDEC_STRIDE* 3], xmm5
ret
RET
cglobal x264_add16x16_idct_dc_mmx, 2,3
mov r2, 4
......@@ -348,7 +370,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
add r0, FDEC_STRIDE*4
dec r2
jg .loop
ret
REP_RET
%macro IDCT_DC_STORE 3
movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
......@@ -369,9 +391,13 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
cglobal x264_add16x16_idct_dc_sse2, 2,2
cglobal x264_add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
call .loop
RET
%endif
.loop:
add r0, FDEC_STRIDE*4
movq xmm0, [r1+0]
......@@ -399,9 +425,13 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2
IDCT_DC_STORE 0, xmm2, xmm3
ret
cglobal x264_add16x16_idct_dc_ssse3, 2,2
cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
call .loop
RET
%endif
.loop:
add r0, FDEC_STRIDE*4
movdqa xmm0, [r1]
......@@ -428,7 +458,7 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 1
cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdq2q mm0, xmm0
......@@ -703,7 +733,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
movd xmm2, [r1+2*FENC_STRIDE]
......
......@@ -278,7 +278,7 @@ SECTION .text
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2
cglobal x264_deblock_v_luma_sse2, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
......@@ -318,54 +318,66 @@ cglobal x264_deblock_v_luma_sse2
DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
ret
RET
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2
movsxd r10, esi
cglobal x264_deblock_h_luma_sse2, 5,7
movsxd r10, r1d
lea r11, [r10+r10*2]
lea rax, [r0-4]
lea r9, [r0-4+r11]
lea r6, [r0-4]
lea r5, [r0-4+r11]
%ifdef WIN64
sub rsp, 0x98
%define pix_tmp rsp+0x30
%else
sub rsp, 0x68
%define pix_tmp rsp
%endif
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
lea rax, [rax+r10*8]
lea r9, [r9 +r10*8]
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
lea r6, [r6+r10*8]
lea r5, [r5+r10*8]
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov esi, 0x10
mov r1d, 0x10
%ifdef WIN64
mov [rsp+0x20], r4
%endif
call x264_deblock_v_luma_sse2
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add rax, 2
add r9, 2
add r6, 2
add r5, 2
movq m0, [pix_tmp+0x18]
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
sub rax, r10
sub r9, r10
sub r6, r10
sub r5, r10
shr r10, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
%ifdef WIN64
add rsp, 0x98
%else
add rsp, 0x68
ret
%endif
RET
%else
......@@ -388,7 +400,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
mova m3, [r0+r1] ; q1
LOAD_MASK r2, r3
mov r3, r4m
mov r3, r4mp
movd m4, [r3] ; tc0
punpcklbw m4, m4
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
......@@ -428,7 +440,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
mov r0, r0m
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
sub r0, 4
......@@ -459,7 +471,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
ADD esp, 20
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
mov r0, r0m
mov r0, r0mp
sub r0, 2
lea r1, [r0+r4]
......@@ -609,7 +621,7 @@ DEBLOCK_LUMA sse2, v, 16
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
......@@ -671,34 +683,34 @@ INIT_MMX
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1
cglobal x264_deblock_h_luma_intra_%1, 4,7
movsxd r10, r1d
lea r11, [r10*3]
lea rax, [r0-4]
lea r9, [r0-4+r11]
lea r6, [r0-4]
lea r5, [r0-4+r11]
sub rsp, 0x88
%define pix_tmp rsp
; transpose 8x16 -> tmp space
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea rax, [rax+r10*8]
lea r9, [r9+r10*8]
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea r6, [r6+r10*8]
lea r5, [r5+r10*8]
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
lea r0, [pix_tmp+0x40]
mov r1, 0x10
call x264_deblock_v_luma_intra_%1
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea r9, [rax+r11]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
lea r5, [r6+r11]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
sub rax, r10
sub r9, r10
sub r6, r10
sub r5, r10
shr r10, 3
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
add rsp, 0x88
ret
RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
lea r3, [r1*3]
......@@ -727,7 +739,7 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
ADD esp, 16
mov r1, r1m
mov r0, r0m
mov r0, r0mp
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
......
......@@ -42,14 +42,17 @@ SECTION .text
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,10,11
%macro AVG_START 0
PROLOGUE 6,7
%macro AVG_START 0-1 0
PROLOGUE 6,7,%1
%ifdef WIN64
movsxd r5, r5d
%endif
.height_loop:
%endmacro
%else
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%macro AVG_START 0
PROLOGUE 0,7
%macro AVG_START 0-1 0
PROLOGUE 0,7,%1
mov t0, r0m
mov t1, r1m
mov t2, r2m
......@@ -72,30 +75,30 @@ SECTION .text
%macro BIWEIGHT_MMX 2
movh m0, %1
movh m1, %2
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4
pmullw m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, m2