Commit c6e72b86, authored by Anton Mitrofanov, committed by Fiona Glaser

Windows 64-bit support

A "make distclean" is probably required after updating to this revision.
parent ef48e51d
...@@ -74,8 +74,8 @@ DEP = depend ...@@ -74,8 +74,8 @@ DEP = depend
default: $(DEP) x264$(EXE) default: $(DEP) x264$(EXE)
libx264.a: .depend $(OBJS) $(OBJASM) libx264.a: .depend $(OBJS) $(OBJASM)
ar rc libx264.a $(OBJS) $(OBJASM) $(AR) rc libx264.a $(OBJS) $(OBJASM)
ranlib libx264.a $(RANLIB) libx264.a
$(SONAME): .depend $(OBJS) $(OBJASM) $(SONAME): .depend $(OBJS) $(OBJASM)
$(CC) -shared -o $@ $(OBJS) $(OBJASM) $(SOFLAGS) $(LDFLAGS) $(CC) -shared -o $@ $(OBJS) $(OBJASM) $(SOFLAGS) $(LDFLAGS)
...@@ -89,7 +89,7 @@ checkasm: tools/checkasm.o libx264.a ...@@ -89,7 +89,7 @@ checkasm: tools/checkasm.o libx264.a
%.o: %.asm %.o: %.asm
$(AS) $(ASFLAGS) -o $@ $< $(AS) $(ASFLAGS) -o $@ $<
# delete local/anonymous symbols, so they don't show up in oprofile # delete local/anonymous symbols, so they don't show up in oprofile
-@ strip -x $@ -@ $(STRIP) -x $@
.depend: config.mak .depend: config.mak
rm -f .depend rm -f .depend
...@@ -135,7 +135,7 @@ endif ...@@ -135,7 +135,7 @@ endif
clean: clean:
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS
rm -f checkasm checkasm.exe tools/checkasm.o rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno)
- sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak - sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak
...@@ -150,7 +150,7 @@ install: x264$(EXE) $(SONAME) ...@@ -150,7 +150,7 @@ install: x264$(EXE) $(SONAME)
install -m 644 libx264.a $(DESTDIR)$(libdir) install -m 644 libx264.a $(DESTDIR)$(libdir)
install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
install x264$(EXE) $(DESTDIR)$(bindir) install x264$(EXE) $(DESTDIR)$(bindir)
ranlib $(DESTDIR)$(libdir)/libx264.a $(RANLIB) $(DESTDIR)$(libdir)/libx264.a
ifeq ($(SYS),MINGW) ifeq ($(SYS),MINGW)
$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir)) $(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir))
else else
......
...@@ -710,7 +710,7 @@ void *x264_malloc( int i_size ) ...@@ -710,7 +710,7 @@ void *x264_malloc( int i_size )
buf = (uint8_t *) malloc( i_size + 15 + sizeof( void ** ) + buf = (uint8_t *) malloc( i_size + 15 + sizeof( void ** ) +
sizeof( int ) ); sizeof( int ) );
align_buf = buf + 15 + sizeof( void ** ) + sizeof( int ); align_buf = buf + 15 + sizeof( void ** ) + sizeof( int );
align_buf -= (long) align_buf & 15; align_buf -= (intptr_t) align_buf & 15;
*( (void **) ( align_buf - sizeof( void ** ) ) ) = buf; *( (void **) ( align_buf - sizeof( void ** ) ) ) = buf;
*( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size; *( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size;
return align_buf; return align_buf;
......
...@@ -99,7 +99,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1 ...@@ -99,7 +99,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
{ {
int64_t i_ssd = 0; int64_t i_ssd = 0;
int x, y; int x, y;
int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15); int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);
#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \ #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
pix2 + y*i_pix2 + x, i_pix2 ); pix2 + y*i_pix2 + x, i_pix2 );
......
...@@ -32,7 +32,10 @@ cextern x264_cabac_transition ...@@ -32,7 +32,10 @@ cextern x264_cabac_transition
cextern x264_cabac_renorm_shift cextern x264_cabac_renorm_shift
; t3 must be ecx, since it's used for shift. ; t3 must be ecx, since it's used for shift.
%ifdef ARCH_X86_64 %ifdef WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6,10
%define pointer resq
%elifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,10 DECLARE_REG_TMP 0,1,2,3,4,5,6,10
%define pointer resq %define pointer resq
%else %else
...@@ -67,10 +70,10 @@ endstruc ...@@ -67,10 +70,10 @@ endstruc
%endmacro %endmacro
cglobal x264_cabac_encode_decision_asm, 0,7 cglobal x264_cabac_encode_decision_asm, 0,7
movifnidn t0d, r0m movifnidn t0, r0mp
movifnidn t1d, r1m movifnidn t1d, r1m
mov t5d, [r0+cb.range] mov t5d, [t0+cb.range]
movzx t3d, byte [r0+cb.state+t1] movzx t3d, byte [t0+cb.state+t1]
mov t4d, t5d mov t4d, t5d
shr t5d, 6 shr t5d, 6
and t5d, 3 and t5d, 3
...@@ -80,23 +83,23 @@ cglobal x264_cabac_encode_decision_asm, 0,7 ...@@ -80,23 +83,23 @@ cglobal x264_cabac_encode_decision_asm, 0,7
shr t6d, 6 shr t6d, 6
movifnidn t2d, r2m movifnidn t2d, r2m
cmp t6d, t2d cmp t6d, t2d
mov t6d, [r0+cb.low] mov t6d, [t0+cb.low]
lea t7, [t6+t4] lea t7, [t6+t4]
cmovne t4d, t5d cmovne t4d, t5d
cmovne t6d, t7d cmovne t6d, t7d
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2 LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
movifnidn t1d, r1m movifnidn t1d, r1m
mov [r0+cb.state+t1], t3b mov [t0+cb.state+t1], t3b
.renorm: .renorm:
mov t3d, t4d mov t3d, t4d
shr t3d, 3 shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3 LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
shl t4d, t3b shl t4d, t3b
shl t6d, t3b shl t6d, t3b
add t3d, [r0+cb.queue] add t3d, [t0+cb.queue]
mov [r0+cb.range], t4d mov [t0+cb.range], t4d
mov [r0+cb.low], t6d mov [t0+cb.low], t6d
mov [r0+cb.queue], t3d mov [t0+cb.queue], t3d
cmp t3d, 8 cmp t3d, 8
jge .putbyte jge .putbyte
REP_RET REP_RET
...@@ -111,12 +114,12 @@ cglobal x264_cabac_encode_decision_asm, 0,7 ...@@ -111,12 +114,12 @@ cglobal x264_cabac_encode_decision_asm, 0,7
sub t3d, 10 sub t3d, 10
and t6d, t1d and t6d, t1d
cmp t2b, 0xff ; FIXME is a 32bit op faster? cmp t2b, 0xff ; FIXME is a 32bit op faster?
mov [r0+cb.queue], t3d mov [t0+cb.queue], t3d
mov [r0+cb.low], t6d mov [t0+cb.low], t6d
mov t1d, t2d mov t1d, t2d
mov t4, [r0+cb.p] mov t4, [t0+cb.p]
je .postpone je .postpone
mov t5d, [r0+cb.bytes_outstanding] mov t5d, [t0+cb.bytes_outstanding]
shr t1d, 8 ; carry shr t1d, 8 ; carry
add [t4-1], t1b add [t4-1], t1b
test t5d, t5d test t5d, t5d
...@@ -130,10 +133,10 @@ cglobal x264_cabac_encode_decision_asm, 0,7 ...@@ -130,10 +133,10 @@ cglobal x264_cabac_encode_decision_asm, 0,7
.no_outstanding: .no_outstanding:
mov [t4], t2b mov [t4], t2b
inc t4 inc t4
mov [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate mov [t0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
mov [r0+cb.p], t4 mov [t0+cb.p], t4
RET RET
.postpone: .postpone:
inc dword [r0+cb.bytes_outstanding] inc dword [t0+cb.bytes_outstanding]
RET RET
...@@ -27,22 +27,24 @@ ...@@ -27,22 +27,24 @@
SECTION .text SECTION .text
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) ; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid cglobal x264_cpu_cpuid, 5,7
push rbx push rbx
mov r10, r3 mov r11, r1
mov r11, r2 mov r10, r2
mov r9, r1 movifnidn r9, r3
movifnidn r8, r4
mov eax, r0d mov eax, r0d
cpuid cpuid
mov [r9], eax mov [r11], eax
mov [r11], ebx mov [r10], ebx
mov [r10], ecx mov [r9], ecx
mov [r8], edx mov [r8], edx
pop rbx pop rbx
ret RET
%else %else
...@@ -102,6 +104,7 @@ cglobal x264_stack_align ...@@ -102,6 +104,7 @@ cglobal x264_stack_align
call ecx call ecx
leave leave
ret ret
%endif %endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
......
...@@ -189,7 +189,7 @@ dct8_mmx: ...@@ -189,7 +189,7 @@ dct8_mmx:
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3 cglobal x264_sub8x8_dct8_mmx, 3,3
global x264_sub8x8_dct8_mmx %+ .skip_prologue global x264_sub8x8_dct8_mmx.skip_prologue
.skip_prologue: .skip_prologue:
INIT_MMX INIT_MMX
call load_diff_4x8_mmx call load_diff_4x8_mmx
...@@ -255,7 +255,7 @@ idct8_mmx: ...@@ -255,7 +255,7 @@ idct8_mmx:
; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] ) ; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_mmx, 2,2 cglobal x264_add8x8_idct8_mmx, 2,2
global x264_add8x8_idct8_mmx %+ .skip_prologue global x264_add8x8_idct8_mmx.skip_prologue
.skip_prologue: .skip_prologue:
INIT_MMX INIT_MMX
add word [r1], 32 add word [r1], 32
...@@ -348,7 +348,7 @@ INIT_XMM ...@@ -348,7 +348,7 @@ INIT_XMM
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2, 3,3 cglobal x264_sub8x8_dct8_sse2, 3,3
global x264_sub8x8_dct8_sse2 %+ .skip_prologue global x264_sub8x8_dct8_sse2.skip_prologue
.skip_prologue: .skip_prologue:
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
...@@ -372,7 +372,7 @@ global x264_sub8x8_dct8_sse2 %+ .skip_prologue ...@@ -372,7 +372,7 @@ global x264_sub8x8_dct8_sse2 %+ .skip_prologue
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2 cglobal x264_add8x8_idct8_sse2, 2,2
global x264_add8x8_idct8_sse2 %+ .skip_prologue global x264_add8x8_idct8_sse2.skip_prologue
.skip_prologue: .skip_prologue:
UNSPILL r1, 1,2,3,5,6,7 UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1 IDCT8_1D 0,1,2,3,4,5,6,7,r1
......
...@@ -86,7 +86,7 @@ INIT_XMM ...@@ -86,7 +86,7 @@ INIT_XMM
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2 cglobal x264_sub8x8_dct8_sse2, 3,3,10
LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
...@@ -108,7 +108,7 @@ cglobal x264_sub8x8_dct8_sse2 ...@@ -108,7 +108,7 @@ cglobal x264_sub8x8_dct8_sse2
movdqa [r0+0x50], m5 movdqa [r0+0x50], m5
movdqa [r0+0x60], m6 movdqa [r0+0x60], m6
movdqa [r0+0x70], m7 movdqa [r0+0x70], m7
ret RET
%macro IDCT8_1D 10 %macro IDCT8_1D 10
...@@ -167,7 +167,7 @@ cglobal x264_sub8x8_dct8_sse2 ...@@ -167,7 +167,7 @@ cglobal x264_sub8x8_dct8_sse2
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2 cglobal x264_add8x8_idct8_sse2, 2,2,10
movdqa m0, [r1+0x00] movdqa m0, [r1+0x00]
movdqa m1, [r1+0x10] movdqa m1, [r1+0x10]
movdqa m2, [r1+0x20] movdqa m2, [r1+0x20]
...@@ -191,6 +191,6 @@ cglobal x264_add8x8_idct8_sse2 ...@@ -191,6 +191,6 @@ cglobal x264_add8x8_idct8_sse2
STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE] STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE] STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE] STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
ret RET
...@@ -155,12 +155,16 @@ cglobal x264_add4x4_idct_mmx, 2,2 ...@@ -155,12 +155,16 @@ cglobal x264_add4x4_idct_mmx, 2,2
INIT_XMM INIT_XMM
cglobal x264_sub8x8_dct_sse2, 3,3 cglobal x264_sub8x8_dct_sse2, 3,3,8
.skip_prologue: .skip_prologue:
call .8x4 call .8x4
add r0, 64 add r0, 64
add r1, 4*FENC_STRIDE add r1, 4*FENC_STRIDE
add r2, 4*FDEC_STRIDE add r2, 4*FDEC_STRIDE
%ifdef WIN64
call .8x4
RET
%endif
.8x4: .8x4:
SUB_DCT4 2x4x4W SUB_DCT4 2x4x4W
movhps [r0+32], m0 movhps [r0+32], m0
...@@ -169,11 +173,15 @@ cglobal x264_sub8x8_dct_sse2, 3,3 ...@@ -169,11 +173,15 @@ cglobal x264_sub8x8_dct_sse2, 3,3
movhps [r0+56], m3 movhps [r0+56], m3
ret ret
cglobal x264_add8x8_idct_sse2, 2,2 cglobal x264_add8x8_idct_sse2, 2,2,8
.skip_prologue: .skip_prologue:
call .8x4 call .8x4
add r1, 64 add r1, 64
add r0, 4*FDEC_STRIDE add r0, 4*FDEC_STRIDE
%ifdef WIN64
call .8x4
RET
%endif
.8x4: .8x4:
movq m0, [r1+ 0] movq m0, [r1+ 0]
movq m1, [r1+ 8] movq m1, [r1+ 8]
...@@ -192,6 +200,9 @@ cglobal x264_add8x8_idct_sse2, 2,2 ...@@ -192,6 +200,9 @@ cglobal x264_add8x8_idct_sse2, 2,2
%macro SUB_NxN_DCT 6 %macro SUB_NxN_DCT 6
cglobal %1, 3,3 cglobal %1, 3,3
.skip_prologue: .skip_prologue:
%ifdef WIN64
sub rsp, 8
%endif
call %2 call %2
add r0, %3 add r0, %3
add r1, %4-%5-%6*FENC_STRIDE add r1, %4-%5-%6*FENC_STRIDE
...@@ -204,6 +215,9 @@ cglobal %1, 3,3 ...@@ -204,6 +215,9 @@ cglobal %1, 3,3
add r0, %3 add r0, %3
add r1, %4-%5-%6*FENC_STRIDE add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE add r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
add rsp, 8
%endif
jmp %2 jmp %2
%endmacro %endmacro
...@@ -213,6 +227,9 @@ cglobal %1, 3,3 ...@@ -213,6 +227,9 @@ cglobal %1, 3,3
%macro ADD_NxN_IDCT 6 %macro ADD_NxN_IDCT 6
cglobal %1, 2,2 cglobal %1, 2,2
.skip_prologue: .skip_prologue:
%ifdef WIN64
sub rsp, 8
%endif
call %2 call %2
add r0, %4-%5-%6*FDEC_STRIDE add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3 add r1, %3
...@@ -222,25 +239,30 @@ cglobal %1, 2,2 ...@@ -222,25 +239,30 @@ cglobal %1, 2,2
call %2 call %2
add r0, %4-%5-%6*FDEC_STRIDE add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3 add r1, %3
%ifdef WIN64
add rsp, 8
%endif
jmp %2 jmp %2
%endmacro %endmacro
%ifndef ARCH_X86_64 %ifndef ARCH_X86_64
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0 SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0 ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4 SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
cextern x264_sub8x8_dct8_mmx.skip_prologue cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0 SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0 ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue %define x264_sub8x8_dct_sse2 x264_sub8x8_dct_sse2.skip_prologue
%define x264_add8x8_idct_sse2 x264_add8x8_idct_sse2.skip_prologue
%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue %define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif %endif
SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4 SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4 ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2, 64, 8, 0, 4
cextern x264_sub8x8_dct8_sse2 cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2 cextern x264_add8x8_idct8_sse2
...@@ -286,7 +308,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2 ...@@ -286,7 +308,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
punpcklbw mm1, mm1 punpcklbw mm1, mm1
ADD_DC mm0, mm1, r0-FDEC_STRIDE*4 ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
ADD_DC mm2, mm3, r0 ADD_DC mm2, mm3, r0
ret RET
cglobal x264_add8x8_idct_dc_ssse3, 2,2 cglobal x264_add8x8_idct_dc_ssse3, 2,2
movq xmm0, [r1] movq xmm0, [r1]
...@@ -324,7 +346,7 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2 ...@@ -324,7 +346,7 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
movhps [r0+FDEC_STRIDE* 1], xmm3 movhps [r0+FDEC_STRIDE* 1], xmm3
movhps [r0+FDEC_STRIDE* 2], xmm4 movhps [r0+FDEC_STRIDE* 2], xmm4
movhps [r0+FDEC_STRIDE* 3], xmm5 movhps [r0+FDEC_STRIDE* 3], xmm5
ret RET
cglobal x264_add16x16_idct_dc_mmx, 2,3 cglobal x264_add16x16_idct_dc_mmx, 2,3
mov r2, 4 mov r2, 4
...@@ -348,7 +370,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3 ...@@ -348,7 +370,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
add r0, FDEC_STRIDE*4 add r0, FDEC_STRIDE*4
dec r2 dec r2
jg .loop jg .loop
ret REP_RET
%macro IDCT_DC_STORE 3 %macro IDCT_DC_STORE 3
movdqa xmm4, [r0+%1+FDEC_STRIDE*0] movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
...@@ -369,9 +391,13 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3 ...@@ -369,9 +391,13 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
movdqa [r0+%1+FDEC_STRIDE*3], xmm7 movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro %endmacro
cglobal x264_add16x16_idct_dc_sse2, 2,2 cglobal x264_add16x16_idct_dc_sse2, 2,2,8
call .loop call .loop
add r0, FDEC_STRIDE*4 add r0, FDEC_STRIDE*4
%ifdef WIN64
call .loop
RET
%endif
.loop: .loop:
add r0, FDEC_STRIDE*4 add r0, FDEC_STRIDE*4
movq xmm0, [r1+0] movq xmm0, [r1+0]
...@@ -399,9 +425,13 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2 ...@@ -399,9 +425,13 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2
IDCT_DC_STORE 0, xmm2, xmm3 IDCT_DC_STORE 0, xmm2, xmm3
ret ret
cglobal x264_add16x16_idct_dc_ssse3, 2,2 cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
call .loop call .loop
add r0, FDEC_STRIDE*4 add r0, FDEC_STRIDE*4
%ifdef WIN64
call .loop
RET
%endif
.loop: .loop:
add r0, FDEC_STRIDE*4 add r0, FDEC_STRIDE*4
movdqa xmm0, [r1] movdqa xmm0, [r1]
...@@ -428,7 +458,7 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2 ...@@ -428,7 +458,7 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] ) ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro SCAN_8x8 1 %macro SCAN_8x8 1
cglobal x264_zigzag_scan_8x8_frame_%1, 2,2 cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
movdqa xmm0, [r1] movdqa xmm0, [r1]
movdqa xmm1, [r1+16] movdqa xmm1, [r1+16]
movdq2q mm0, xmm0 movdq2q mm0, xmm0
...@@ -703,7 +733,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3 ...@@ -703,7 +733,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst ) ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3 cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
movd xmm0, [r1+0*FENC_STRIDE] movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE] movd xmm1, [r1+1*FENC_STRIDE]
movd xmm2, [r1+2*FENC_STRIDE] movd xmm2, [r1+2*FENC_STRIDE]
......
...@@ -278,7 +278,7 @@ SECTION .text ...@@ -278,7 +278,7 @@ SECTION .text
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_XMM INIT_XMM
cglobal x264_deblock_v_luma_sse2 cglobal x264_deblock_v_luma_sse2, 5,5,10
movd m8, [r4] ; tc0 movd m8, [r4] ; tc0
lea r4, [r1*3] lea r4, [r1*3]
dec r2d ; alpha-1 dec r2d ; alpha-1
...@@ -318,54 +318,66 @@ cglobal x264_deblock_v_luma_sse2 ...@@ -318,54 +318,66 @@ cglobal x264_deblock_v_luma_sse2
DEBLOCK_P0_Q0 DEBLOCK_P0_Q0
mova [r4+2*r1], m1 mova [r4+2*r1], m1
mova [r0], m2 mova [r0], m2
ret RET
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX INIT_MMX
cglobal x264_deblock_h_luma_sse2 cglobal x264_deblock_h_luma_sse2, 5,7
movsxd r10, esi movsxd r10, r1d
lea r11, [r10+r10*2] lea r11, [r10+r10*2]
lea rax, [r0-4] lea r6, [r0-4]
lea r9, [r0-4+r11] lea r5, [r0-4+r11]
%ifdef WIN64
sub rsp, 0x98
%define pix_tmp rsp+0x30
%else
sub rsp, 0x68 sub rsp, 0x68
%define pix_tmp rsp %define pix_tmp rsp
%endif
; transpose 6x16 -> tmp space ; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
lea rax, [rax+r10*8] lea r6, [r6+r10*8]
lea r9, [r9 +r10*8] lea r5, [r5+r10*8]
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
; vertical filter ; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4 ; alpha, beta, tc0 are still in r2d, r3d, r4
; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30] lea r0, [pix_tmp+0x30]
mov esi, 0x10 mov r1d, 0x10
%ifdef WIN64
mov [rsp+0x20], r4
%endif
call x264_deblock_v_luma_sse2 call x264_deblock_v_luma_sse2
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add rax, 2 add r6, 2
add r9, 2 add r5, 2
movq m0, [pix_tmp+0x18] movq m0, [pix_tmp+0x18]
movq m1, [pix_tmp+0x28] movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38] movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48] movq m3, [pix_tmp+0x48]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
shl r10, 3 shl r10, 3
sub rax, r10 sub r6, r10
sub r9, r10 sub r5, r10
shr r10, 3 shr r10, 3
movq m0, [pix_tmp+0x10] movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20] movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30] movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40] movq m3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
%ifdef WIN64
add rsp, 0x98
%else
add rsp, 0x68