Commit dc454eab authored by Sam Hocevar
Browse files

* Additional fixes to the PIC versions of assembly routines. They now pass
    all checkasm tests and output streams are bit-by-bit identical, which
    sounds good.


git-svn-id: svn://svn.videolan.org/x264/trunk@422 df754926-b1dd-0310-bc7b-ec298dee348c
parent ac9da5db
......@@ -258,6 +258,14 @@ SECTION .text
%endif ;linux
; PIC support macros. On x86_64 we just use RIP-relative addressing, which is
; much simpler than the GOT handling we need to perform on x86.
;
; - GLOBAL should be used as a suffix for global addressing, eg.
; mov eax, [foo GLOBAL]
; instead of
; mov eax, [foo]
;
%ifdef __PIC__
%define GLOBAL wrt rip
%else
......
......@@ -154,14 +154,15 @@ ALIGN 16
; void __cdecl dct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
x264_dct4x4dc_mmxext:
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
mov eax, [esp+ 4]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
movq mm3, [eax+24]
picpush ebx
picgetgot ebx
MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
......@@ -185,7 +186,7 @@ x264_dct4x4dc_mmxext:
movq [eax+16], mm1
psraw mm3, 1
movq [eax+24], mm3
POP_EBX_IF_PIC
picpop ebx
ret
cglobal x264_idct4x4dc_mmxext
......@@ -272,9 +273,6 @@ ALIGN 16
; void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
x264_add4x4_idct_mmxext:
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
; Load dct coeffs
mov eax, [esp+12] ; dct
movq mm0, [eax+ 0]
......@@ -286,6 +284,9 @@ x264_add4x4_idct_mmxext:
mov ecx, [esp+ 8] ; i_dst
lea edx, [ecx+ecx*2]
picpush ebx
picgetgot ebx
; out:mm0, mm1, mm2, mm3
MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2
......@@ -310,7 +311,7 @@ x264_add4x4_idct_mmxext:
MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+ecx*2]
MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+edx]
POP_EBX_IF_PIC
picpop ebx
ret
......@@ -395,10 +396,11 @@ ALIGN 16
; void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_xdct8_mmxext:
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
mov eax, [esp+04] ; dest
picpush ebx
picgetgot ebx
movq mm5, [x264_mmx_PPNN GLOBAL]
movq mm6, [x264_mmx_PNNP GLOBAL]
movq mm4, [x264_mmx_PPPN GLOBAL]
......@@ -458,7 +460,7 @@ x264_xdct8_mmxext:
%assign disp disp+16
%endrep
POP_EBX_IF_PIC
picpop ebx
ret
ALIGN 16
......@@ -551,10 +553,11 @@ ALIGN 16
; void __cdecl x264_xidct8_mmxext( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_xidct8_mmxext:
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
mov eax, [esp+04] ; dest
picpush ebx
picgetgot ebx
movq mm4, [x264_mmx_PPNN GLOBAL]
movq mm5, [x264_mmx_PNPN GLOBAL]
movq mm6, [x264_mmx_PPNP GLOBAL]
......@@ -609,7 +612,7 @@ x264_xidct8_mmxext:
%assign disp disp+16
%endrep
POP_EBX_IF_PIC
picpop ebx
ret
ALIGN 16
......
......@@ -247,14 +247,14 @@ ALIGN 16
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
x264_deblock_v8_luma_mmxext:
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
picpush ebx
picgetgot ebx
push edi
push esi
mov edi, [esp+12] ; pix
mov esi, [esp+16] ; stride
mov edx, [esp+20] ; alpha
mov ecx, [esp+24] ; beta
mov edi, [picesp+12] ; pix
mov esi, [picesp+16] ; stride
mov edx, [picesp+20] ; alpha
mov ecx, [picesp+24] ; beta
dec edx
dec ecx
mov eax, edi
......@@ -269,7 +269,7 @@ x264_deblock_v8_luma_mmxext:
movq mm3, [edi+esi] ; q1
LOAD_MASK_MMX edx, ecx
mov ecx, [esp+44] ; tc0, use only the low 16 bits
mov ecx, [picesp+44] ; tc0, use only the low 16 bits
movd mm4, [ecx]
punpcklbw mm4, mm4
punpcklbw mm4, mm4 ; tc = 4x tc0[1], 4x tc0[0]
......@@ -310,7 +310,7 @@ x264_deblock_v8_luma_mmxext:
add esp, 16
pop esi
pop edi
POP_EBX_IF_PIC
picpop ebx
ret
......@@ -430,7 +430,7 @@ x264_deblock_v_chroma_mmxext:
movd mm6, [ebx]
punpcklbw mm6, mm6
pand mm7, mm6
GET_GOT_IN_EBX_IF_PIC ; no need to push ebx, it's already been done
picgetgot ebx ; no need to push ebx, it's already been done
DEBLOCK_P0_Q0_MMX ; XXX: make sure ebx has the GOT in PIC mode
movq [eax+esi], mm1
......@@ -458,6 +458,7 @@ x264_deblock_h_chroma_mmxext:
movd mm6, [ebx]
punpcklbw mm6, mm6
pand mm7, mm6
picgetgot ebx ; no need to push ebx, it's already been done
DEBLOCK_P0_Q0_MMX ; XXX: make sure ebx has the GOT in PIC mode
movq mm0, [esp+8]
......@@ -501,8 +502,8 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_deblock_v_chroma_intra_mmxext:
CHROMA_V_START
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
picpush ebx
picgetgot ebx
movq mm0, [eax]
movq mm1, [eax+esi]
movq mm2, [edi]
......@@ -510,7 +511,7 @@ x264_deblock_v_chroma_intra_mmxext:
CHROMA_INTRA_BODY ; XXX: make sure ebx has the GOT in PIC mode
movq [eax+esi], mm1
movq [edi], mm2
POP_EBX_IF_PIC
picpop ebx
CHROMA_END
ALIGN 16
......@@ -519,13 +520,12 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_deblock_h_chroma_intra_mmxext:
CHROMA_H_START
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
picpush ebx
picgetgot ebx
TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp)
CHROMA_INTRA_BODY ; XXX: make sure ebx has the GOT in PIC mode
TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp)
POP_EBX_IF_PIC
picpop ebx
pop ebp ; needed because of CHROMA_H_START
POP_EBX_IF_PIC
CHROMA_END
......@@ -35,28 +35,61 @@ BITS 32
%endif
%endmacro
; PIC support macros. All these macros are totally harmless when __PIC__ is
; not defined but can ruin everything if misused in PIC mode. On x86, shared
; objects cannot directly access global variables by address; they need to
; go through the GOT (global offset table). Most OSes do not enforce this
; and let you load non-PIC shared objects (Linux, Win32...). However, OS X
; requires PIC code in its .dylib objects.
;
; - GLOBAL should be used as a suffix for global addressing, eg.
; mov eax, [foo GLOBAL]
; instead of
; mov eax, [foo]
;
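; - picgetgot computes the GOT address into the given register in PIC
; mode, otherwise does nothing. You need to do this before using GLOBAL.
; Note that GLOBAL is currently hard-wired to ebx in PIC mode, so the
; register given to picgetgot has to be ebx for GLOBAL to work.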
;
; - picpush and picpop respectively push and pop the given register
; in PIC mode, otherwise do nothing. You should always use them around
; picgetgot except when sure that the register is no longer used and is
; being restored later by other means.
;
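; - picesp is defined to compensate for the change in esp when pushing
; a register onto the stack with picpush, eg.
; mov eax, [esp + 8]
; picpush ebx
; mov eax, [picesp + 8]
; instead of
; mov eax, [esp + 8]
; picpush ebx
; mov eax, [esp + 12]
; (the latter is only correct in PIC mode, where picpush really pushes;
; picesp is esp+4 in PIC mode and plain esp otherwise, so [picesp + 8]
; addresses the same stack slot in both modes).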
;
%ifdef __PIC__
extern _GLOBAL_OFFSET_TABLE_
%define GLOBAL wrt ..gotpc
%macro GET_GOT_IN_EBX_IF_PIC 0
; FIXME: find an elegant way to use registers other than ebx
%define GLOBAL + ebx wrt ..gotoff
%macro picgetgot 1
call %%getgot
%%getgot:
pop ebx
add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
pop %1
add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
%endmacro
%macro PUSH_EBX_IF_PIC 0
push ebx
%macro picpush 1
push %1
%endmacro
%macro POP_EBX_IF_PIC 0
pop ebx
%macro picpop 1
pop %1
%endmacro
%define picesp esp+4
%else
%define GLOBAL
%macro GET_GOT_IN_EBX_IF_PIC 0
%macro picgetgot 1
%endmacro
%macro PUSH_EBX_IF_PIC 0
%macro picpush 1
%endmacro
%macro POP_EBX_IF_PIC 0
%macro picpop 1
%endmacro
%define picesp esp
%endif
......@@ -265,21 +265,21 @@ ALIGN 4
%macro BIWEIGHT_START_MMX 0
push edi
push esi
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
mov edi, [esp+12] ; dst
mov esi, [esp+16] ; i_dst
mov edx, [esp+20] ; src
mov ecx, [esp+24] ; i_src
pshufw mm4, [esp+28], 0 ; weight_dst
picpush ebx
picgetgot ebx
mov edi, [picesp+12] ; dst
mov esi, [picesp+16] ; i_dst
mov edx, [picesp+20] ; src
mov ecx, [picesp+24] ; i_src
pshufw mm4, [picesp+28], 0 ; weight_dst
movq mm5, [pw_64 GLOBAL]
psubw mm5, mm4 ; weight_src
movq mm6, [pw_32 GLOBAL] ; rounding
pxor mm7, mm7
%endmacro
%macro BIWEIGHT_END_MMX 0
POP_EBX_IF_PIC
picpop ebx
pop esi
pop edi
ret
......@@ -291,7 +291,7 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_pixel_avg_weight_w16_mmxext:
BIWEIGHT_START_MMX
mov eax, [esp+32] ; i_height
mov eax, [picesp+32] ; i_height
ALIGN 4
.height_loop
......@@ -312,7 +312,7 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_pixel_avg_weight_w8_mmxext:
BIWEIGHT_START_MMX
mov eax, [esp+32]
mov eax, [picesp+32]
ALIGN 4
.height_loop
......@@ -512,13 +512,13 @@ ALIGN 16
x264_mc_chroma_mmxext:
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
picpush ebx
picgetgot ebx
pxor mm3, mm3
pshufw mm5, [esp+20], 0 ; mm5 = dx
pshufw mm6, [esp+24], 0 ; mm6 = dy
pshufw mm5, [picesp+20], 0 ; mm5 = dx
pshufw mm6, [picesp+24], 0 ; mm6 = dy
movq mm4, [pw_8 GLOBAL]
movq mm0, mm4
......@@ -534,10 +534,10 @@ x264_mc_chroma_mmxext:
push edi
mov eax, [esp+4+4] ; src
mov edi, [esp+4+12] ; dst
mov ecx, [esp+4+8] ; i_src_stride
mov edx, [esp+4+32] ; i_height
mov eax, [picesp+4+4] ; src
mov edi, [picesp+4+12] ; dst
mov ecx, [picesp+4+8] ; i_src_stride
mov edx, [picesp+4+32] ; i_height
ALIGN 4
.height_loop
......@@ -568,22 +568,22 @@ ALIGN 4
movd [edi], mm0
add eax, ecx
add edi, [esp+4+16]
add edi, [picesp+4+16]
dec edx
jnz .height_loop
sub [esp+4+28], dword 8
sub [picesp+4+28], dword 8
jnz .finish ; width != 8 so assume 4
mov edi, [esp+4+12] ; dst
mov eax, [esp+4+4] ; src
mov edx, [esp+4+32] ; i_height
mov edi, [picesp+4+12] ; dst
mov eax, [picesp+4+4] ; src
mov edx, [picesp+4+32] ; i_height
add edi, 4
add eax, 4
jmp .height_loop
.finish
pop edi
POP_EBX_IF_PIC
picpop ebx
ret
......@@ -169,8 +169,8 @@ x264_center_filter_mmxext :
lea ebx, [ecx + ecx * 2] ; 3 * src_stride
lea edx, [ecx + ecx * 4] ; 5 * src_stride
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
picpush ebx
picgetgot ebx
pxor mm0, mm0 ; 0 ---> mm0
movq mm7, [mmx_dd_one GLOBAL] ; for rounding
......@@ -178,21 +178,23 @@ x264_center_filter_mmxext :
loopcy:
; mov eax, [esp + twidth]
; mov eax, [picesp + twidth]
xor eax, eax
mov edi, [esp + tdst1]
lea ebp, [esp + tbuffer]
mov esi, [esp + tsrc]
mov edi, [picesp + tdst1]
lea ebp, [picesp + tbuffer]
mov esi, [picesp + tsrc]
; Overwrite mm7, the value set above is never used
movd mm7, [mmx_dw_one GLOBAL]
picpop ebx
POP_EBX_IF_PIC
FILT_ALL esi
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
pshufw mm2, mm1, 0
movq [ebp + 8], mm1
movq [ebp], mm2
paddw mm1, [mmx_dw_one GLOBAL]
paddw mm1, mm7
psraw mm1, 5
packuswb mm1, mm1
......@@ -203,13 +205,10 @@ loopcy:
loopcx1:
POP_EBX_IF_PIC
FILT_ALL esi
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
movq [ebp + 2 * eax], mm1
paddw mm1, [mmx_dw_one GLOBAL]
paddw mm1, mm7
psraw mm1, 5
packuswb mm1, mm1
movd [edi + eax - 4], mm1
......@@ -219,15 +218,12 @@ loopcx1:
cmp eax, [esp + twidth]
jnz loopcx1
POP_EBX_IF_PIC
FILT_ALL esi
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
pshufw mm2, mm1, 7
movq [ebp + 2 * eax], mm1
movq [ebp + 2 * eax + 8], mm2
paddw mm1, [mmx_dw_one GLOBAL]
paddw mm1, mm7
psraw mm1, 5
packuswb mm1, mm1
movd [edi + eax - 4], mm1
......@@ -242,14 +238,17 @@ loopcx1:
mov edi, [esp + tdst2]
xor eax, eax
picpush ebx
picgetgot ebx
loopcx2:
movq mm2, [esp + 2 * eax + 2 + 4 + tbuffer]
movq mm3, [esp + 2 * eax + 4 + 4 + tbuffer]
movq mm4, [esp + 2 * eax + 6 + 4 + tbuffer]
movq mm5, [esp + 2 * eax + 8 + 4 + tbuffer]
movq mm1, [esp + 2 * eax + 4 + tbuffer]
movq mm6, [esp + 2 * eax + 10 + 4 + tbuffer]
movq mm2, [picesp + 2 * eax + 2 + 4 + tbuffer]
movq mm3, [picesp + 2 * eax + 4 + 4 + tbuffer]
movq mm4, [picesp + 2 * eax + 6 + 4 + tbuffer]
movq mm5, [picesp + 2 * eax + 8 + 4 + tbuffer]
movq mm1, [picesp + 2 * eax + 4 + tbuffer]
movq mm6, [picesp + 2 * eax + 10 + 4 + tbuffer]
paddw mm2, mm5
paddw mm3, mm4
paddw mm1, mm6
......@@ -287,19 +286,19 @@ loopcx2:
movd [edi + eax], mm2
add eax, 4
cmp eax, [esp + twidth]
cmp eax, [picesp + twidth]
jnz loopcx2
add edi, [esp + tdstp2]
mov [esp + tdst2], edi
add edi, [picesp + tdstp2]
mov [picesp + tdst2], edi
mov ebp, [esp + theight]
mov ebp, [picesp + theight]
dec ebp
test ebp, ebp
mov [esp + theight], ebp
mov [picesp + theight], ebp
jnz loopcy
POP_EBX_IF_PIC
picpop ebx
add esp, [esp + toffset]
......@@ -327,10 +326,10 @@ x264_horizontal_filter_mmxext :
mov esi, [esp + 20] ; src
pxor mm0, mm0
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
picpush ebx
picgetgot ebx
movq mm7, [mmx_dw_one GLOBAL]
POP_EBX_IF_PIC
picpop ebx
mov ecx, [esp + 32] ; height
......
......@@ -402,7 +402,7 @@ x264_pixel_ssd_16x8_sse2:
%macro SUM_MM_SSE2 2 ; sum junk
; ebx is no longer used at this point, so no push needed
GET_GOT_IN_EBX_IF_PIC
picgetgot ebx
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
psrlw %1, 1
movdqa %2, %1
......
......@@ -78,9 +78,9 @@ cglobal predict_16x16_dc_top_mmxext
%endmacro
%macro PRED8x8_LOAD_TOP 0
mov edx, [esp + 4]
mov ecx, [esp + 8]
mov eax, [esp +12]
mov edx, [picesp + 4]
mov ecx, [picesp + 8]
mov eax, [picesp +12]
sub edx, ecx
and eax, 12
......@@ -92,7 +92,7 @@ cglobal predict_16x16_dc_top_mmxext
mov al, [edx]
mov ah, [edx]
pinsrw mm1, ax, 0
mov eax, [esp +12]
mov eax, [picesp + 12]
.have_topleft:
and eax, byte 4
......@@ -113,8 +113,8 @@ cglobal predict_16x16_dc_top_mmxext
ALIGN 16
predict_8x8_v_mmxext:
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
picpush ebx
picgetgot ebx
PRED8x8_LOAD_TOP
lea eax, [ecx + 2*ecx]
......@@ -128,7 +128,7 @@ predict_8x8_v_mmxext:
movq [edx + eax], mm0 ; 5
movq [edx + 4*ecx], mm0 ; 6
POP_EBX_IF_PIC
picpop ebx
ret
;-----------------------------------------------------------------------------
......@@ -139,10 +139,10 @@ predict_8x8_v_mmxext:
ALIGN 16
predict_8x8_dc_core_mmxext:
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
picpush ebx
picgetgot ebx
mov eax, [esp + 16]
mov eax, [picesp + 16]
movq mm1, [eax-1]
movq mm2, [eax+1]
PRED8x8_LOWPASS mm4, [eax]
......@@ -169,7 +169,7 @@ predict_8x8_dc_core_mmxext:
movq [edx + eax], mm0 ; 5
movq [edx + 4*ecx], mm0 ; 6
POP_EBX_IF_PIC
picpop ebx
ret
;-----------------------------------------------------------------------------
......@@ -207,11 +207,11 @@ predict_8x8c_v_mmx :
ALIGN 16
predict_8x8c_dc_core_mmxext:
PUSH_EBX_IF_PIC
GET_GOT_IN_EBX_IF_PIC
picpush ebx
picgetgot ebx
mov edx, [esp + 4]
mov ecx, [esp + 8]
mov edx, [picesp + 4]
mov ecx, [picesp + 8]
sub edx, ecx
lea eax, [ecx + 2*ecx]
......@@ -223,8 +223,8 @@ predict_8x8c_dc_core_mmxext:
psadbw mm1, mm2 ; s1
psadbw mm0, mm2 ; s0
paddw mm0, [esp + 12]
pshufw mm2, [esp + 16], 0
paddw mm0, [picesp + 12]
pshufw mm2, [picesp + 16], 0
psrlw mm0, 3
paddw mm1, [pw_2 GLOBAL]
movq mm3, mm2
......@@ -248,7 +248,7 @@ predict_8x8c_dc_core_mmxext:
movq [edx + eax], mm2 ; 6
movq [edx + 4*ecx], mm2 ; 7
POP_EBX_IF_PIC
picpop ebx