Commit af34dfe3 authored by Fiona Glaser

SSE4 and SSSE3 versions of some intra_sad functions

Primarily Nehalem-optimized.
parent 5a57688f
......@@ -856,6 +856,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
/* Slower on Conroe, so only enable under SSE4 */
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
}
#endif //HAVE_MMX
......
......@@ -80,6 +80,7 @@ DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( uint8_t *pix, int i_stride ))
void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
......@@ -93,6 +94,7 @@ void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
......
......@@ -26,6 +26,19 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
; pshufb control masks used by the intra SAD functions in this file.
;
; h4x4_pred_shuf: replicates source bytes 3, 7, 11 and 15 four times each.
; intra_sad_x3_4x4_sse4 applies it to a register whose dwords end in the
; left-neighbour pixel of each row, giving the 4x4 horizontal prediction.
h4x4_pred_shuf: db 3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15
; h4x4_pred_shuf2: gathers source bytes 3, 7, 11 and 15 into the low dword.
; The -1 entries have their top bit set, which makes pshufb write a zero byte,
; so the upper 12 bytes of the result are cleared.
h4x4_pred_shuf2: db 3,7,11,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
; h8x8_pred_shuf: eight 8-byte groups, read 16 bytes at a time by
; intra_sad_x3_8x8_ssse3 as [h8x8_pred_shuf + r3*8] with r3 = 6, 4, 2, 0.
; Each 16-byte pair broadcasts source byte r3+1 across the low qword and
; source byte r3 across the high qword, i.e. one left pixel per row for two
; consecutive rows.
h8x8_pred_shuf: times 8 db 1
times 8 db 0
times 8 db 3
times 8 db 2
times 8 db 5
times 8 db 4
times 8 db 7
times 8 db 6
SECTION .text
cextern pb_3
......@@ -303,6 +316,40 @@ cglobal intra_sad_x3_4x4_mmxext, 3,3
movd [r2+4], mm1 ;H prediction cost
RET
;-----------------------------------------------------------------------------
; intra_sad_x3_4x4_sse4: SAD of a 4x4 source block against its vertical (V),
; horizontal (H) and DC intra predictions, computed in one pass.
;   r0: source block, rows FENC_STRIDE bytes apart (presumably fenc)
;   r1: block whose neighbours are read: the 4 bytes at r1-FDEC_STRIDE (top,
;       "ABCD") and the byte at r1+FDEC_STRIDE*y-1 for y=0..3 (left, "EFGH")
;       (presumably fdec)
;   r2: receives three dwords in the order V, H, DC
; Uses xmm0-xmm5 only; the neighbour loads read 4 bytes ending at the left
; pixel, so r1-4 on each row must be readable.
;-----------------------------------------------------------------------------
cglobal intra_sad_x3_4x4_sse4, 3,3
; Gather the 4 bytes ending at the left neighbour of each row; the neighbour
; itself lands in bytes 3, 7, 11 and 15 of xmm4.
movd xmm4, [r1+FDEC_STRIDE*0-4]
pinsrd xmm4, [r1+FDEC_STRIDE*1-4], 1
pinsrd xmm4, [r1+FDEC_STRIDE*2-4], 2
pinsrd xmm4, [r1+FDEC_STRIDE*3-4], 3
; Top neighbours ABCD in the low dword (movd zeroes the rest of xmm2).
movd xmm2, [r1-FDEC_STRIDE]
; xmm3 = 0: used as the psadbw/pavgw zero operand and as an all-zero pshufb mask.
pxor xmm3, xmm3
movdqa xmm5, xmm4
pshufb xmm4, [h4x4_pred_shuf2] ; EFGH in the low dword, upper bytes zeroed
pshufb xmm5, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH = H prediction
pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD = V prediction
punpckldq xmm2, xmm4 ; ABCDEFGH in the low qword, high qword zero
; SAD against zero = A+B+C+D+E+F+G+H in the low word.
psadbw xmm2, xmm3
; Load the four source rows, one per dword.
movd xmm1, [r0+FENC_STRIDE*0]
pinsrd xmm1, [r0+FENC_STRIDE*1], 1
pinsrd xmm1, [r0+FENC_STRIDE*2], 2
pinsrd xmm1, [r0+FENC_STRIDE*3], 3
; psadbw yields one partial sum per qword: rows 0-1 in the low qword and
; rows 2-3 in the high qword.
psadbw xmm0, xmm1
psadbw xmm5, xmm1
; DC value = ((sum >> 2) + 1) >> 1, which equals (sum + 4) >> 3.
psraw xmm2, 2
pavgw xmm2, xmm3
pshufb xmm2, xmm3 ; all-zero mask broadcasts byte 0: DC prediction
; Regroup the partial sums so one paddw finishes both V and H:
;   xmm0 = { V rows 0-1, H rows 0-1 }, xmm3 = { V rows 2-3, H rows 2-3 }.
movdqa xmm3, xmm0
punpcklqdq xmm0, xmm5
punpckhqdq xmm3, xmm5
psadbw xmm2, xmm1
paddw xmm0, xmm3
; xmm4 is free again here; bring the high-qword DC partial sum down to add it.
movhlps xmm4, xmm2
; Dwords of xmm0 are { V, 0, H, 0 }; packing dwords to words leaves V and H
; as two adjacent dwords in the low qword.
packusdw xmm0, xmm0
paddw xmm2, xmm4
movq [r2], xmm0 ; V/H prediction costs
movd [r2+8], xmm2 ; DC prediction cost
RET
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3]);
;-----------------------------------------------------------------------------
......@@ -370,6 +417,66 @@ cglobal intra_sad_x3_8x8_mmxext, 3,3
movd [r2+8], m1
RET
; intra_sad_x3_8x8_ssse3: SAD of an 8x8 source block against its vertical (V),
; horizontal (H) and DC intra predictions.
;   r0: source block, rows FENC_STRIDE bytes apart
;   r1: edge buffer; 8 left pixels are read from r1+7 and 8 top pixels from
;       r1+16
;   r2: receives three dwords in the order V, H, DC
; The block is processed two rows per iteration, one row per XMM qword.
; INIT_XMM (x86inc.asm) makes m0..m8 refer to XMM registers; the "3,4,9" on
; cglobal is x86inc's argument / GPR / XMM register counts.
INIT_XMM
cglobal intra_sad_x3_8x8_ssse3, 3,4,9
; The shuffle table is indexed by r3 in the loop. Under PIC its address is
; loaded into a register first so it can be combined with the index register.
; NOTE(review): r11 is used here beyond the 4 GPRs declared to cglobal — confirm
; that is acceptable for every ABI this is built for.
%ifdef PIC
lea r11, [h8x8_pred_shuf]
%define shuf r11
%else
%define shuf h8x8_pred_shuf
%endif
movq m0, [r1+7] ; left pixels
movq m1, [r1+16] ; top pixels
; SAD against zero sums the 8 bytes of each edge; m2 = sum of all 16 neighbours.
pxor m2, m2
pxor m3, m3
psadbw m2, m0
psadbw m3, m1
paddw m2, m3
pxor m3, m3 ; V score accumulator
; DC value = ((sum >> 3) + 1) >> 1, which equals (sum + 8) >> 4.
psraw m2, 3
pavgw m2, m3
punpcklqdq m1, m1 ; V prediction (top row duplicated for two rows)
pshufb m2, m3 ; DC prediction (all-zero mask broadcasts byte 0)
pxor m4, m4 ; H score accumulator
pxor m5, m5 ; DC score accumulator
; r3 steps 6, 4, 2, 0: four iterations of two rows each.
mov r3d, 6
.loop:
; m6 = two source rows: first row in the low qword, second in the high qword.
movq m6, [r0+FENC_STRIDE*0]
movhps m6, [r0+FENC_STRIDE*1]
movdqa m7, m0
; The mask at shuf+r3*8 broadcasts left byte r3+1 over the first row and left
; byte r3 over the second, so rows 0..7 use left bytes 7..0.
; (presumably the edge buffer stores the left column bottom-to-top — confirm)
pshufb m7, [shuf+r3*8] ; H prediction
; Both branches compute the same three SADs. The x86_64 one uses m8 as an
; extra temporary; the x86_32 one reuses m7 because only 8 XMM registers
; exist there. psadbw m6, m2 overwrites the source rows, so it comes last.
%ifdef ARCH_X86_64
movdqa m8, m1
psadbw m7, m6
psadbw m8, m6
psadbw m6, m2
paddw m4, m7
paddw m3, m8
paddw m5, m6
%else
psadbw m7, m6
paddw m4, m7
movdqa m7, m1
psadbw m7, m6
psadbw m6, m2
paddw m3, m7
paddw m5, m6
%endif
add r0, FENC_STRIDE*2
sub r3d, 2
jge .loop
; Each accumulator holds one partial sum per qword; fold the high qword into
; the low one. m0-m2 are no longer needed and serve as scratch.
movhlps m0, m3
movhlps m1, m4
movhlps m2, m5
paddw m3, m0
paddw m4, m1
paddw m5, m2
movd [r2+0], m3 ; V prediction cost
movd [r2+4], m4 ; H prediction cost
movd [r2+8], m5 ; DC prediction cost
RET
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment