Commit 346844af, authored by Cleo Saulnier and committed by Fiona Glaser
Browse files

MMX version of 8x8 interlaced zigzag

Just as fast as SSSE3 on Nehalem (and faster on Conroe/Penryn), so remove the SSSE3 version.
parent 6f221210
......@@ -720,12 +720,14 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->sub_4x4ac = zigzag_sub_4x4ac_field;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMXEXT )
{
pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
}
if( cpu&X264_CPU_SSSE3 )
{
pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
pf->scan_8x8 = x264_zigzag_scan_8x8_field_ssse3;
}
#endif
......
......@@ -866,56 +866,82 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
; 45 46 47 51 56 57 52 53
; 54 55 58 59 60 61 62 63
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_field_ssse3( int16_t level[64], int16_t dct[64] )
;-----------------------------------------------------------------------------
; Copies 64 16-bit coefficients from the buffer at r1 to the buffer at r0 in
; 8x8 field (interlaced) zigzag order. r1 is only read and r0 is only written;
; presumably r0 = level and r1 = dct per the C prototype -- confirm against
; the cglobal argument mapping.
;
; Notation: the numbers in the lane comments are input coefficient indices
; (byte offset / 2), listed lowest lane first. "_" marks a lane that does not
; hold final data; each such lane is overwritten by a later overlapping store
; or by one of the 16-bit moves at the end. in[n]/out[n] below mean the n-th
; 16-bit word at r1/r0.
;
; NOTE(review): no RET appears between the last store and the next cglobal in
; this listing. The text is a diff rendering with the +/- markers stripped and
; the commit message says the SSSE3 version is removed -- confirm against the
; real file before relying on this body.
cglobal x264_zigzag_scan_8x8_field_ssse3, 2,4,8
; Load overlapping 8-coefficient windows of the input (only the first is aligned).
movdqa xmm0, [r1+ 0] ; 0 1 2 3 4 5 6 7
movdqu xmm1, [r1+10] ; 5 6 7 8 9 10 11 12
movdqu xmm3, [r1+26] ; 13 14 15 16 17 18 19 20
movdqu xmm4, [r1+40] ; 20 21 22 23 24 25 26 27
movdqu xmm5, [r1+56] ; 28 29 30 31 32 33 34 35
movdqu xmm6, [r1+72] ; 36 37 38 39 40 41 42 43
movdqa xmm2, xmm1 ; second copy of 5..12, shuffled separately below
movdqa xmm7, [pb_scan8fielde GLOBAL] ; one mask shared by xmm4, xmm5 and xmm6
pshufb xmm0, [pb_scan8fielda GLOBAL] ; 0 1 2 _ _ 3 4 _
pshufb xmm1, [pb_scan8fieldb GLOBAL] ; _ _ _ 8 9 _ _ 10
por xmm0, xmm1 ; merge: 0 1 2 8 9 3 4 10
pshufb xmm2, [pb_scan8fieldc GLOBAL] ; _ 11 5 6 7 12 _ _
pshufb xmm3, [pb_scan8fieldd GLOBAL] ; 18 13 14 15 19 _ _ _
pshufb xmm4, xmm7 ; 26 20 21 22 23 27 _ _
pshufb xmm5, xmm7 ; 34 28 29 30 31 35 _ _
pshufb xmm6, xmm7 ; 42 36 37 38 39 43 _ _
; Store in ascending address order: each unaligned store starts on the last
; word of the previous 16-byte span, so later stores replace earlier "_" lanes.
movdqa [r0+ 0], xmm0 ; out[0..7]
movdqa [r0+16], xmm2 ; out[8..15]
movdqa [r0+32], xmm3 ; out[16..23]
movdqu [r0+46], xmm4 ; out[23..30]
movdqu [r0+62], xmm5 ; out[31..38]
movdqu [r0+78], xmm6 ; out[39..46]
movdqu xmm0, [r1+88] ; 44 45 46 47 48 49 50 51
movdqu xmm1, [r1+104] ; 52 53 54 55 56 57 58 59
movq xmm2, [r1+120] ; 60 61 62 63
pshufb xmm0, [pb_scan8fieldf GLOBAL] ; 49 50 44 45 46 47 51 _
pshufb xmm1, [pb_scan8fieldg GLOBAL] ; 56 57 52 53 54 55 58 59
movdqu [r0+90], xmm0 ; out[45..52]
movdqu [r0+104], xmm1 ; out[52..59]
movq [r0+120], xmm2 ; out[60..63], copied unchanged
; Patch the remaining "_" positions one 16-bit coefficient at a time.
mov r2w, [r1+32] ; in[16]
mov r3w, [r1+34] ; in[17]
mov [r0+16], r2w ; out[8] = in[16]
mov [r0+28], r3w ; out[14] = in[17]
mov r2w, [r1+48] ; in[24]
mov r3w, [r1+50] ; in[25]
mov [r0+30], r2w ; out[15] = in[24]
mov [r0+42], r3w ; out[21] = in[25]
mov r2w, [r1+64] ; in[32]
mov r3w, [r1+66] ; in[33]
mov [r0+44], r2w ; out[22] = in[32]
mov [r0+58], r3w ; out[29] = in[33]
mov r2w, [r1+80] ; in[40]
mov r3w, [r1+82] ; in[41]
mov r1w, [r1+96] ; in[48]; last read through r1, so its low word is reused as scratch
mov [r0+60], r2w ; out[30] = in[40]
mov [r0+74], r3w ; out[37] = in[41]
mov [r0+76], r1w ; out[38] = in[48]
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] )
;-----------------------------------------------------------------------------
; Copies 64 16-bit coefficients from the buffer at r1 to the buffer at r0 in
; 8x8 field (interlaced) zigzag order, four coefficients (one MMX register)
; per store. r1 is only read and r0 is only written; presumably r0 = level and
; r1 = dct per the C prototype -- confirm against the cglobal argument mapping.
; r2 is used as a scratch general-purpose register for single-word transfers.
;
; Notation: the numbers in the lane comments are input coefficient indices,
; listed HIGHEST word first (so "03 02 01 00" has coefficient 0 in word 0).
; "XX" marks a don't-care lane.
cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
; --- output words 0..7 ---
movq mm0, [r1+2*0] ; 03 02 01 00
movq mm1, [r1+2*4] ; 07 06 05 04
movq mm2, [r1+2*8] ; 11 10 09 08
pshufw mm3, mm0, 011111111b ; 03 03 03 03
movd r2, mm2 ; 09 08
pshufw mm2, mm2, 000111001b ; 08 11 10 09
punpcklwd mm3, mm1 ; 05 03 04 03
pinsrw mm0, r2, 3 ; 08 02 01 00
movq mm4, mm2 ; 08 11 10 09 (kept for the punpckhwd into mm6 below)
punpcklwd mm2, mm3 ; 04 10 03 09
pshufw mm2, mm2, 010110100b ; 10 04 03 09
movq [r0+2*0], mm0 ; 08 02 01 00
movq [r0+2*4], mm2 ; 10 04 03 09
; --- output words 8..19 and 24..27 ---
movq mm3, [r1+2*12] ; 15 14 13 12
movq mm5, [r1+2*16] ; 19 18 17 16
; mm6 has not been written yet in this function: its stale low dword lands in
; the XX lanes, which the punpckhwd below (high words only) discards.
punpckldq mm6, mm5 ; 17 16 XX XX
psrlq mm1, 16 ; XX 07 06 05
punpckhwd mm6, mm4 ; 08 17 11 16
punpckldq mm6, mm1 ; 06 05 11 16
movq [r0+2*8], mm6 ; 06 05 11 16
psrlq mm1, 16 ; XX XX 07 06
punpcklwd mm1, mm5 ; 17 07 16 06
movq mm0, [r1+2*20] ; 23 22 21 20
movq mm2, [r1+2*24] ; 27 26 25 24
movq mm6, mm3 ; 15 14 13 12
punpckhdq mm1, mm1 ; 17 07 17 07
punpcklwd mm6, mm2 ; 25 13 24 12
pextrw r2, mm5, 2 ; r2 = 18
movq [r0+2*24], mm0 ; 23 22 21 20
punpcklwd mm1, mm6 ; 24 17 12 07
movq [r0+2*12], mm1 ; 24 17 12 07
pinsrw mm3, r2, 0 ; 15 14 13 18
movq [r0+2*16], mm3 ; 15 14 13 18
; --- output words 20..23 and 32..35 ---
movq mm7, [r1+2*28] ; 31 30 29 28
movq mm0, [r1+2*32] ; 35 34 33 32
psrlq mm5, 48 ; XX XX XX 19
pshufw mm1, mm2, 011111001b ; 27 27 26 25
punpcklwd mm5, mm0 ; 33 XX 32 19
psrlq mm2, 48 ; XX XX XX 27
punpcklwd mm5, mm1 ; 26 32 25 19
movq [r0+2*32], mm7 ; 31 30 29 28
movq [r0+2*20], mm5 ; 26 32 25 19
; --- output words 28..31 and 40..43 ---
movq mm7, [r1+2*36] ; 39 38 37 36
movq mm1, [r1+2*40] ; 43 42 41 40
pshufw mm3, mm0, 011111001b ; 35 35 34 33
punpcklwd mm2, mm1 ; 41 XX 40 27
movq [r0+2*40], mm7 ; 39 38 37 36
punpcklwd mm2, mm3 ; 34 40 33 27
movq [r0+2*28], mm2 ; 34 40 33 27
; --- output words 36..39 and 44..51 ---
movq mm7, [r1+2*44] ; 47 46 45 44
movq mm2, [r1+2*48] ; 51 50 49 48
psrlq mm0, 48 ; XX XX XX 35
punpcklwd mm0, mm2 ; 49 XX 48 35
pshufw mm3, mm1, 011111001b ; 43 43 42 41
punpcklwd mm0, mm3 ; 42 48 41 35
movq [r0+2*36], mm0 ; 42 48 41 35
pextrw r2, mm2, 3 ; 51
psrlq mm1, 48 ; XX XX XX 43
punpcklwd mm1, mm7 ; 45 XX 44 43
psrlq mm2, 16 ; XX 51 50 49
punpcklwd mm1, mm2 ; 50 44 49 43
pshufw mm1, mm1, 010110100b ; 44 50 49 43
movq [r0+2*44], mm1 ; 44 50 49 43
psrlq mm7, 16 ; XX 47 46 45
pinsrw mm7, r2, 3 ; 51 47 46 45
movq [r0+2*48], mm7 ; 51 47 46 45
; --- output words 52..63 ---
movq mm0, [r1+2*56] ; 59 58 57 56
movq mm1, [r1+2*52] ; 55 54 53 52
movq mm2, mm0 ; 59 58 57 56
movq mm7, [r1+2*60] ; 63 62 61 60
punpckldq mm2, mm1 ; 53 52 57 56
punpckhdq mm1, mm0 ; 59 58 55 54
movq [r0+2*52], mm2 ; 53 52 57 56
movq [r0+2*56], mm1 ; 59 58 55 54
movq [r0+2*60], mm7 ; 63 62 61 60 (copied unchanged)
RET
;-----------------------------------------------------------------------------
......
......@@ -69,7 +69,7 @@ void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_field_ssse3 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] );
int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment