Commit 109ae085 authored by Loren Merritt

mmx deblocking.

2.5x faster deblocking functions, 1-4% overall.



git-svn-id: svn://svn.videolan.org/x264/trunk@341 df754926-b1dd-0310-bc7b-ec298dee348c
parent 16660150
@@ -21,7 +21,8 @@ SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \
common/i386/pixel-a.asm common/i386/mc-a.asm \
common/i386/mc-a2.asm common/i386/predict-a.asm \
common/i386/pixel-sse2.asm common/i386/quant-a.asm
common/i386/pixel-sse2.asm common/i386/quant-a.asm \
common/i386/deblock-a.asm
OBJASM = $(ASMSRC:%.asm=%.o)
endif
@@ -31,7 +32,8 @@ SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
common/amd64/pixel-a.asm common/amd64/mc-a.asm \
common/amd64/mc-a2.asm common/amd64/predict-a.asm \
common/amd64/pixel-sse2.asm common/amd64/quant-a.asm
common/amd64/pixel-sse2.asm common/amd64/quant-a.asm \
common/amd64/deblock-a.asm
OBJASM = $(ASMSRC:%.asm=%.o)
ASFLAGS += -Icommon/amd64
endif
;*****************************************************************************
;* deblock-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
BITS 64
%include "amd64inc.asm"
SECTION .rodata align=16
pb_01: times 16 db 0x01
pb_3f: times 16 db 0x3f
pb_ff: times 16 db 0xff
SECTION .text
cglobal x264_deblock_v_luma_sse2
cglobal x264_deblock_h_luma_sse2
cglobal x264_deblock_v_chroma_mmxext
cglobal x264_deblock_h_chroma_mmxext
cglobal x264_deblock_v_chroma_intra_mmxext
cglobal x264_deblock_h_chroma_intra_mmxext
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
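; (callers pass base3 = base+3*stride and stride3 = 3*stride, so the last entry is [base+7*stride])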
; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in mm0..mm3
%macro TRANSPOSE4x8_LOAD 8
movd mm0, %1
movd mm2, %2
movd mm1, %3
movd mm3, %4
punpcklbw mm0, mm2
punpcklbw mm1, mm3
movq mm2, mm0
punpcklwd mm0, mm1
punpckhwd mm2, mm1
movd mm4, %5
movd mm6, %6
movd mm5, %7
movd mm7, %8
punpcklbw mm4, mm6
punpcklbw mm5, mm7
movq mm6, mm4
punpcklwd mm4, mm5
punpckhwd mm6, mm5
movq mm1, mm0
movq mm3, mm2
punpckldq mm0, mm4
punpckhdq mm1, mm4
punpckldq mm2, mm6
punpckhdq mm3, mm6
%endmacro
; in: 4 rows of 8 bytes in mm0..mm3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
movq mm4, mm0
movq mm5, mm1
movq mm6, mm2
punpckhdq mm4, mm4
punpckhdq mm5, mm5
punpckhdq mm6, mm6
punpcklbw mm0, mm1
punpcklbw mm2, mm3
movq mm1, mm0
punpcklwd mm0, mm2
punpckhwd mm1, mm2
movd %1, mm0
punpckhdq mm0, mm0
movd %2, mm0
movd %3, mm1
punpckhdq mm1, mm1
movd %4, mm1
punpckhdq mm3, mm3
punpcklbw mm4, mm5
punpcklbw mm6, mm3
movq mm5, mm4
punpcklwd mm4, mm6
punpckhwd mm5, mm6
movd %5, mm4
punpckhdq mm4, mm4
movd %6, mm4
movd %7, mm5
punpckhdq mm5, mm5
movd %8, mm5
%endmacro
%macro SBUTTERFLY 4
movq %4, %2
punpckl%1 %2, %3
punpckh%1 %4, %3
%endmacro
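; (interleaves %2 and %3 at %1 granularity: %2 gets the low halves, %4 the high halves; %3 is unchanged)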
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
movq mm0, %1
movq mm1, %3
movq mm2, %5
movq mm3, %7
SBUTTERFLY bw, mm0, %2, mm4
SBUTTERFLY bw, mm1, %4, mm5
SBUTTERFLY bw, mm2, %6, mm6
movq [%9+0x10], mm5
SBUTTERFLY bw, mm3, %8, mm7
SBUTTERFLY wd, mm0, mm1, mm5
SBUTTERFLY wd, mm2, mm3, mm1
punpckhdq mm0, mm2
movq [%9+0x00], mm0
SBUTTERFLY wd, mm4, [%9+0x10], mm3
SBUTTERFLY wd, mm6, mm7, mm2
SBUTTERFLY dq, mm4, mm6, mm0
SBUTTERFLY dq, mm5, mm1, mm7
punpckldq mm3, mm2
movq [%9+0x10], mm5
movq [%9+0x20], mm7
movq [%9+0x30], mm4
movq [%9+0x40], mm0
movq [%9+0x50], mm3
%endmacro
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 6
mov%1 %6, %3
mov%1 %5, %2
psubusb %6, %2
psubusb %5, %3
por %5, %6
psubusb %5, %4
%endmacro
%macro DIFF_GT_MMX 5
DIFF_GT q, %1, %2, %3, %4, %5
%endmacro
%macro DIFF_GT_SSE2 5
DIFF_GT dqa, %1, %2, %3, %4, %5
%endmacro
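; |a-b| is built as (a-b saturated to 0) | (b-a saturated to 0), since MMX/SSE2 have no
; unsigned byte compare; the final psubusb leaves a nonzero byte (not 0xff) exactly where
; |%1-%2| > %3, which LOAD_MASK below turns into a proper mask.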
; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1
; out: mm5=beta-1, mm7=mask
; clobbers: mm4,mm6
%macro LOAD_MASK_MMX 2
movd mm4, %1
movd mm5, %2
pshufw mm4, mm4, 0
pshufw mm5, mm5, 0
packuswb mm4, mm4 ; 8x alpha-1
packuswb mm5, mm5 ; 8x beta-1
DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1
DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1
por mm7, mm4
DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1
por mm7, mm4
pxor mm6, mm6
pcmpeqb mm7, mm6
%endmacro
%macro LOAD_MASK_SSE2 2
movd xmm4, %1
movd xmm5, %2
pshuflw xmm4, xmm4, 0
pshuflw xmm5, xmm5, 0
punpcklqdq xmm4, xmm4
punpcklqdq xmm5, xmm5
packuswb xmm4, xmm4 ; 16x alpha-1
packuswb xmm5, xmm5 ; 16x beta-1
DIFF_GT_SSE2 xmm1, xmm2, xmm4, xmm7, xmm6 ; |p0-q0| > alpha-1
DIFF_GT_SSE2 xmm0, xmm1, xmm5, xmm4, xmm6 ; |p1-p0| > beta-1
por xmm7, xmm4
DIFF_GT_SSE2 xmm3, xmm2, xmm5, xmm4, xmm6 ; |q1-q0| > beta-1
por xmm7, xmm4
pxor xmm6, xmm6
pcmpeqb xmm7, xmm6
%endmacro
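; i.e. mm7/xmm7 = 0xff for pels where |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
; (the same edge-strength test as the C reference), 0 elsewhere.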
; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
; out: mm1=p0' mm2=q0'
; clobbers: mm0,3-6
%macro DEBLOCK_P0_Q0 2
; a = q0^p0^((p1-q1)>>2)
mov%1 %2m4, %2m0
psubb %2m4, %2m3
psrlw %2m4, 2
pxor %2m4, %2m1
pxor %2m4, %2m2
; b = p0^(q1>>2)
psrlw %2m3, 2
pand %2m3, [pb_3f GLOBAL]
mov%1 %2m5, %2m1
pxor %2m5, %2m3
; c = q0^(p1>>2)
psrlw %2m0, 2
pand %2m0, [pb_3f GLOBAL]
mov%1 %2m6, %2m2
pxor %2m6, %2m0
; d = (c^b) & ~(b^a) & 1
pxor %2m6, %2m5
pxor %2m5, %2m4
pandn %2m5, %2m6
pand %2m5, [pb_01 GLOBAL]
; delta = (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3
; = (avg(q0, p1>>2) + (d&a))
; - (avg(p0, q1>>2) + (d^(d&a)))
pavgb %2m0, %2m2
pand %2m4, %2m5
paddusb %2m0, %2m4
pavgb %2m3, %2m1
pxor %2m4, %2m5
paddusb %2m3, %2m4
; p0 += clip(delta, -tc0, tc0)
; q0 -= clip(delta, -tc0, tc0)
mov%1 %2m4, %2m0
psubusb %2m0, %2m3
psubusb %2m3, %2m4
pminub %2m0, %2m7
pminub %2m3, %2m7
paddusb %2m1, %2m0
paddusb %2m2, %2m3
psubusb %2m1, %2m3
psubusb %2m2, %2m0
%endmacro
%macro DEBLOCK_P0_Q0_MMX 0
DEBLOCK_P0_Q0 q, m
%endmacro
%macro DEBLOCK_P0_Q0_SSE2 0
DEBLOCK_P0_Q0 dqa, xm
%endmacro
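; scalar equivalent (cf. the C reference later in this commit):
;   delta = clip( ((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc )
;   p0' = clip_uint8( p0 + delta ),  q0' = clip_uint8( q0 - delta )
; here mm7/xmm7 already holds tc ANDed with the filter mask, and the final clip to
; [0,255] falls out of the unsigned saturating adds/subtracts.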
; in: mm1=p0 mm2=q0
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1_SSE2 6
movdqa %6, xmm1
pavgb %6, xmm2
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
movdqa %6, %1
psubusb %6, %5
paddusb %5, %1
pmaxub %2, %6
pminub %2, %5
movdqa %4, %2
%endmacro
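; pavgb rounds up ((a+b+1)>>1), so subtracting (p2^avg(p0,q0))&1 converts the second
; average to the round-down form, making the net result exactly (p2+((p0+q0+1)>>1))>>1.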
SECTION .text
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
x264_deblock_v_luma_sse2:
; rdi = pix
movsxd rsi, esi ; stride
dec edx ; alpha-1
dec ecx ; beta-1
movd xmm8, [r8] ; tc0
mov r8, rdi
sub r8, rsi
sub r8, rsi
sub r8, rsi ; pix-3*stride
movdqa xmm0, [r8+rsi] ; p1
movdqa xmm1, [r8+2*rsi] ; p0
movdqa xmm2, [rdi] ; q0
movdqa xmm3, [rdi+rsi] ; q1
LOAD_MASK_SSE2 edx, ecx
punpcklbw xmm8, xmm8
punpcklbw xmm8, xmm8 ; xmm8 = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
movdqa xmm9, [pb_ff GLOBAL]
pcmpeqb xmm9, xmm8
pandn xmm9, xmm7
pand xmm8, xmm9
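; xmm9 = filter mask & (tc0 != -1), xmm8 = tc0 under that mask (tc0 < 0 means skip, as in the C reference)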
movdqa xmm3, [r8] ; p2
DIFF_GT_SSE2 xmm1, xmm3, xmm5, xmm6, xmm7 ; |p2-p0| > beta-1
pandn xmm6, xmm9
pcmpeqb xmm6, xmm9
pand xmm6, xmm9
movdqa xmm7, [pb_01 GLOBAL]
pand xmm7, xmm6
pand xmm6, xmm8
paddb xmm7, xmm8
LUMA_Q1_SSE2 xmm0, xmm3, [r8], [r8+rsi], xmm6, xmm4
movdqa xmm4, [rdi+2*rsi] ; q2
DIFF_GT_SSE2 xmm2, xmm4, xmm5, xmm6, xmm3 ; |q2-q0| > beta-1
pandn xmm6, xmm9
pcmpeqb xmm6, xmm9
pand xmm6, xmm9
pand xmm8, xmm6
pand xmm6, [pb_01 GLOBAL]
paddb xmm7, xmm6
movdqa xmm3, [rdi+rsi]
LUMA_Q1_SSE2 xmm3, xmm4, [rdi+2*rsi], [rdi+rsi], xmm8, xmm6
DEBLOCK_P0_Q0_SSE2
movdqa [r8+2*rsi], xmm1
movdqa [rdi], xmm2
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
x264_deblock_h_luma_sse2:
movsxd r10, esi
lea r11, [r10+r10*2]
lea rax, [rdi-4]
lea r9, [rdi-4+r11]
%define pix_tmp rsp-104 ; 16x6 for the buffer + 8 for x264_deblock_v_luma_sse2's return address
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
lea rax, [rax+r10*8]
lea r9, [r9 +r10*8]
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in edx, ecx, r8
; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
lea rdi, [pix_tmp+0x30]
mov esi, 0x10
call x264_deblock_v_luma_sse2
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add rax, 2
add r9, 2
movq mm0, [pix_tmp+0x18]
movq mm1, [pix_tmp+0x28]
movq mm2, [pix_tmp+0x38]
movq mm3, [pix_tmp+0x48]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
shl r10, 3
sub rax, r10
sub r9, r10
shr r10, 3
movq mm0, [pix_tmp+0x10]
movq mm1, [pix_tmp+0x20]
movq mm2, [pix_tmp+0x30]
movq mm3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
ret
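For reference, the transpose trick above (gather a 16x6 strip into a stack buffer, run the vertical filter on it with stride 16, write the 4 changed rows back) looks roughly like this in scalar C, expressed with the deblock_luma_c reference function added later in this commit. It assumes that function steps pix by ystride per filtered pel; the tail of deblock_luma_c is truncated in the diff below, so that detail is an assumption.
/* rough scalar sketch of x264_deblock_h_luma_sse2's strategy */
static void deblock_h_luma_via_transpose( uint8_t *pix, int stride,
                                          int alpha, int beta, int8_t *tc0 )
{
    uint8_t tmp[6*16];   /* 6 rows (p2..q2) of 16 pels, like pix_tmp above */
    int i, j;
    /* transpose: column j-3 of each source row becomes row j of tmp */
    for( i = 0; i < 16; i++ )
        for( j = 0; j < 6; j++ )
            tmp[j*16 + i] = pix[i*stride + j - 3];
    /* vertical filter on the transposed block, stride 16 */
    deblock_luma_c( tmp + 3*16, 16, 1, alpha, beta, tc0 );
    /* only p1,p0,q0,q1 (rows 1..4 of tmp) can have changed */
    for( i = 0; i < 16; i++ )
        for( j = 1; j < 5; j++ )
            pix[i*stride + j - 3] = tmp[j*16 + i];
}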
%macro CHROMA_V_START 0
; rdi = pix
movsxd rsi, esi ; stride
dec edx ; alpha-1
dec ecx ; beta-1
mov rax, rdi
sub rax, rsi
sub rax, rsi
%endmacro
%macro CHROMA_H_START 0
movsxd rsi, esi
dec edx
dec ecx
sub rdi, 2
lea r9, [rsi+rsi*2]
mov rax, rdi
add rdi, r9
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
x264_deblock_v_chroma_mmxext:
CHROMA_V_START
movq mm0, [rax]
movq mm1, [rax+rsi]
movq mm2, [rdi]
movq mm3, [rdi+rsi]
LOAD_MASK_MMX edx, ecx
movd mm6, [r8] ; tc0
punpcklbw mm6, mm6
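; mm6 = 2x tc0[0], 2x tc0[1], 2x tc0[2], 2x tc0[3] (each tc0 covers a pair of chroma pels)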
pand mm7, mm6
DEBLOCK_P0_Q0_MMX
movq [rax+rsi], mm1
movq [rdi], mm2
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
x264_deblock_h_chroma_mmxext:
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
movq [rsp-8], mm0
movq [rsp-16], mm3
LOAD_MASK_MMX edx, ecx
movd mm6, [r8] ; tc0
punpcklbw mm6, mm6
pand mm7, mm6
DEBLOCK_P0_Q0_MMX
movq mm0, [rsp-8]
movq mm3, [rsp-16]
TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
ret
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
movq mm4, %1
pxor mm4, %3
pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, mm4
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
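; as above, the &1 term compensates for pavgb's round-up so that the nested
; averages come out to exactly (p0 + q1 + 2*p1 + 2) >> 2.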
%macro CHROMA_INTRA_BODY 0
LOAD_MASK_MMX edx, ecx
movq mm5, mm1
movq mm6, mm2
CHROMA_INTRA_P0 mm1, mm0, mm3
CHROMA_INTRA_P0 mm2, mm3, mm0
psubb mm1, mm5
psubb mm2, mm6
pand mm1, mm7
pand mm2, mm7
paddb mm1, mm5
paddb mm2, mm6
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
x264_deblock_v_chroma_intra_mmxext:
CHROMA_V_START
movq mm0, [rax]
movq mm1, [rax+rsi]
movq mm2, [rdi]
movq mm3, [rdi+rsi]
CHROMA_INTRA_BODY
movq [rax+rsi], mm1
movq [rdi], mm2
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
x264_deblock_h_chroma_intra_mmxext:
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
CHROMA_INTRA_BODY
TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
ret
@@ -497,6 +497,7 @@ struct x264_t
x264_dct_function_t dctf;
x264_csp_function_t csp;
x264_quant_function_t quantf;
x264_deblock_function_t loopf;
/* vlc table for decoding purpose only */
x264_vlc_table_t *x264_coeff_token_lookup[5];
@@ -293,355 +293,191 @@ static inline int clip_uint8( int a )
return a;
}
static inline void deblocking_filter_edgev( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
int i, d;
const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
const int alpha = i_alpha_table[i_index_a];
const int beta = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
for( i = 0; i < 4; i++ )
{
if( bS[i] == 0 )
{
pix += 4 * i_pix_stride;
for( i = 0; i < 4; i++ ) {
if( tc0[i] < 0 ) {
pix += 4*ystride;
continue;
}
if( bS[i] < 4 )
{
const int tc0 = i_tc0_table[i_index_a][bS[i] - 1];
/* 4px edge length */
for( d = 0; d < 4; d++ )
{
const int p0 = pix[-1];
const int p1 = pix[-2];
const int p2 = pix[-3];
const int q0 = pix[0];
const int q1 = pix[1];
const int q2 = pix[2];
if( abs( p0 - q0 ) < alpha &&
abs( p1 - p0 ) < beta &&
abs( q1 - q0 ) < beta )
{
int tc = tc0;
int i_delta;
if( abs( p2 - p0 ) < beta )
{
pix[-2] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
tc++;
}
if( abs( q2 - q0 ) < beta )
{
pix[1] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
tc++;
}
i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */
pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
for( d = 0; d < 4; d++ ) {
const int p2 = pix[-3*xstride];
const int p1 = pix[-2*xstride];
const int p0 = pix[-1*xstride];
const int q0 = pix[ 0*xstride];
const int q1 = pix[ 1*xstride];
const int q2 = pix[ 2*xstride];
if( X264_ABS( p0 - q0 ) < alpha &&
X264_ABS( p1 - p0 ) < beta &&
X264_ABS( q1 - q0 ) < beta ) {
int tc = tc0[i];
int delta;
if( X264_ABS( p2 - p0 ) < beta ) {
pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
tc++;
}
pix += i_pix_stride;
}
}
else
{
/* 4px edge length */
for( d = 0; d < 4; d++ )
{
const int p0 = pix[-1];
const int p1 = pix[-2];
const int p2 = pix[-3];
const int q0 = pix[0];
const int q1 = pix[1];
const int q2 = pix[2];