Commit cb8b5dad authored by Loren Merritt

simplify satd_sse2


git-svn-id: svn://svn.videolan.org/x264/trunk@578 df754926-b1dd-0310-bc7b-ec298dee348c
parent 04834f59
@@ -4,6 +4,7 @@
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;* Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -55,6 +56,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_sse2
cglobal x264_pixel_ssim_4x4x2_core_sse2
cglobal x264_pixel_ssim_end4_sse2
%macro HADDD 2 ; sum junk
movhlps %2, %1
paddd %1, %2
@@ -86,13 +88,6 @@ cglobal x264_pixel_ssim_end4_sse2
paddw xmm0, xmm3
%endmacro
%macro SAD_START_SSE2 0
; mov rdi, rdi ; pix1
movsxd rsi, esi ; stride1
; mov rdx, rdx ; pix2
movsxd rcx, ecx ; stride2
%endmacro
%macro SAD_END_SSE2 0
movdqa xmm1, xmm0
psrldq xmm0, 8
@@ -106,7 +101,6 @@ ALIGN 16
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x16_sse2:
SAD_START_SSE2
movdqu xmm0, [rdx]
movdqu xmm1, [rdx+rcx]
lea rdx, [rdx+2*rcx]
@@ -175,7 +169,6 @@ ALIGN 16
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x8_sse2:
SAD_START_SSE2
pxor xmm0, xmm0
SAD_INC_4x16P_SSE2
SAD_INC_4x16P_SSE2
@@ -216,19 +209,7 @@ x264_pixel_sad_16x8_sse2:
paddd xmm0, xmm3
%endmacro
%macro SSD_INC_8x16P_SSE2 0
SSD_INC_2x16P_SSE2
SSD_INC_2x16P_SSE2
SSD_INC_2x16P_SSE2
SSD_INC_2x16P_SSE2
%endmacro
%macro SSD_START_SSE2 0
; mov rdi, rdi ; pix1
movsxd rsi, esi ; stride1
; mov rdx, rdx ; pix2
movsxd rcx, ecx ; stride2
pxor xmm7, xmm7 ; zero
pxor xmm0, xmm0 ; mm0 holds the sum
%endmacro
@@ -245,8 +226,9 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x16_sse2:
SSD_START_SSE2
SSD_INC_8x16P_SSE2
SSD_INC_8x16P_SSE2
%rep 8
SSD_INC_2x16P_SSE2
%endrep
SSD_END_SSE2
ALIGN 16
@@ -255,118 +237,85 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x8_sse2:
SSD_START_SSE2
SSD_INC_8x16P_SSE2
%rep 4
SSD_INC_2x16P_SSE2
%endrep
SSD_END_SSE2
; %1=(row2, row0) %2=(row3, row1) %3=junk
; output in %1=(row3, row0) and %3=(row2, row1)
%macro HADAMARD4x4_SSE2 3
movdqa %3, %1
paddw %1, %2
psubw %3, %2
movdqa %2, %1
punpcklqdq %1, %3
punpckhqdq %2, %3
movdqa %3, %1
paddw %1, %2
psubw %3, %2
%macro SUMSUB_BADC 4
paddw %1, %2
paddw %3, %4
paddw %2, %2
paddw %4, %4
psubw %2, %1
psubw %4, %3
%endmacro
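
For reference, SUMSUB_BADC is the basic butterfly the new code is built from: it returns the sum and the difference of two register pairs at once, and doubling the second operand before the subtraction is what avoids a scratch register. A scalar sketch of one 16-bit lane (function name and layout are mine, not x264 source):

    #include <stdint.h>

    /* One 16-bit lane of SUMSUB_BADC: (a,b,c,d) -> (a+b, b-a, c+d, d-c),
     * with wraparound arithmetic like paddw/psubw. */
    static void sumsub_badc(int16_t *a, int16_t *b, int16_t *c, int16_t *d)
    {
        int16_t sa = (int16_t)(*a + *b);     /* paddw %1, %2 */
        int16_t sc = (int16_t)(*c + *d);     /* paddw %3, %4 */
        *b = (int16_t)(2 * *b - sa);         /* paddw %2, %2 ; psubw %2, %1 -> b-a */
        *d = (int16_t)(2 * *d - sc);         /* paddw %4, %4 ; psubw %4, %3 -> d-c */
        *a = sa;
        *c = sc;
    }
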
;;; two HADAMARD4x4_SSE2 running side-by-side
%macro HADAMARD4x4_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
movdqa %3, %1
movdqa %6, %4
paddw %1, %2
paddw %4, %5
psubw %3, %2
psubw %6, %5
movdqa %2, %1
movdqa %5, %4
punpcklqdq %1, %3
punpcklqdq %4, %6
punpckhqdq %2, %3
punpckhqdq %5, %6
movdqa %3, %1
movdqa %6, %4
paddw %1, %2
paddw %4, %5
psubw %3, %2
psubw %6, %5
%macro HADAMARD1x4 4
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %1, %3, %2, %4
%endmacro
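
Two SUMSUB_BADC passes make HADAMARD1x4: up to the sign and ordering of the outputs this is the unnormalized 4-point Hadamard transform, which is all SATD needs since only absolute values are summed. A scalar sketch (illustration only):

    /* Scalar model of HADAMARD1x4 applied to one lane of four registers. */
    static void hadamard1x4(int v[4])
    {
        int a = v[0], b = v[1], c = v[2], d = v[3];
        int s1 = a + b, d1 = b - a;      /* first  SUMSUB_BADC: pairs (a,b),(c,d) */
        int s2 = c + d, d2 = d - c;
        v[0] = s1 + s2;                  /* second SUMSUB_BADC: pairs (a,c),(b,d) */
        v[2] = s2 - s1;
        v[1] = d1 + d2;
        v[3] = d2 - d1;
    }
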
%macro TRANSPOSE4x4_TWIST_SSE2 3 ; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
movdqa %3, %1
punpcklwd %1, %2
punpckhwd %2, %3 ; backwards because the high quadwords are already swapped
%macro HADAMARD1x8 8
SUMSUB_BADC %1, %5, %2, %6
SUMSUB_BADC %3, %7, %4, %8
SUMSUB_BADC %1, %3, %2, %4
SUMSUB_BADC %5, %7, %6, %8
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %5, %6, %7, %8
%endmacro
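
HADAMARD1x8 extends the same pattern to eight registers: three SUMSUB_BADC rounds with pair distances 4, 2 and 1 amount to an 8-point Walsh-Hadamard transform, again only up to output sign and order, which the absolute-value sum in sa8d does not see. An equivalent scalar loop (a sketch, not x264 code):

    /* Radix-2 Walsh-Hadamard transform of 8 values: butterfly rounds with
     * strides 4, 2, 1, matching the three SUMSUB_BADC rounds of the macro. */
    static void hadamard1x8(int v[8])
    {
        for (int stride = 4; stride > 0; stride >>= 1)
            for (int i = 0; i < 8; i++)
                if (!(i & stride)) {
                    int a = v[i], b = v[i + stride];
                    v[i]          = a + b;
                    v[i + stride] = a - b;
                }
    }
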
movdqa %3, %1
punpckldq %1, %2
punpckhdq %3, %2
%macro SBUTTERFLY 5
mov%1 %5, %3
punpckl%2 %3, %4
punpckh%2 %5, %4
%endmacro
movdqa %2, %1
punpcklqdq %1, %3
punpckhqdq %2, %3
%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers
mov%1 %5, %3
punpckh%2 %3, %4
punpckl%2 %5, %4
%endmacro
;;; two TRANSPOSE4x4_TWIST_SSE2 running side-by-side
%macro TRANSPOSE4x4_TWIST_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
movdqa %3, %1
movdqa %6, %4
punpcklwd %1, %2
punpcklwd %4, %5
punpckhwd %2, %3
punpckhwd %5, %6
movdqa %3, %1
movdqa %6, %4
punpckldq %1, %2
punpckldq %4, %5
punpckhdq %3, %2
punpckhdq %6, %5
movdqa %2, %1
movdqa %5, %4
punpcklqdq %1, %3
punpcklqdq %4, %6
punpckhqdq %2, %3
punpckhqdq %5, %6
%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
SBUTTERFLY dqa, dq, %1, %2, %5
SBUTTERFLY dqa, dq, %3, %4, %2
SBUTTERFLY dqa, qdq, %1, %3, %4
SBUTTERFLY dqa, qdq, %5, %2, %3
%endmacro
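
SBUTTERFLY is just a named punpckl/punpckh pair, and TRANSPOSE4x4D chains four of them: one interleave round on dwords plus one on qwords is a complete 4x4 transpose, with the output rows landing in permuted registers as the "ABCD-T -> ADTC" comment records. A scalar model of the data movement (types and names are mine):

    #include <stdint.h>

    typedef struct { uint32_t e[4]; } v4;   /* one xmm register as 4 dwords */

    static v4 unpacklo_dq(v4 a, v4 b) { return (v4){{ a.e[0], b.e[0], a.e[1], b.e[1] }}; }
    static v4 unpackhi_dq(v4 a, v4 b) { return (v4){{ a.e[2], b.e[2], a.e[3], b.e[3] }}; }

    /* 4x4 transpose via two interleave rounds; register renaming ignored. */
    static void transpose4x4d(v4 *a, v4 *b, v4 *c, v4 *d)
    {
        v4 t0 = unpacklo_dq(*a, *b), t1 = unpackhi_dq(*a, *b);  /* SBUTTERFLY dq */
        v4 t2 = unpacklo_dq(*c, *d), t3 = unpackhi_dq(*c, *d);
        *a = (v4){{ t0.e[0], t0.e[1], t2.e[0], t2.e[1] }};      /* punpcklqdq    */
        *b = (v4){{ t0.e[2], t0.e[3], t2.e[2], t2.e[3] }};      /* punpckhqdq    */
        *c = (v4){{ t1.e[0], t1.e[1], t3.e[0], t3.e[1] }};
        *d = (v4){{ t1.e[2], t1.e[3], t3.e[2], t3.e[3] }};
    }
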
;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
;;; destroys xmm2, 3
;;; the value in xmm7 doesn't matter: it's only subtracted from itself
%macro LOAD4x8_DIFF_SSE2 0
movq xmm0, [rdi]
movq xmm4, [rdx]
punpcklbw xmm0, xmm7
punpcklbw xmm4, xmm7
psubw xmm0, xmm4
movq xmm1, [rdi+rsi]
movq xmm5, [rdx+rcx]
lea rdi, [rdi+2*rsi]
lea rdx, [rdx+2*rcx]
punpcklbw xmm1, xmm7
punpcklbw xmm5, xmm7
psubw xmm1, xmm5
movq xmm2, [rdi]
movq xmm4, [rdx]
punpcklbw xmm2, xmm7
punpcklbw xmm4, xmm7
psubw xmm2, xmm4
movdqa xmm4, xmm0
punpcklqdq xmm0, xmm2 ; rows 0 and 2
punpckhqdq xmm4, xmm2 ; next 4x4 rows 0 and 2
movq xmm3, [rdi+rsi]
movq xmm5, [rdx+rcx]
lea rdi, [rdi+2*rsi]
lea rdx, [rdx+2*rcx]
punpcklbw xmm3, xmm7
punpcklbw xmm5, xmm7
psubw xmm3, xmm5
movdqa xmm5, xmm1
punpcklqdq xmm1, xmm3 ; rows 1 and 3
punpckhqdq xmm5, xmm3 ; next 4x4 rows 1 and 3
%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
SBUTTERFLY dqa, wd, %1, %2, %5
SBUTTERFLY dqa, wd, %3, %4, %2
SBUTTERFLY dqa, dq, %1, %3, %4
SBUTTERFLY2 dqa, dq, %5, %2, %3
SBUTTERFLY dqa, qdq, %1, %3, %2
SBUTTERFLY2 dqa, qdq, %4, %5, %3
%endmacro
%macro TRANSPOSE8x8 9 ; ABCDEFGH-T -> AFHDTECB
SBUTTERFLY dqa, wd, %1, %2, %9
SBUTTERFLY dqa, wd, %3, %4, %2
SBUTTERFLY dqa, wd, %5, %6, %4
SBUTTERFLY dqa, wd, %7, %8, %6
SBUTTERFLY dqa, dq, %1, %3, %8
SBUTTERFLY dqa, dq, %9, %2, %3
SBUTTERFLY dqa, dq, %5, %7, %2
SBUTTERFLY dqa, dq, %4, %6, %7
SBUTTERFLY dqa, qdq, %1, %5, %6
SBUTTERFLY dqa, qdq, %9, %4, %5
SBUTTERFLY dqa, qdq, %8, %2, %4
SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
movq %1, %3
movq %2, %4
punpcklbw %1, %2
punpcklbw %2, %2
psubw %1, %2
%endmacro
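
LOAD_DIFF_8P widens each block against the other block instead of against a zero register: interleaving pix1 with pix2, and pix2 with itself, puts the same pix2 byte in the high half of both words, so the word subtraction leaves exactly pix1 - pix2. One lane of the trick in scalar form (illustration only):

    #include <stdint.h>

    /* One 16-bit lane of LOAD_DIFF_8P; the identical high bytes cancel. */
    static int16_t load_diff_lane(uint8_t p1, uint8_t p2)
    {
        uint16_t a = (uint16_t)(p1 | (p2 << 8));   /* punpcklbw %1, %2 */
        uint16_t b = (uint16_t)(p2 | (p2 << 8));   /* punpcklbw %2, %2 */
        return (int16_t)(a - b);                   /* psubw %1, %2  ==  p1 - p2 */
    }
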
%macro SUM1x8_SSE2 3 ; 01 junk sum
@@ -410,19 +359,23 @@ x264_pixel_ssd_16x8_sse2:
%endmacro
%macro SATD_TWO_SSE2 0
LOAD4x8_DIFF_SSE2
HADAMARD4x4_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
TRANSPOSE4x4_TWIST_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
HADAMARD4x4_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
SUM4x4_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
LOAD_DIFF_8P xmm0, xmm4, [parm1q], [parm3q]
LOAD_DIFF_8P xmm1, xmm5, [parm1q+parm2q], [parm3q+parm4q]
LOAD_DIFF_8P xmm2, xmm4, [parm1q+2*parm2q], [parm3q+2*parm4q]
LOAD_DIFF_8P xmm3, xmm5, [parm1q+r10], [parm3q+r11]
lea parm1q, [parm1q+4*parm2q]
lea parm3q, [parm3q+4*parm4q]
HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
SUM4x4_TWO_SSE2 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
%endmacro
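
Taken together, SATD_TWO_SSE2 now covers an 8x4 strip per call: load four rows of differences spanning two adjacent 4x4 blocks, Hadamard-transform along one axis, transpose both 4x4 word blocks at once, transform along the other axis, and accumulate absolute values into xmm6. For orientation, a scalar reference of the 4x4 kernel being vectorized (a sketch, not the x264 C code; x264 additionally halves the accumulated total when reporting SATD):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar 4x4 SATD kernel: differences, row and column 4-point Hadamard
     * passes, then the sum of absolute transform coefficients. */
    static int satd_4x4_ref(const uint8_t *p1, int i1, const uint8_t *p2, int i2)
    {
        int d[4][4], sum = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[y][x] = p1[y*i1+x] - p2[y*i2+x];
        for (int y = 0; y < 4; y++) {                /* transform rows */
            int a = d[y][0]+d[y][1], b = d[y][0]-d[y][1];
            int c = d[y][2]+d[y][3], e = d[y][2]-d[y][3];
            d[y][0] = a+c; d[y][1] = a-c; d[y][2] = b+e; d[y][3] = b-e;
        }
        for (int x = 0; x < 4; x++) {                /* transform columns */
            int a = d[0][x]+d[1][x], b = d[0][x]-d[1][x];
            int c = d[2][x]+d[3][x], e = d[2][x]-d[3][x];
            sum += abs(a+c) + abs(a-c) + abs(b+e) + abs(b-e);
        }
        return sum;
    }
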
%macro SATD_START 0
; mov rdi, rdi ; pix1
movsxd rsi, esi ; stride1
; mov rdx, rdx ; pix2
movsxd rcx, ecx ; stride2
pxor xmm6, xmm6
lea r10, [3*parm2q]
lea r11, [3*parm4q]
%endmacro
%macro SATD_END 0
@@ -514,63 +467,6 @@ x264_pixel_satd_8x4_sse2:
SATD_END
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
movq %1, %3
movq %2, %4
punpcklbw %1, %2
punpcklbw %2, %2
psubw %1, %2
%endmacro
%macro SBUTTERFLY 5
mov%1 %5, %3
punpckl%2 %3, %4
punpckh%2 %5, %4
%endmacro
%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
SBUTTERFLY dqa, dq, %1, %2, %5
SBUTTERFLY dqa, dq, %3, %4, %2
SBUTTERFLY dqa, qdq, %1, %3, %4
SBUTTERFLY dqa, qdq, %5, %2, %3
%endmacro
;-----------------------------------------------------------------------------
; input ABCDEFGH output AFHDTECB
;-----------------------------------------------------------------------------
%macro TRANSPOSE8x8 9
SBUTTERFLY dqa, wd, %1, %2, %9
SBUTTERFLY dqa, wd, %3, %4, %2
SBUTTERFLY dqa, wd, %5, %6, %4
SBUTTERFLY dqa, wd, %7, %8, %6
SBUTTERFLY dqa, dq, %1, %3, %8
SBUTTERFLY dqa, dq, %9, %2, %3
SBUTTERFLY dqa, dq, %5, %7, %2
SBUTTERFLY dqa, dq, %4, %6, %7
SBUTTERFLY dqa, qdq, %1, %5, %6
SBUTTERFLY dqa, qdq, %9, %4, %5
SBUTTERFLY dqa, qdq, %8, %2, %4
SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro
%macro SUMSUB_BADC 4
paddw %1, %2
paddw %3, %4
paddw %2, %2
paddw %4, %4
psubw %2, %1
psubw %4, %3
%endmacro
%macro HADAMARD1x8 8
SUMSUB_BADC %1, %5, %2, %6
SUMSUB_BADC %3, %7, %4, %8
SUMSUB_BADC %1, %3, %2, %4
SUMSUB_BADC %5, %7, %6, %8
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %5, %6, %7, %8
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
@@ -4,6 +4,7 @@
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;* Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -57,20 +58,16 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2
cglobal x264_pixel_ssim_end4_sse2
%macro SBUTTERFLY 5
mov%1 %5, %3
punpckl%2 %3, %4
punpckh%2 %5, %4
%endmacro
%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
SBUTTERFLY dqa, dq, %1, %2, %5
SBUTTERFLY dqa, dq, %3, %4, %2
SBUTTERFLY dqa, qdq, %1, %3, %4
SBUTTERFLY dqa, qdq, %5, %2, %3
%macro HADDW 2 ; sum junk
; ebx is no longer used at this point, so no push needed
picgetgot ebx
pmaddwd %1, [pw_1 GOT_ebx]
movhlps %2, %1
paddd %1, %2
pshuflw %2, %1, 0xE
paddd %1, %2
%endmacro
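
On x86_32 the final horizontal reduction uses pmaddwd against a constant vector of ones (hence the picgetgot/GOT addressing for pw_1): that collapses eight 16-bit partial sums into four 32-bit lanes before the usual dword folds. A scalar model of the reduction (illustration only):

    #include <stdint.h>

    /* Scalar model of HADDW: pmaddwd with pw_1 adds adjacent signed words
     * into dwords; two more folds leave the total in the low dword. */
    static int32_t haddw_ref(const int16_t w[8])
    {
        int32_t d[4];
        for (int i = 0; i < 4; i++)
            d[i] = (int32_t)w[2*i] + w[2*i + 1];   /* pmaddwd with pw_1   */
        d[0] += d[2];                              /* movhlps + paddd     */
        d[1] += d[3];
        return d[0] + d[1];                        /* pshuflw 0xE + paddd */
    }
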
%macro SAD_INC_4x16P_SSE2 0
movdqu xmm1, [ecx]
movdqu xmm2, [ecx+edx]
@@ -425,115 +422,56 @@ x264_pixel_ssd_16x8_sse2:
%endrep
SSD_END_SSE2
; %1=(row2, row0) %2=(row3, row1) %3=junk
; output in %1=(row3, row0) and %3=(row2, row1)
%macro HADAMARD4x4_SSE2 3
movdqa %3, %1
paddw %1, %2
psubw %3, %2
movdqa %2, %1
punpcklqdq %1, %3
punpckhqdq %2, %3
movdqa %3, %1
paddw %1, %2
psubw %3, %2
%endmacro
;;; two HADAMARD4x4_SSE2 running side-by-side
%macro HADAMARD4x4_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
movdqa %3, %1
movdqa %6, %4
paddw %1, %2
paddw %4, %5
psubw %3, %2
psubw %6, %5
movdqa %2, %1
movdqa %5, %4
punpcklqdq %1, %3
punpcklqdq %4, %6
punpckhqdq %2, %3
punpckhqdq %5, %6
movdqa %3, %1
movdqa %6, %4
paddw %1, %2
paddw %4, %5
psubw %3, %2
psubw %6, %5
%endmacro
%macro TRANSPOSE4x4_TWIST_SSE2 3 ; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
movdqa %3, %1
punpcklwd %1, %2
punpckhwd %2, %3 ; backwards because the high quadwords are already swapped
movdqa %3, %1
punpckldq %1, %2
punpckhdq %3, %2
%macro SUMSUB_BADC 4
paddw %1, %2
paddw %3, %4
paddw %2, %2
paddw %4, %4
psubw %2, %1
psubw %4, %3
%endmacro
%macro HADAMARD1x4 4
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %1, %3, %2, %4
%endmacro
%macro SBUTTERFLY 5
mov%1 %5, %3
punpckl%2 %3, %4
punpckh%2 %5, %4
%endmacro
%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers
mov%1 %5, %3
punpckh%2 %3, %4
punpckl%2 %5, %4
%endmacro
movdqa %2, %1
punpcklqdq %1, %3
punpckhqdq %2, %3
%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
SBUTTERFLY dqa, dq, %1, %2, %5
SBUTTERFLY dqa, dq, %3, %4, %2
SBUTTERFLY dqa, qdq, %1, %3, %4
SBUTTERFLY dqa, qdq, %5, %2, %3
%endmacro
;;; two TRANSPOSE4x4_TWIST_SSE2 running side-by-side
%macro TRANSPOSE4x4_TWIST_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
movdqa %3, %1
movdqa %6, %4
punpcklwd %1, %2
punpcklwd %4, %5
punpckhwd %2, %3
punpckhwd %5, %6
movdqa %3, %1
movdqa %6, %4
punpckldq %1, %2
punpckldq %4, %5
punpckhdq %3, %2
punpckhdq %6, %5
movdqa %2, %1
movdqa %5, %4
punpcklqdq %1, %3
punpcklqdq %4, %6
punpckhqdq %2, %3
punpckhqdq %5, %6
%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
SBUTTERFLY dqa, wd, %1, %2, %5
SBUTTERFLY dqa, wd, %3, %4, %2
SBUTTERFLY dqa, dq, %1, %3, %4
SBUTTERFLY2 dqa, dq, %5, %2, %3
SBUTTERFLY dqa, qdq, %1, %3, %2
SBUTTERFLY2 dqa, qdq, %4, %5, %3
%endmacro
;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
;;; destroys xmm2, 3
;;; the value in xmm7 doesn't matter: it's only subtracted from itself
%macro LOAD4x8_DIFF_SSE2 0
movq xmm0, [eax]
movq xmm4, [ecx]
punpcklbw xmm0, xmm7
punpcklbw xmm4, xmm7
psubw xmm0, xmm4
movq xmm1, [eax+ebx]
movq xmm5, [ecx+edx]
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
punpcklbw xmm1, xmm7
punpcklbw xmm5, xmm7
psubw xmm1, xmm5
movq xmm2, [eax]
movq xmm4, [ecx]
punpcklbw xmm2, xmm7
punpcklbw xmm4, xmm7
psubw xmm2, xmm4
movdqa xmm4, xmm0
punpcklqdq xmm0, xmm2 ; rows 0 and 2
punpckhqdq xmm4, xmm2 ; next 4x4 rows 0 and 2
movq xmm3, [eax+ebx]
movq xmm5, [ecx+edx]
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
punpcklbw xmm3, xmm7
punpcklbw xmm5, xmm7
psubw xmm3, xmm5
movdqa xmm5, xmm1
punpcklqdq xmm1, xmm3 ; rows 1 and 3
punpckhqdq xmm5, xmm3 ; next 4x4 rows 1 and 3
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
movq %1, %3
movq %2, %4
punpcklbw %1, %2
punpcklbw %2, %2
psubw %1, %2
%endmacro
%macro SUM4x4_SSE2 4 ; 02 13 junk sum
@@ -569,22 +507,20 @@ x264_pixel_ssd_16x8_sse2:
paddusw %7, %4
%endmacro
%macro HADDW 2 ; sum junk
; ebx is no longer used at this point, so no push needed
picgetgot ebx
pmaddwd %1, [pw_1 GOT_ebx]
movhlps %2, %1
paddd %1, %2
pshuflw %2, %1, 0xE
paddd %1, %2
%endmacro
%macro SATD_TWO_SSE2 0
LOAD4x8_DIFF_SSE2
HADAMARD4x4_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
TRANSPOSE4x4_TWIST_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
HADAMARD4x4_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
SUM4x4_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
LOAD_DIFF_8P xmm0, xmm4, [eax], [ecx]
LOAD_DIFF_8P xmm1, xmm5, [eax+ebx], [ecx+edx]
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
LOAD_DIFF_8P xmm2, xmm4, [eax], [ecx]
LOAD_DIFF_8P xmm3, xmm5, [eax+ebx], [ecx+edx]
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
SUM4x4_TWO_SSE2 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
%endmacro
%macro SATD_START 0
@@ -621,8 +557,8 @@ x264_pixel_satd_16x16_sse2:
mov eax, [esp+ 8]
mov ecx, [esp+16]
lea eax, [eax+8]
lea ecx, [ecx+8]
add eax, 8
add ecx, 8
SATD_TWO_SSE2
SATD_TWO_SSE2
@@ -657,8 +593,8 @@ x264_pixel_satd_16x8_sse2:
mov eax, [esp+ 8]
mov ecx, [esp+16]
lea eax, [eax+8]
lea ecx, [ecx+8]
add eax, 8
add ecx, 8
SATD_TWO_SSE2
SATD_TWO_SSE2