Commit 311c4bb1 authored by Fiona Glaser

Deduplicate asm constants, automate name prefixing

Auto-prefix global constants with x264_ in cextern.
Eliminate x264_ prefix from asm files; automate it in cglobal.
Deduplicate asm constants wherever possible to save data cache (move them to a new const-a.asm).
Remove x264_emms() entirely on non-x86 (don't even call an empty function).
Add cextern_naked for a non-prefixed cextern (used in checkasm).
parent cca478ed
......@@ -50,8 +50,8 @@ endif
# MMX/SSE optims
ifneq ($(AS),)
X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
cpu-a.asm dct-32.asm
X86SRC = $(X86SRC0:%=common/x86/%)
......
......@@ -22,7 +22,6 @@
*****************************************************************************/
#include "common.h"
#include "cpu.h"
#include <stdarg.h>
#include <ctype.h>
......
......@@ -110,6 +110,7 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
#include "dct.h"
#include "cabac.h"
#include "quant.h"
#include "cpu.h"
/****************************************************************************
* General functions
......
......@@ -324,13 +324,6 @@ uint32_t x264_cpu_detect( void )
#endif
#ifndef HAVE_MMX
void x264_emms( void )
{
}
#endif
int x264_cpu_num_processors( void )
{
#if !defined(HAVE_PTHREAD)
......
......@@ -23,7 +23,14 @@
uint32_t x264_cpu_detect( void );
int x264_cpu_num_processors( void );
void x264_emms( void );
void x264_cpu_emms( void );
void x264_cpu_sfence( void );
#ifdef HAVE_MMX
#define x264_emms() x264_cpu_emms()
#else
#define x264_emms()
#endif
#define x264_sfence x264_cpu_sfence
void x264_cpu_mask_misalign_sse( void );
/* kluge:
......
......@@ -24,13 +24,11 @@
%include "x86inc.asm"
SECTION_RODATA
SECTION .text
cextern x264_cabac_range_lps
cextern x264_cabac_transition
cextern x264_cabac_renorm_shift
cextern cabac_range_lps
cextern cabac_transition
cextern cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
%ifdef WIN64
......@@ -70,7 +68,7 @@ endstruc
%endif
%endmacro
cglobal x264_cabac_encode_decision_asm, 0,7
cglobal cabac_encode_decision_asm, 0,7
movifnidn t0, r0mp
movifnidn t1d, r1m
mov t5d, [t0+cb.range]
......@@ -78,8 +76,8 @@ cglobal x264_cabac_encode_decision_asm, 0,7
mov t3d, t5d
shr t5d, 6
movifnidn t2d, r2m
LOAD_GLOBAL t5d, x264_cabac_range_lps-4, t5, t6*4
LOAD_GLOBAL t4d, x264_cabac_transition, t2, t6*2
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t6*4
LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
shr t6d, 6
sub t3d, t5d
cmp t6d, t2d
......@@ -88,17 +86,17 @@ cglobal x264_cabac_encode_decision_asm, 0,7
cmovne t3d, t5d
cmovne t6d, t7d
mov [t0+cb.state+t1], t4b
;x264_cabac_encode_renorm
;cabac_encode_renorm
mov t4d, t3d
shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [t0+cb.queue]
mov [t0+cb.range], t4d
cmp t3d, 8
jl .update_queue_low
;x264_cabac_putbyte
;cabac_putbyte
; alive: t0=cb t3=queue t6=low
%ifdef WIN64
DECLARE_REG_TMP 3,4,1,0,2,5,6,10
......
;*****************************************************************************
;* const-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2010 x264 project
;*
;* Author: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
; Shared read-only constant table.
; Per the commit description, these values used to be duplicated in the
; individual asm files and were moved here so each is stored only once;
; the other files now reference them through cextern.
; NOTE(review): the `const` macro is provided by x86inc.asm (not visible in
; this view); presumably it emits a global, x264_-prefixed label in front of
; the data that follows the comma -- confirm against x86inc.asm.
%include "x86inc.asm"
SECTION_RODATA
; ---- byte data (db) ----
; pb_01: the byte pair 0,1 repeated 8 times (16 bytes).
const pb_01, times 8 db 0,1
; pb_0 / pb_a1 / pb_1 / pb_3: 16 copies of a single byte value.
const pb_0, times 16 db 0
const pb_a1, times 16 db 0xa1
const pb_1, times 16 db 1
const pb_3, times 16 db 3
; hsub_mul: the byte pair +1,-1 repeated 8 times (16 bytes).
const hsub_mul, times 8 db 1, -1
; pb_shuf8x8c: 16 byte indices, each of 0,2,4,6 repeated four times.
; NOTE(review): presumably a byte-shuffle control mask -- confirm at the use site.
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
; ---- word data (dw): 8 copies of one 16-bit value each ----
const pw_1, times 8 dw 1
const pw_2, times 8 dw 2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_16, times 8 dw 16
const pw_32, times 8 dw 32
const pw_64, times 8 dw 64
; pw_32_0: four words of 32 immediately followed by four words of 0; the second
; `times` line has no label of its own, so it is laid out directly after the first.
const pw_32_0, times 4 dw 32,
times 4 dw 0
const pw_8000, times 8 dw 0x8000
const pw_3fff, times 8 dw 0x3fff
; ---- dword data (dd): 4 copies of one 32-bit value each ----
const pd_1, times 4 dd 1
const pd_128, times 4 dd 128
; Word masks: 0x00ff and 0xff00, 8 copies each.
const pw_00ff, times 8 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
; pb_reverse: the 8 byte indices 7..0 in descending order (8 bytes only).
; NOTE(review): presumably a shuffle mask that reverses 8 bytes -- confirm at the use site.
const pb_reverse, db 7, 6, 5, 4, 3, 2, 1, 0
; sw_64: a single dword holding 64 (not replicated).
const sw_64, dd 64
......@@ -29,9 +29,9 @@ SECTION .text
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid, 5,7
cglobal cpu_cpuid, 5,7
push rbx
mov r11, r1
mov r10, r2
......@@ -49,10 +49,10 @@ cglobal x264_cpu_cpuid, 5,7
%else
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid_test( void )
; int cpu_cpuid_test( void )
; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid_test
cglobal cpu_cpuid_test
pushfd
push ebx
push ebp
......@@ -75,9 +75,9 @@ cglobal x264_cpu_cpuid_test
ret
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid, 0,6
cglobal cpu_cpuid, 0,6
mov eax, r0m
cpuid
mov esi, r1m
......@@ -91,9 +91,9 @@ cglobal x264_cpu_cpuid, 0,6
RET
;-----------------------------------------------------------------------------
; void x264_stack_align( void (*func)(void*), void *arg );
; void stack_align( void (*func)(void*), void *arg );
;-----------------------------------------------------------------------------
cglobal x264_stack_align
cglobal stack_align
push ebp
mov ebp, esp
sub esp, 8
......@@ -110,16 +110,23 @@ cglobal x264_stack_align
%endif
;-----------------------------------------------------------------------------
; void x264_emms( void )
; void cpu_emms( void )
;-----------------------------------------------------------------------------
cglobal x264_emms
; Executes EMMS (empty MMX state) and returns; takes no arguments.
; cpu.h declares this to C as x264_cpu_emms, and x264_emms() expands to a
; call of it only when HAVE_MMX is defined (otherwise it expands to nothing).
; Per the commit description, cglobal adds the x264_ prefix automatically.
cglobal cpu_emms
emms
ret
;-----------------------------------------------------------------------------
; void x264_cpu_mask_misalign_sse(void)
; void cpu_sfence( void )
;-----------------------------------------------------------------------------
cglobal x264_cpu_mask_misalign_sse
; Executes SFENCE (store fence) and returns; takes no arguments.
; cpu.h declares this to C as x264_cpu_sfence and aliases x264_sfence to it.
; Per the commit description, cglobal adds the x264_ prefix automatically.
cglobal cpu_sfence
sfence
ret
;-----------------------------------------------------------------------------
; void cpu_mask_misalign_sse( void )
;-----------------------------------------------------------------------------
cglobal cpu_mask_misalign_sse
sub rsp, 4
stmxcsr [rsp]
or dword [rsp], 1<<17
......
......@@ -27,13 +27,11 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
pw_32: times 8 dw 32
hsub_mul: times 8 db 1, -1
SECTION .text
cextern pw_32
cextern hsub_mul
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
......@@ -188,10 +186,10 @@ dct8_mmx:
%endmacro
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3
global x264_sub8x8_dct8_mmx.skip_prologue
cglobal sub8x8_dct8_mmx, 3,3
global sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
call load_diff_4x8_mmx
......@@ -254,10 +252,10 @@ idct8_mmx:
%endmacro
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_mmx, 2,2
global x264_add8x8_idct8_mmx.skip_prologue
cglobal add8x8_idct8_mmx, 2,2
global add8x8_idct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
add word [r1], 32
......@@ -344,9 +342,9 @@ global x264_add8x8_idct8_mmx.skip_prologue
INIT_XMM
%macro DCT_SUB8 1
cglobal x264_sub8x8_dct_%1, 3,3
cglobal sub8x8_dct_%1, 3,3
add r2, 4*FDEC_STRIDE
global x264_sub8x8_dct_%1.skip_prologue
global sub8x8_dct_%1.skip_prologue
.skip_prologue:
%ifnidn %1, sse2
mova m7, [hsub_mul]
......@@ -375,11 +373,11 @@ global x264_sub8x8_dct_%1.skip_prologue
ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_%1, 3,3
cglobal sub8x8_dct8_%1, 3,3
add r2, 4*FDEC_STRIDE
global x264_sub8x8_dct8_%1.skip_prologue
global sub8x8_dct8_%1.skip_prologue
.skip_prologue:
%ifidn %1, sse2
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
......@@ -419,11 +417,11 @@ DCT_SUB8 sse2
DCT_SUB8 ssse3
;-----------------------------------------------------------------------------
; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct_sse2, 2,2
cglobal add8x8_idct_sse2, 2,2
add r0, 4*FDEC_STRIDE
global x264_add8x8_idct_sse2.skip_prologue
global add8x8_idct_sse2.skip_prologue
.skip_prologue:
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
SBUTTERFLY qdq, 0, 1, 4
......@@ -456,11 +454,11 @@ global x264_add8x8_idct_sse2.skip_prologue
ret
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2
cglobal add8x8_idct8_sse2, 2,2
add r0, 4*FDEC_STRIDE
global x264_add8x8_idct8_sse2.skip_prologue
global add8x8_idct8_sse2.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
......
......@@ -26,11 +26,10 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
pw_32: times 8 dw 32
hsub_mul: times 8 db 1, -1
SECTION .text
cextern pw_32
cextern hsub_mul
INIT_XMM
%macro DCT8_1D 10
......@@ -140,7 +139,7 @@ INIT_XMM
%endmacro
%macro DCT_SUB8 1
cglobal x264_sub8x8_dct_%1, 3,3,11
cglobal sub8x8_dct_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
mova m7, [hsub_mul]
......@@ -149,7 +148,7 @@ cglobal x264_sub8x8_dct_%1, 3,3,11
call .skip_prologue
RET
%endif
global x264_sub8x8_dct_%1.skip_prologue
global sub8x8_dct_%1.skip_prologue
.skip_prologue:
SWAP 7, 9
LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
......@@ -165,9 +164,9 @@ global x264_sub8x8_dct_%1.skip_prologue
ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_%1, 3,3,11
cglobal sub8x8_dct8_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
mova m7, [hsub_mul]
......@@ -176,7 +175,7 @@ cglobal x264_sub8x8_dct8_%1, 3,3,11
call .skip_prologue
RET
%endif
global x264_sub8x8_dct8_%1.skip_prologue
global sub8x8_dct8_%1.skip_prologue
.skip_prologue:
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
......@@ -205,16 +204,16 @@ DCT_SUB8 sse2
DCT_SUB8 ssse3
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2,11
cglobal add8x8_idct8_sse2, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
global x264_add8x8_idct8_sse2.skip_prologue
global add8x8_idct8_sse2.skip_prologue
.skip_prologue:
SWAP 7, 9
movdqa m0, [r1+0x00]
......@@ -237,16 +236,16 @@ global x264_add8x8_idct8_sse2.skip_prologue
ret
;-----------------------------------------------------------------------------
; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct_sse2, 2,2,11
cglobal add8x8_idct_sse2, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
global x264_add8x8_idct_sse2.skip_prologue
global add8x8_idct_sse2.skip_prologue
.skip_prologue:
SWAP 7, 9
mova m0, [r1+ 0]
......
......@@ -35,12 +35,6 @@
%endmacro
SECTION_RODATA
pw_32_0: times 4 dw 32
times 4 dw 0
pw_32: times 8 dw 32
pw_8000: times 8 dw 0x8000
hsub_mul: times 8 db 1, -1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
......@@ -48,11 +42,16 @@ pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pb_1: times 16 db 1
pw_1: times 8 dw 1
SECTION .text
cextern pw_32_0
cextern pw_32
cextern pw_8000
cextern hsub_mul
cextern pb_1
cextern pw_1
%macro WALSH4_1D 5
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
......@@ -73,9 +72,9 @@ SECTION .text
INIT_MMX
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
; void dct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx, 1,1
cglobal dct4x4dc_mmx, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
......@@ -95,9 +94,9 @@ cglobal x264_dct4x4dc_mmx, 1,1
RET
;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx, 1,1
cglobal idct4x4dc_mmx, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
......@@ -113,9 +112,9 @@ cglobal x264_idct4x4dc_mmx, 1,1
%macro SUB_DCT4 1
;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_%1, 3,3
cglobal sub4x4_dct_%1, 3,3
%ifidn %1, mmx
.skip_prologue:
LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
......@@ -140,9 +139,9 @@ SUB_DCT4 mmx
SUB_DCT4 ssse3
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx, 2,2
cglobal add4x4_idct_mmx, 2,2
pxor m7, m7
.skip_prologue:
movq m1, [r1+ 8]
......@@ -160,7 +159,7 @@ cglobal x264_add4x4_idct_mmx, 2,2
RET
INIT_XMM
cglobal x264_add4x4_idct_sse4, 2,2,6
cglobal add4x4_idct_sse4, 2,2,6
mova m0, [r1+0x00] ; row1/row0
mova m2, [r1+0x10] ; row3/row2
mova m1, m0 ; row1/row0
......@@ -213,7 +212,7 @@ cglobal x264_add4x4_idct_sse4, 2,2,6
INIT_MMX
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1, 3,3,11
......@@ -249,7 +248,7 @@ cglobal %1, 3,3,11
%endmacro
;-----------------------------------------------------------------------------
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
cglobal %1, 2,2,11
......@@ -280,33 +279,33 @@ cglobal %1, 2,2,11
%endmacro
%ifndef ARCH_X86_64
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
%endif
INIT_XMM
cextern x264_sub8x8_dct_sse2.skip_prologue
cextern x264_sub8x8_dct_ssse3.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
SUB_NxN_DCT x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
cextern x264_add8x8_idct_sse2.skip_prologue
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
cextern add8x8_idct_sse2.skip_prologue
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
cextern x264_sub8x8_dct8_sse2.skip_prologue
cextern x264_add8x8_idct8_sse2.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
cextern sub8x8_dct8_sse2.skip_prologue
cextern add8x8_idct8_sse2.skip_prologue
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
cextern x264_sub8x8_dct8_ssse3.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
cextern sub8x8_dct8_ssse3.skip_prologue
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
;-----------------------------------------------------------------------------
......@@ -331,7 +330,7 @@ SUB_NxN_DCT x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 1
movq [%3+FDEC_STRIDE*3], %1
%endmacro
cglobal x264_add8x8_idct_dc_mmx, 2,2
cglobal add8x8_idct_dc_mmx, 2,2
movq mm0, [r1]
pxor mm1, mm1
add r0, FDEC_STRIDE*4
......@@ -350,7 +349,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
ADD_DC mm2, mm3, r0
RET
cglobal x264_add8x8_idct_dc_ssse3, 2,2
cglobal add8x8_idct_dc_ssse3, 2,2
movq xmm0, [r1]
pxor xmm1, xmm1
add r0, FDEC_STRIDE*4
......@@ -388,7 +387,7 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
movhps [r0+FDEC_STRIDE* 3], xmm5
RET
cglobal x264_add16x16_idct_dc_mmx, 2,3
cglobal add16x16_idct_dc_mmx, 2,3
mov r2, 4
.loop:
movq mm0, [r1]
......@@ -431,7 +430,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
cglobal x264_add16x16_idct_dc_sse2, 2,2,8
cglobal add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
......@@ -465,7 +464,7 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2,8
IDCT_DC_STORE 0, xmm2, xmm3
ret
cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
cglobal add16x16_idct_dc_ssse3, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
......@@ -531,7 +530,7 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
%endmacro
INIT_MMX
cglobal x264_sub8x8_dct_dc_mmxext, 3,3
cglobal sub8x8_dct_dc_mmxext, 3,3
DCTDC_2ROW_MMX m0, m4, 0
DCTDC_2ROW_MMX m5, m6, 2
paddw m0, m5
......@@ -567,7 +566,7 @@ INIT_XMM
%endif
%endmacro
cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
cglobal sub8x8_dct_dc_sse2, 3,3,8
pxor m7, m7
DCTDC_2ROW_SSE2 0, 0, m4
DCTDC_2ROW_SSE2 2, 1, m4
......@@ -586,10 +585,10 @@ cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
RET
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 1
cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
cglobal zigzag_scan_8x8_frame_%1, 2,2,8
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdq2q mm0, xmm0
......@@ -703,9 +702,9 @@ SCAN_8x8 sse2
SCAN_8x8 ssse3
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
cglobal zigzag_scan_8x8_frame_mmxext, 2,2
movq mm0, [r1]
movq mm1, [r1+2*8]
movq mm2, [r1+2*14]
......@@ -798,9 +797,9 @@ cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
RET
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
cglobal zigzag_scan_4x4_frame_mmx, 2,2
movq mm0, [r1]
movq mm1, [r1+8]
movq mm2, [r1+16]
......@@ -828,9 +827,9 @@ cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
RET
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
cglobal zigzag_scan_4x4_frame_ssse3, 2,2
movdqa xmm1, [r1+16]
movdqa xmm0, [r1]
pshufb xmm1, [pb_scan4frameb]
......@@ -845,10 +844,10 @@ cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
RET
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
cglobal zigzag_scan_4x4_field_mmxext, 2,3
pshufw mm0, [r1+4], 0xd2
movq mm1, [r1+16]
movq mm2, [r1+24]
......@@ -862,7 +861,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
RET
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[8][8] )
; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Output order:
......@@ -875,7 +874,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
; 45 46 47 51 56 57 52 53
; 54 55 58 59 60 61 62 63
cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
cglobal zigzag_scan_8x8_field_mmxext, 2,3
movq mm0, [r1+2*0] ; 03 02 01 00
movq mm1, [r1+2*4] ; 07 06 05 04
movq mm2, [r1+2*8] ; 11 10 09 08
......@@ -954,13 +953,13 @@ cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
RET
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
%macro ZIGZAG_SUB_4x4 2
%ifidn %1, ac
cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8
cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
%else
cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8