Commit 07483f72 authored by Henrik Gramner

x86: AVX-512 mbtree_propagate_list

Uses gathers and scatters in combination with conflict detection to
vectorize the scalar part.

Also improve the checkasm test to try different mb_y values and check
for out-of-bounds writes.
parent 1a88481b
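
For context, the scalar part being vectorized distributes each macroblock's propagate cost bilinearly over the (up to) four lowres macroblocks its motion vector points into. A rough C model of that loop, reconstructed from the comments in the assembly below (a sketch only; the function name, the rounding derivation and the exact bounds/saturation handling are inferred, not the literal C reference):

#include <stdint.h>

/* Scalar sketch of mbtree list propagation, reconstructed from the
 * asm comments below. Takes the same packed arguments as the
 * _internal function (list_mask, pre-shifted bipred_weight, etc.). */
static void propagate_list_scalar( uint16_t *ref_costs, int16_t (*mvs)[2],
                                   int16_t *propagate_amount, uint16_t *lowres_costs,
                                   int bipred_weight, int mb_y, int len,
                                   int width, int height, int stride, int list_mask )
{
    for( int i = 0; i < len; i++ )
    {
        if( !(lowres_costs[i] & list_mask) )
            continue;                     /* MB doesn't reference this list */
        int amount = propagate_amount[i];
        if( (lowres_costs[i] & 0xc000) == 0xc000 )   /* lists_used == 3 */
            amount = (amount * bipred_weight + 32) >> 6;
        int x = mvs[i][0], y = mvs[i][1];
        int mbx = (x >> 5) + i;           /* i stands in for h->mb.i_mb_x */
        int mby = (y >> 5) + mb_y;
        x &= 31;                          /* fractional position within the MB */
        y &= 31;
        /* Bilinear weights, summing to 32*32 = 1024. */
        int w[4] = { (32-x)*(32-y), x*(32-y), (32-x)*y, x*y };
        for( int j = 0; j < 4; j++ )
        {
            int tx = mbx + (j&1), ty = mby + (j>>1);
            if( (unsigned)tx < (unsigned)width && (unsigned)ty < (unsigned)height )
            {
                /* Rounding derived from the pmulhrsw usage in the kernel. */
                int v = ref_costs[ty*stride + tx] + ((amount * w[j] + 512) >> 10);
                ref_costs[ty*stride + tx] = v > 32767 ? 32767 : v; /* cf. paddsw */
            }
        }
    }
}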
@@ -61,6 +61,7 @@ filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
mbtree_prop_list_avx512_shuf: dw 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7
mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
@@ -2432,6 +2433,112 @@ cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8
jl .loop
RET
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
; uint16_t *lowres_costs, int bipred_weight, int mb_y,
; int width, int height, int stride, int list_mask );
;-----------------------------------------------------------------------------
INIT_ZMM avx512
cglobal mbtree_propagate_list_internal, 5,7,21
mova xm16, [pw_0xc000]
vpbroadcastw xm17, r5m ; bipred_weight << 9
vpbroadcastw ym18, r10m ; 1 << (list+LOWRES_COST_SHIFT)
vbroadcasti32x8 m5, [mbtree_prop_list_avx512_shuf]
vbroadcasti32x8 m6, [pd_0123]
vpord m6, r6m {1to16} ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y
vbroadcasti128 m7, [pd_8]
vbroadcasti128 m8, [pw_31]
vbroadcasti128 m9, [pw_32]
psllw m10, m9, 4
pcmpeqw ym19, ym19 ; pw_m1
vpbroadcastw ym20, r7m ; width
psrld m11, m7, 3 ; pd_1
psrld m12, m8, 16 ; pd_31
vpbroadcastd m13, r8m ; height
vpbroadcastd m14, r9m ; stride
pslld m15, m14, 16
por m15, m11 ; {1, stride, 1, stride} ...
lea r4, [r4+2*r0] ; lowres_costs
lea r3, [r3+2*r0] ; propagate_amount
lea r2, [r2+4*r0] ; mvs
neg r0
mov r6d, 0x5555ffff
kmovd k4, r6d
kshiftrd k5, k4, 16 ; 0x5555
kshiftlw k6, k4, 8 ; 0xff00
.loop:
vbroadcasti128 ym1, [r4+2*r0]
mova xm4, [r3+2*r0]
vpcmpuw k1, xm1, xm16, 5 ; if (lists_used == 3)
vpmulhrsw xm4 {k1}, xm17 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
vptestmw k1, ym1, ym18
vpermw m4, m5, m4
vbroadcasti32x8 m3, [r2+4*r0] ; {mvx, mvy}
psraw m0, m3, 5
paddw m0, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
paddd m6, m7 ; i_mb_x += 8
pand m3, m8 ; {x, y}
vprold m1, m3, 20 ; {y, x} << 4
psubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y}
psubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4
pmullw m3, m1
paddsw m3, m3 ; prevent signed overflow in idx0 (32*32<<5 == 0x8000)
pmulhrsw m2, m3, m4 ; idx01weight idx23weightp
pslld ym1, ym0, 16
psubw ym1, ym19
vmovdqu16 ym1 {k5}, ym0
vpcmpuw k2, ym1, ym20, 1 ; {mbx, mbx+1} < width
kunpckwd k2, k2, k2
psrad m1, m0, 16
paddd m1 {k6}, m11
vpcmpud k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height
pmaddwd m0, m15
paddd m0 {k6}, m14 ; idx0 | idx2
vmovdqu16 m2 {k2}{z}, m2 ; idx01weight | idx23weight
vptestmd k1 {k1}, m2, m2 ; mask out offsets with no changes
; We're handling dwords, but the offsets are in words so there may be partial overlaps.
; We can work around this by handling dword-aligned and -unaligned offsets separately.
vptestmd k0, m0, m11
kandnw k2, k0, k1 ; dword-aligned offsets
kmovw k3, k2
vpgatherdd m3 {k2}, [r1+2*m0]
; If there are conflicts in the offsets we have to handle them before storing the results.
; By creating a permutation index using vplzcntd we can resolve all conflicts in parallel
; in ceil(log2(n)) iterations where n is the largest number of duplicate offsets.
vpconflictd m4, m0
vpbroadcastmw2d m1, k1
vptestmd k2, m1, m4
ktestw k2, k2
jz .no_conflicts
pand m1, m4 ; mask away unused offsets to avoid false positives
vplzcntd m1, m1
pxor m1, m12 ; lzcnt gives us the distance from the msb, we want it from the lsb
.conflict_loop:
vpermd m4 {k2}{z}, m1, m2
vpermd m1 {k2}, m1, m1 ; shift the index one step forward
paddsw m2, m4 ; add the weights of conflicting offsets
vpcmpd k2, m1, m12, 2
ktestw k2, k2
jnz .conflict_loop
.no_conflicts:
paddsw m3, m2
vpscatterdd [r1+2*m0] {k3}, m3
kandw k1, k0, k1 ; dword-unaligned offsets
kmovw k2, k1
vpgatherdd m1 {k1}, [r1+2*m0]
paddsw m1, m2 ; all conflicts have already been resolved
vpscatterdd [r1+2*m0] {k2}, m1
add r0, 8
jl .loop
RET
%endif
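
The conflict handling is the interesting part. vpconflictd gives each lane a bitmask of lower-numbered lanes holding the same offset, and vplzcntd XORed with 31 converts that into the index of the nearest such predecessor. Each pass of the loop then pointer-jumps one step up the chain while accumulating, so the distance covered doubles every iteration. A scalar model of the idea, assuming one 16-bit weight per lane and ignoring the lane masking (illustrative, not the kernel itself):

#include <stdint.h>
#include <string.h>

#define LANES 16

static int16_t add_sat16( int a, int b )  /* paddsw semantics */
{
    int s = a + b;
    return s > 32767 ? 32767 : s < -32768 ? -32768 : s;
}

/* Sums the weights of lanes that share a scatter offset, so the last
 * lane of each group of duplicates ends up holding the group total.
 * The scatter then writes overlapping addresses in ascending lane
 * order, so the last write (the full sum) wins. */
static void resolve_conflicts( const uint32_t off[LANES], int16_t weight[LANES] )
{
    int perm[LANES];
    for( int i = 0; i < LANES; i++ )      /* vpconflictd + vplzcntd^31 */
    {
        perm[i] = -1;                     /* sentinel: no earlier duplicate */
        for( int j = 0; j < i; j++ )
            if( off[j] == off[i] )
                perm[i] = j;              /* closest preceding duplicate */
    }
    int active = 1;
    while( active )                       /* ceil(log2(n)) iterations */
    {
        int16_t snap[LANES];
        int next[LANES];
        memcpy( snap, weight, sizeof(snap) );
        memcpy( next, perm, sizeof(next) );
        active = 0;
        for( int i = 0; i < LANES; i++ )
            if( perm[i] >= 0 )
            {
                weight[i] = add_sat16( weight[i], snap[perm[i]] ); /* paddsw */
                next[i] = perm[perm[i]];  /* jump one step up the chain */
                active |= next[i] >= 0;
            }
        memcpy( perm, next, sizeof(perm) );
    }
}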
%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
......
@@ -553,6 +553,21 @@ PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)
PROPAGATE_LIST(avx2)
#if ARCH_X86_64
void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
uint16_t *lowres_costs, int bipred_weight, int mb_y,
int width, int height, int stride, int list_mask );
static void x264_mbtree_propagate_list_avx512( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
int16_t *propagate_amount, uint16_t *lowres_costs,
int bipred_weight, int mb_y, int len, int list )
{
x264_mbtree_propagate_list_internal_avx512( len, ref_costs, mvs, propagate_amount, lowres_costs, bipred_weight << 9,
mb_y << 16, h->mb.i_mb_width, h->mb.i_mb_height, h->mb.i_mb_stride,
(1 << LOWRES_COST_SHIFT) << list );
}
#endif
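
The packed parameters exist so the kernel can consume them directly: bipred_weight << 9 lets a single vpmulhrsw, which computes (a*b + 0x4000) >> 15, evaluate (propagate_amount*bipred_weight + 32) >> 6 exactly, since (a*(w<<9) + 2^14) >> 15 == (a*w + 32) >> 6; mb_y << 16 places the row index in the high word of each dword, so one vpord against the {0,1,...,7} constant yields the packed {mbx, mby} start coordinates; and (1 << LOWRES_COST_SHIFT) << list is the single bit that vptestmw tests in lowres_costs to select macroblocks referencing the current list.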
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
@@ -880,4 +895,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->memcpy_aligned = x264_memcpy_aligned_avx512;
pf->memzero_aligned = x264_memzero_aligned_avx512;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
#if ARCH_X86_64
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx512;
#endif
}
@@ -1765,15 +1765,16 @@ static int check_mc( int cpu_ref, int cpu_new )
h.mb.i_mb_width = width;
h.mb.i_mb_height = height;
-            uint16_t *ref_costsc = (uint16_t*)buf3;
-            uint16_t *ref_costsa = (uint16_t*)buf4;
-            int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size);
+            uint16_t *ref_costsc = (uint16_t*)buf3 + width;
+            uint16_t *ref_costsa = (uint16_t*)buf4 + width;
+            int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + width + size);
int16_t *propagate_amount = (int16_t*)(mvs + width);
uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
-            h.scratch_buffer2 = (uint8_t*)(ref_costsa + size);
+            h.scratch_buffer2 = (uint8_t*)(ref_costsa + width + size);
int bipred_weight = (rand()%63)+1;
int mb_y = rand()&3;
int list = i&1;
-            for( int j = 0; j < size; j++ )
+            for( int j = -width; j < size+width; j++ )
ref_costsc[j] = ref_costsa[j] = rand()&32767;
for( int j = 0; j < width; j++ )
{
......@@ -1784,18 +1785,18 @@ static int check_mc( int cpu_ref, int cpu_new )
lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
}
-            call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
-            call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+            call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
+            call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
-            for( int j = 0; j < size && ok; j++ )
+            for( int j = -width; j < size+width && ok; j++ )
{
ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
if( !ok )
fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
}
-            call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
-            call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+            call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
+            call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
}
}
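
The reworked test surrounds the ref_costs buffers with width guard entries on each side, initializes the guards together with the payload, and extends the comparison loop across them, so any out-of-bounds write by the assembly corrupts a guard value and fails the check. Passing a random nonzero mb_y additionally exercises the row offsets that the old test always left at zero.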
......