Commit 90c07084 authored by François Cartegnie's avatar François Cartegnie 🤞
Browse files

packetizer: add SSE2 based AnnexB startcode helper

Improves even more from previous commit (by ~2x on 4K)
parent 953dd004
......@@ -20,15 +20,105 @@
#ifndef _STARTCODE_HELPER_H
#define _STARTCODE_HELPER_H 1
#include <vlc_cpu.h>
#if !defined(CAN_COMPILE_SSE2) && defined(HAVE_SSE2_INTRINSICS)
#include <emmintrin.h>
#endif
/* Efficiently looks up an AnnexB startcode 0x00 0x00 0x01
* by using a trick that is 4 times faster than a single byte lookup.
*
* That code is adapted from libav's ff_avc_find_startcode_internal. */
/* TRY_MATCH(p, a): checks the four candidate startcode positions
 * p+a, p+a+1, p+a+2 and p+a+3 for the byte sequence 0x00 0x00 0x01.
 * Bytes a+1 and a+3 are tested first, since each of the four candidate
 * positions has a zero at one of them. On a hit, the macro RETURNS from
 * the enclosing function with a pointer to the first byte of the
 * startcode, so it may only be used in functions returning
 * const uint8_t *.
 *
 * The macro reads bytes up to (p)[(a)+5]; the caller must guarantee they
 * are readable. Both arguments are evaluated several times, so they must
 * be free of side effects.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement
 * (safe in unbraced if/else); always terminate the invocation with ';'. */
#define TRY_MATCH(p,a) do {\
    if ((p)[(a)+1] == 0) {\
        if ((p)[(a)+0] == 0 && (p)[(a)+2] == 1)\
            return (p)+(a);\
        if ((p)[(a)+2] == 0 && (p)[(a)+3] == 1)\
            return (p)+(a)+1;\
    }\
    if ((p)[(a)+3] == 0) {\
        if ((p)[(a)+2] == 0 && (p)[(a)+4] == 1)\
            return (p)+(a)+2;\
        if ((p)[(a)+4] == 0 && (p)[(a)+5] == 1)\
            return (p)+(a)+3;\
    }\
} while (0)
#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
/**
 * Finds the first AnnexB startcode (0x00 0x00 0x01) in a buffer, scanning
 * 16 bytes at a time with SSE2 once the read pointer is 16-byte aligned.
 *
 * Only positions p .. end-4 are examined, i.e. a startcode is reported only
 * if at least one more byte follows it inside the buffer.
 *
 * \param p   first byte of the buffer
 * \param end one past the last byte of the buffer
 * \return pointer to the first byte of the startcode, or NULL if none was found
 */
__attribute__ ((__target__ ("sse2")))
static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p, const uint8_t *end )
{
    /* Nothing can match in fewer than 3 bytes; return before computing
     * end - 3, which would otherwise point before the buffer. */
    if( end - p < 3 )
        return NULL;
    end -= 3;

    /* Bytewise scan up to the next 16-byte boundary (a full 16 bytes when p
     * is already aligned). Skipping this step and doing unaligned loads
     * isn't faster. The length is clamped so that no pointer past the
     * scan limit is ever formed. */
    size_t head = 16 - ((uintptr_t)p & 15);
    if( head > (size_t)(end - p) )
        head = (size_t)(end - p);
    for( const uint8_t *headend = p + head; p < headend; p++ )
    {
        if( p[0] == 0 && p[1] == 0 && p[2] == 1 )
            return p;
    }
    if( p == end )
        return NULL;

    /* p is now 16-byte aligned: process whole 16-byte groups below end. */
    const uint8_t *alignedend = end - ((uintptr_t)end & 15);
    if( alignedend > p )
    {
#ifndef CAN_COMPILE_SSE2
        const __m128i zeros = _mm_set1_epi8( 0x00 );
#endif
        for( ; p < alignedend; p += 16 )
        {
            uint32_t match;
#ifdef CAN_COMPILE_SSE2
            /* xmm1 is zeroed inside the same asm statement that uses it:
             * register contents are not guaranteed to survive between
             * separate asm statements. The "memory" clobber tells the
             * compiler that the asm reads the buffer behind p. */
            __asm__ volatile(
                "pxor     %%xmm1, %%xmm1\n"
                "movdqa   0(%[v]), %%xmm0\n"
                "pcmpeqb  %%xmm1, %%xmm0\n"
                "pmovmskb %%xmm0, %[match]\n"
                : [match]"=r"(match)
                : [v]"r"(p)
                : "xmm0", "xmm1", "memory"
            );
#else
            __m128i v = _mm_load_si128( (const __m128i *)p );
            __m128i res = _mm_cmpeq_epi8( zeros, v );
            /* bit i of the mask is set when p[i] == 0 */
            match = (uint32_t)_mm_movemask_epi8( res );
#endif
            /* Only inspect the 4-byte groups that contain a zero byte.
             * TRY_MATCH returns from this function on a hit. */
            if( match & 0x000F )
            {
                TRY_MATCH(p, 0);
            }
            if( match & 0x00F0 )
            {
                TRY_MATCH(p, 4);
            }
            if( match & 0x0F00 )
            {
                TRY_MATCH(p, 8);
            }
            if( match & 0xF000 )
            {
                TRY_MATCH(p, 12);
            }
        }
    }

    /* Bytewise scan of the remaining tail. */
    for( ; p < end; p++ )
    {
        if( p[0] == 0 && p[1] == 0 && p[2] == 1 )
            return p;
    }
    return NULL;
}
#endif
/* That code is adapted from libav's ff_avc_find_startcode_internal
* and I believe the trick originated from
* https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
*/
static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end )
{
#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
if (vlc_CPU_SSE2())
return startcode_FindAnnexB_SSE2(p, end);
#endif
const uint8_t *a = p + 4 - ((intptr_t)p & 3);
for (end -= 3; p < a && p < end; p++) {
......@@ -41,18 +131,7 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
if ((x - 0x01010101) & (~x) & 0x80808080)
{
/* matching DW isn't faster */
if (p[1] == 0) {
if (p[0] == 0 && p[2] == 1)
return p;
if (p[2] == 0 && p[3] == 1)
return p+1;
}
if (p[3] == 0) {
if (p[2] == 0 && p[4] == 1)
return p+2;
if (p[4] == 0 && p[5] == 1)
return p+3;
}
TRY_MATCH(p, 0);
}
}
......@@ -64,4 +143,6 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
return NULL;
}
#undef TRY_MATCH
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment