Forked from
VideoLAN / medialibrary
332 commits behind, 30 commits ahead of the upstream repository.
(cherry picked from commit 75a92a2e)
Strings.cpp 5.99 KiB
* Media Library
* Copyright (C) 2018-2019 Hugo Beauzée-Luyssen, Videolabs, VideoLAN
* Authors: Hugo Beauzée-Luyssen <>
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
# include "config.h"
#include "Strings.h"
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdint>
namespace medialibrary
namespace utils
namespace str
/**
 * @brief Strips leading and trailing whitespace from a string.
 *
 * Whitespace is whatever std::isspace reports for the current C locale.
 *
 * @param value The string to trim; taken by value and edited in place.
 * @return The trimmed string. A string made only of whitespace (or an empty
 *         string) yields an empty string.
 */
std::string trim( std::string value )
{
    /*
     * The <cctype> classifiers require a value representable as unsigned char
     * (or EOF). Plain char may be signed, so bytes >= 0x80 (e.g. UTF-8
     * sequences) must be converted before being handed to isspace.
     */
    const auto isNotSpace = []( char c ) {
        return std::isspace( static_cast<unsigned char>( c ) ) == 0;
    };
    /* Drop everything before the first non-whitespace character */
    value.erase( value.begin(),
                 std::find_if( value.begin(), value.end(), isNotSpace ) );
    /*
     * Search backward for the last non-whitespace character; base() converts
     * the reverse iterator to the forward iterator one past that character.
     */
    value.erase( std::find_if( value.rbegin(), value.rend(), isNotSpace ).base(),
                 value.end() );
    return value;
}
namespace utf8
/**
 * @brief Counts the number of characters in a UTF-8 encoded string.
 *
 * A byte with its high bit clear counts as one character. Otherwise the byte
 * is treated as a leading byte, and one continuation byte is expected for
 * each additional leading bit set to 1.
 *
 * @param value The UTF-8 encoded input.
 * @return The number of characters, or 0 if the input is empty, is truncated
 *         in the middle of a multi-byte sequence, or contains an expected
 *         continuation byte whose high bit is not set.
 */
size_t nbChars( const std::string& value )
{
    size_t count = 0;
    for ( size_t i = 0; i < value.size(); )
    {
        if ( ( value[i] & 0x80 ) == 0 )
        {
            /* Single byte (ASCII) character */
            ++i;
        }
        else
        {
            uint8_t c = value[i];
            /*
             * Skip over the leading unicode byte, and skip an extra byte
             * for each leading bit set to 1 in the first byte
             */
            c <<= 1;
            ++i;
            while ( ( c & 0x80 ) != 0 )
            {
                /* The sequence announces more bytes than the input holds */
                if ( i >= value.size() )
                    return 0;
                uint8_t nextByte = value[i];
                /* Every continuation byte must have its high bit set */
                if ( ( nextByte & 0x80 ) != 0x80 )
                    return 0;
                ++i;
                c <<= 1;
            }
        }
        ++count;
    }
    return count;
}
/**
 * @brief Computes how many bytes are used by a number of UTF-8 characters.
 *
 * @param input   The UTF-8 encoded input.
 * @param offset  Byte offset in input at which to start counting.
 * @param nbChars Maximum number of characters to measure.
 * @return The number of bytes spanned by up to nbChars characters starting at
 *         offset. If the input ends before nbChars characters were read, the
 *         bytes consumed so far are returned. Returns 0 if offset is past the
 *         end of the input, if a multi-byte sequence is truncated, or if an
 *         expected continuation byte does not have its high bit set.
 */
size_t nbBytes( const std::string& input, size_t offset, size_t nbChars )
{
    if ( offset >= input.size() )
        return 0;
    size_t i = offset;
    size_t res = 0;
    while ( nbChars > 0 && i < input.size() )
    {
        uint8_t c = input[i];
        uint8_t currentCharNbBytes = 1;
        if ( ( c & 0x80 ) != 0 )
        {
            /*
             * Multi-byte character: consume one continuation byte for each
             * additional leading bit set to 1 in the first byte
             */
            c <<= 1;
            ++i;
            while ( ( c & 0x80 ) != 0 )
            {
                /* The sequence announces more bytes than the input holds */
                if ( i >= input.size() )
                    return 0;
                uint8_t nextByte = input[i];
                /* Every continuation byte must have its high bit set */
                if ( ( nextByte & 0x80 ) != 0x80 )
                    return 0;
                ++i;
                ++currentCharNbBytes;
                c <<= 1;
            }
        }
        else
        {
            /* Single byte (ASCII) character */
            ++i;
        }
        res += currentCharNbBytes;
        --nbChars;
    }
    return res;
}
/**
 * @brief Extracts the prefix shared by two UTF-8 strings, each read from its
 *        own byte offset.
 *
 * ASCII characters are compared case-insensitively; multi-byte characters
 * must match byte for byte. The comparison stops at the first mismatch, at
 * the end of either string, or at an invalid/truncated multi-byte sequence,
 * so the result never ends with a partial code point.
 *
 * @param lhs            First string; the returned pattern is taken from it.
 * @param lhsOffset      Byte offset in lhs at which the comparison starts.
 * @param rhs            Second string.
 * @param rhsOffset      Byte offset in rhs at which the comparison starts.
 * @param minPatternSize Minimum number of matching characters (not bytes)
 *                       required for a pattern to be returned.
 * @return The matching part of lhs (with the casing of lhs), or an empty
 *         string when fewer than minPatternSize characters match.
 */
std::string commonPattern( const std::string& lhs, size_t lhsOffset,
                           const std::string& rhs, size_t rhsOffset,
                           size_t minPatternSize )
{
    auto lhsIdx = lhsOffset;
    auto rhsIdx = rhsOffset;
    size_t patternSize = 0;
    while ( lhsIdx < lhs.size() && rhsIdx < rhs.size() )
    {
        /* For ascii, we can do case-insensitive comparison */
        if ( ( lhs[lhsIdx] & 0x80 ) == 0 &&
             ( rhs[rhsIdx] & 0x80 ) == 0 )
        {
            if ( std::tolower( static_cast<unsigned char>( lhs[lhsIdx] ) ) !=
                 std::tolower( static_cast<unsigned char>( rhs[rhsIdx] ) ) )
                break;
            ++lhsIdx;
            ++rhsIdx;
        }
        else
        {
            /*
             * We *need* to work with unsigned, be it for the bits operations
             * or the comparisons
             * Using std::string::operator[] without cast will return a signed char.
             */
            uint8_t lhsC = lhs[lhsIdx];
            if ( lhsC != static_cast<uint8_t>( rhs[rhsIdx] ) )
                break;
            lhsC <<= 1;
            uint8_t multiByteOffset = 1;
            while ( ( lhsC & 0x80 ) != 0 )
            {
                if ( lhsIdx + multiByteOffset >= lhs.size() ||
                     rhsIdx + multiByteOffset >= rhs.size() )
                {
                    /*
                     * In case something is invalid or we reached the end of an
                     * input string, we don't want to return a potentially
                     * partial codepoint, so we reset the offset and break out
                     */
                    multiByteOffset = 0;
                    break;
                }
                uint8_t nextByte = lhs[lhsIdx + multiByteOffset];
                if ( ( nextByte & 0x80 ) != 0x80 ||
                     nextByte != static_cast<uint8_t>( rhs[rhsIdx + multiByteOffset] ) )
                {
                    multiByteOffset = 0;
                    break;
                }
                ++multiByteOffset;
                lhsC <<= 1;
            }
            /*
             * If the multi byte offset has been reset to 0, it means we either
             * found an invalid codepoint, or a multi byte comparison failed
             * in any of its bytes. In any case, we don't want to return this
             * partial code point
             */
            if ( multiByteOffset == 0 )
                break;
            lhsIdx += multiByteOffset;
            rhsIdx += multiByteOffset;
        }
        ++patternSize;
    }
    if ( patternSize < minPatternSize )
        return {};
    return lhs.substr( lhsOffset, lhsIdx - lhsOffset );
}