Forked from
VideoLAN / medialibrary
332 commits behind, 30 commits ahead of the upstream repository.
-
(cherry picked from commit 75a92a2e)
1981c4a0
Strings.cpp 5.99 KiB
/*****************************************************************************
* Media Library
*****************************************************************************
* Copyright (C) 2018-2019 Hugo Beauzée-Luyssen, Videolabs, VideoLAN
*
* Authors: Hugo Beauzée-Luyssen <hugo@beauzee.fr>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "Strings.h"
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdint>
namespace medialibrary
{
namespace utils
{
namespace str
{
/**
 * @brief Strips leading and trailing whitespace from a string.
 *
 * A byte is considered whitespace when std::isspace says so.
 *
 * @param value The string to trim. It is taken by value and edited in place.
 * @return The trimmed string; empty when the input was empty or contained
 *         nothing but whitespace.
 */
std::string trim( std::string value )
{
    /*
     * The <cctype> classification functions have undefined behavior when
     * handed a negative value other than EOF, which is exactly what a plain
     * (signed) char holds for any non-ASCII byte. Always convert to
     * unsigned char before calling them.
     */
    const auto notSpace = []( char c ) {
        return std::isspace( static_cast<unsigned char>( c ) ) == 0;
    };
    /* Drop everything before the first non-space byte... */
    value.erase( value.begin(),
                 std::find_if( value.begin(), value.end(), notSpace ) );
    /* ...then everything after the last one. */
    value.erase( std::find_if( value.rbegin(), value.rend(), notSpace ).base(),
                 value.end() );
    return value;
}
namespace utf8
{
/**
 * @brief Counts the characters of a UTF-8 encoded string.
 *
 * A byte with its top bit clear counts as one character. Any other byte is
 * treated as the start of a multi-byte sequence: one continuation byte is
 * expected for each additional leading 1 bit after the first one. The lead
 * byte itself is not validated any further.
 *
 * @param value The UTF-8 encoded string to inspect.
 * @return The number of characters, or 0 if the string is empty, a sequence
 *         is truncated, or an expected continuation byte is not of the
 *         form 10xxxxxx.
 */
size_t nbChars( const std::string& value )
{
    const size_t size = value.size();
    /* Index and counter are size_t so they cannot wrap before reaching size */
    size_t count = 0;
    size_t i = 0;
    while ( i < size )
    {
        /* Work on an unsigned copy: plain char may be signed */
        uint8_t lead = static_cast<uint8_t>( value[i] );
        ++i;
        if ( ( lead & 0x80 ) != 0 )
        {
            /*
             * Discard the first leading bit, then consume one continuation
             * byte for each remaining leading bit set to 1.
             */
            for ( lead <<= 1; ( lead & 0x80 ) != 0; lead <<= 1 )
            {
                if ( i >= size )
                    return 0;
                const uint8_t next = static_cast<uint8_t>( value[i] );
                /*
                 * A continuation byte is 10xxxxxx. Checking the top bit
                 * alone would also accept a lead byte (11xxxxxx).
                 */
                if ( ( next & 0xC0 ) != 0x80 )
                    return 0;
                ++i;
            }
        }
        ++count;
    }
    return count;
}
/**
 * @brief Computes how many bytes the next characters of a UTF-8 string use.
 *
 * Starting at byte @p offset, walks over at most @p nbChars characters and
 * returns the number of bytes they occupy. The walk stops early when the end
 * of the string is reached.
 *
 * @param input   The UTF-8 encoded string.
 * @param offset  Byte index at which to start; expected to point at the
 *                first byte of a character.
 * @param nbChars Maximum number of characters to measure.
 * @return The number of bytes covered, or 0 if @p offset is past the end of
 *         the string, a multi-byte sequence is truncated, or an expected
 *         continuation byte is not of the form 10xxxxxx.
 */
size_t nbBytes( const std::string& input, size_t offset, size_t nbChars )
{
    const size_t size = input.size();
    if ( offset >= size )
        return 0;
    size_t pos = offset;
    for ( ; nbChars > 0 && pos < size; --nbChars )
    {
        /* Work on an unsigned copy: plain char may be signed */
        uint8_t lead = static_cast<uint8_t>( input[pos] );
        ++pos;
        if ( ( lead & 0x80 ) == 0 )
            continue;
        /*
         * Discard the first leading bit, then consume one continuation byte
         * for each remaining leading bit set to 1.
         */
        for ( lead <<= 1; ( lead & 0x80 ) != 0; lead <<= 1 )
        {
            if ( pos >= size )
                return 0;
            /*
             * A continuation byte is 10xxxxxx. Checking the top bit alone
             * would also accept a lead byte (11xxxxxx).
             */
            if ( ( static_cast<uint8_t>( input[pos] ) & 0xC0 ) != 0x80 )
                return 0;
            ++pos;
        }
    }
    /* Every byte walked over belongs to one of the measured characters */
    return pos - offset;
}
/**
 * @brief Extracts the longest common run of characters shared by two UTF-8
 *        strings, starting at the given byte offsets.
 *
 * ASCII characters are compared case-insensitively. Multi-byte characters
 * are compared byte for byte, and a character is only accepted when all of
 * its bytes are present and identical in both strings, so the result never
 * ends with a partial code point.
 *
 * @param lhs            First string; the returned pattern is copied from it.
 * @param lhsOffset      Byte offset in @p lhs where the comparison starts.
 * @param rhs            Second string.
 * @param rhsOffset      Byte offset in @p rhs where the comparison starts.
 * @param minPatternSize Minimum number of characters (not bytes) the common
 *                       run must contain.
 * @return The common run as it appears in @p lhs, or an empty string when
 *         nothing matched or the run is shorter than @p minPatternSize.
 */
std::string commonPattern( const std::string& lhs, size_t lhsOffset,
                           const std::string& rhs, size_t rhsOffset,
                           size_t minPatternSize )
{
    size_t lhsIdx = lhsOffset;
    size_t rhsIdx = rhsOffset;
    size_t patternSize = 0;
    while ( lhsIdx < lhs.size() && rhsIdx < rhs.size() )
    {
        /*
         * Work with unsigned bytes, both for the bit operations and for the
         * comparisons: plain char may be signed.
         */
        const uint8_t lhsLead = static_cast<uint8_t>( lhs[lhsIdx] );
        const uint8_t rhsLead = static_cast<uint8_t>( rhs[rhsIdx] );

        /* For ASCII, we can do a case-insensitive comparison */
        if ( ( lhsLead & 0x80 ) == 0 && ( rhsLead & 0x80 ) == 0 )
        {
            if ( std::tolower( lhsLead ) != std::tolower( rhsLead ) )
                break;
            ++lhsIdx;
            ++rhsIdx;
            ++patternSize;
            continue;
        }

        if ( lhsLead != rhsLead )
            break;

        /*
         * Both lead bytes are identical: walk over one continuation byte for
         * each leading 1 bit after the first, checking that it exists and is
         * the same on both sides.
         */
        size_t seqLen = 1;
        bool complete = true;
        for ( uint8_t bits = static_cast<uint8_t>( lhsLead << 1 );
              ( bits & 0x80 ) != 0; bits <<= 1 )
        {
            if ( lhsIdx + seqLen >= lhs.size() ||
                 rhsIdx + seqLen >= rhs.size() )
            {
                /* Reached the end of an input in the middle of a character */
                complete = false;
                break;
            }
            const uint8_t next = static_cast<uint8_t>( lhs[lhsIdx + seqLen] );
            /*
             * A continuation byte is 10xxxxxx. Checking the top bit alone
             * would also accept a lead byte (11xxxxxx).
             */
            if ( ( next & 0xC0 ) != 0x80 ||
                 next != static_cast<uint8_t>( rhs[rhsIdx + seqLen] ) )
            {
                complete = false;
                break;
            }
            ++seqLen;
        }
        /*
         * Either the sequence is invalid or one of its bytes differs. In any
         * case, we don't want to return this partial code point.
         */
        if ( complete == false )
            break;
        lhsIdx += seqLen;
        rhsIdx += seqLen;
        ++patternSize;
    }
    /*
     * When nothing matched, return early instead of calling substr(): with
     * lhsOffset greater than lhs.size() it would throw std::out_of_range.
     */
    if ( patternSize < minPatternSize || lhsIdx == lhsOffset )
        return {};
    return lhs.substr( lhsOffset, lhsIdx - lhsOffset );
}
}
}
}
}