Commit 29a164e9 authored by Hugo Beauzée-Luyssen
Browse files

utils: Add a title parser

It tries to extract the season & episode number to decide whether the
media is a tvshow or not
Fix #63
parent 77f558f2
Pipeline #5450 passed with stages
in 13 minutes
......@@ -106,6 +106,7 @@ libmedialibrary_la_SOURCES = \
src/utils/ModificationsNotifier.cpp \
src/utils/Strings.cpp \
src/utils/Url.cpp \
src/utils/TitleAnalyzer.cpp \
$(NULL)
......@@ -168,6 +169,7 @@ noinst_HEADERS = \
src/utils/Strings.h \
src/utils/SWMRLock.h \
src/utils/Url.h \
src/utils/TitleAnalyzer.h \
src/VideoTrack.h \
src/Metadata.h \
src/compat/Thread.h \
......@@ -322,6 +324,7 @@ unittest_SOURCES = \
test/unittest/MiscTests.cpp \
test/unittest/ThumbnailTests.cpp \
test/unittest/SubtitleTrackTests.cpp \
test/unittest/TitleAnalyzerTests.cpp \
$(NULL)
EXTRA_DIST += test/unittest/db_v3.sql
......
/*****************************************************************************
* Media Library
*****************************************************************************
* Copyright (C) 2019 Hugo Beauzée-Luyssen, Videolabs, VideoLAN
*
* Authors: Hugo Beauzée-Luyssen<hugo@beauzee.fr>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#include "TitleAnalyzer.h"
#include <cassert>
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
namespace medialibrary
{
namespace utils
{
namespace title
{

namespace
{

// The <cctype> classifiers have undefined behavior when given a negative value
// other than EOF, which is what a plain `char` holding a non-ASCII byte yields
// on platforms where char is signed. Always go through unsigned char.
bool isSpace( char c )
{
    return std::isspace( static_cast<unsigned char>( c ) ) != 0;
}

bool isDigit( char c )
{
    return std::isdigit( static_cast<unsigned char>( c ) ) != 0;
}

// Punctuation (whitespace excluded) that delimits tokens in a title
bool isSeparator( char c )
{
    switch ( c )
    {
        case '.':
        case ';':
        case '-':
        case '_':
        case ',':
        case '/':
        case '\\':
        case '[':
        case ']':
        case '(':
        case ')':
            return true;
        default:
            return false;
    }
}

// Either a whitespace or one of the punctuation separators
bool isAnySeparator( char c )
{
    return isSpace( c ) || isSeparator( c );
}

/**
 * @brief parseNumber Parses an unsigned base 10 number, skipping leading whitespaces
 * @param str   A nul terminated string to parse from
 * @param end   Set to the first character after the number upon success,
 *              and to str upon failure
 * @param value Receives the parsed number upon success, untouched otherwise
 * @return true if a number that fits in 32 bits was parsed.
 *
 * Contrary to a bare strtoul, this refuses an explicit sign (strtoul would
 * negate the value and wrap around) and refuses values that don't fit in a
 * uint32_t instead of silently truncating them where unsigned long is 64 bits.
 */
bool parseNumber( const char* str, const char*& end, uint32_t& value )
{
    end = str;
    const char* digits = str;
    while ( isSpace( *digits ) )
        ++digits;
    if ( isDigit( *digits ) == false )
        return false;
    char* numberEnd = nullptr;
    errno = 0;
    const auto parsed = strtoul( digits, &numberEnd, 10 );
    if ( errno != 0 || numberEnd == digits ||
         parsed > std::numeric_limits<uint32_t>::max() )
        return false;
    value = static_cast<uint32_t>( parsed );
    end = numberEnd;
    return true;
}

}

/**
 * Scans the title once, from left to right, looking for:
 *  - S<n> / Season<n> / B<n> / Book<n> for the season
 *  - E<n> for the episode, either after a separator or stuck to the season
 *  - <season>x<episode>
 *  - A number immediately following the season (Season 2/01)
 *  - An isolated <separator> <number> <separator>, kept as a fallback episode
 * Returns { success, season, episode }. The season is 0 when only an isolated
 * episode number was found.
 */
std::tuple<bool, uint32_t, uint32_t> analyze( const std::string& title )
{
    const char* const begin = title.c_str();
    uint32_t seasonId = 0u;
    auto seasonFound = false;
    const char* seasonEndPtr = nullptr;
    uint32_t episodeId = 0u;
    auto episodeFound = false;
    uint32_t isolatedEpisode = 0u;
    auto isolatedEpisodeFound = false;
    std::size_t i = 0u;

    while ( i < title.length() && ( seasonFound == false || episodeFound == false ) )
    {
        // Reading title[title.length()] is valid and yields '\0', which is
        // what the various `title[i + 1] == 0` checks below rely on.
        const auto c = title[i];
        // Handle S(eason) or B(ook)
        if ( ( c == 's' || c == 'S' || c == 'b' || c == 'B' ) &&
             seasonFound == false && ( i == 0 || isAnySeparator( title[i - 1] ) ) )
        {
            if ( title[i + 1] == 0 )
                return std::make_tuple( false, 0u, 0u );
            if ( strncasecmp( begin + i, "season", strlen( "season" ) ) == 0 )
                i += strlen( "season" );
            else if ( strncasecmp( begin + i, "book", strlen( "book" ) ) == 0 )
                i += strlen( "book" );
            else
                ++i;
            const char* end = nullptr;
            uint32_t value = 0u;
            if ( parseNumber( begin + i, end, value ) == false )
            {
                // Assume this is not a valid numeric value, but not an issue
                // per-se. Otherwise we'd match any word starting with an S
                // and return a failure.
                continue;
            }
            seasonId = value;
            seasonFound = true;
            seasonEndPtr = end;
            i = static_cast<std::size_t>( end - begin );
        }
        else if ( ( c == 'e' || c == 'E' ) && episodeFound == false )
        {
            // We want to avoid matching xxxxxe20 as an episode if it's part of
            // a word. For now we only accept an e02 without a space before if
            // it's stuck to a sXX
            if ( i != 0 && isAnySeparator( title[i - 1] ) == false &&
                 begin + i != seasonEndPtr )
            {
                ++i;
                continue;
            }
            if ( title[i + 1] == 0 )
                return std::make_tuple( false, 0u, 0u );
            ++i;
            const char* end = nullptr;
            uint32_t value = 0u;
            if ( parseNumber( begin + i, end, value ) == false )
                continue;
            episodeId = value;
            episodeFound = true;
            i = static_cast<std::size_t>( end - begin );
        }
        else if ( isDigit( c ) && ( i == 0 || isAnySeparator( title[i - 1] ) ) &&
                  seasonFound == false && episodeFound == false )
        {
            // Try to detect <season>x<episode>
            if ( title[i + 1] == 0 )
                break;
            const char* end = nullptr;
            uint32_t tmpSeasonId = 0u;
            if ( parseNumber( begin + i, end, tmpSeasonId ) == false )
            {
                // Not a <season>X<episode>, so jump over it and continue parsing
                ++i;
                continue;
            }
            while ( isSpace( *end ) )
                ++end;
            if ( *end != 'x' && *end != 'X' )
            {
                ++i;
                continue;
            }
            ++end;
            const char* episodeEnd = nullptr;
            uint32_t tmpEpisodeId = 0u;
            if ( parseNumber( end, episodeEnd, tmpEpisodeId ) == false )
            {
                ++i;
                continue;
            }
            // Arbitrary limit above which we assume SSSSxEEEE to be a resolution
            // and not a Season x Episode pattern
            if ( tmpSeasonId < 379 )
            {
                seasonId = tmpSeasonId;
                seasonFound = true;
                episodeId = tmpEpisodeId;
                episodeFound = true;
            }
            i = static_cast<std::size_t>( episodeEnd - begin );
        }
        // Try to match a number that would be given after a season
        // for instance: /Season 2/01-
        else if ( isDigit( c ) && seasonFound == true && episodeFound == false )
        {
            if ( seasonEndPtr == nullptr )
            {
                assert( false );
                break;
            }
            const char* currentPos = begin + i - 1;
            while ( currentPos > seasonEndPtr && isAnySeparator( *currentPos ) )
                --currentPos;
            // If this was not immediately after the end of season information,
            // don't go further
            if ( currentPos != seasonEndPtr )
                break;
            const char* end = nullptr;
            uint32_t value = 0u;
            if ( parseNumber( begin + i, end, value ) == false )
            {
                // Always move forward: retrying the very same position would
                // fail in the very same way and loop forever.
                ++i;
                continue;
            }
            episodeId = value;
            episodeFound = true;
            i = static_cast<std::size_t>( end - begin );
        }
        // Attempt to match isolated number as an episode number without a season
        // but don't assign it immediately in case there is a better match later
        else if ( isSeparator( c ) && isolatedEpisodeFound == false )
        {
            if ( title[i + 1] == 0 )
                break;
            ++i;
            while ( isAnySeparator( title[i] ) )
                ++i;
            if ( isDigit( title[i] ) == false )
                continue;
            const char* end = nullptr;
            uint32_t value = 0u;
            if ( parseNumber( begin + i, end, value ) == false )
                continue;
            isolatedEpisode = value;
            // Check that we have a separator afterward
            auto j = static_cast<std::size_t>( end - begin );
            while ( isSpace( title[j] ) )
                ++j;
            // shortcut in case we're reaching the end of the title
            if ( title[j] == 0 )
            {
                isolatedEpisodeFound = true;
                break;
            }
            if ( isSeparator( title[j] ) == false )
                continue;
            // Resume the loop after the separator, since we assume the entire
            // <separator> <episode number> <separator> to have been processed
            // and to not match another predicate from this loop.
            i = j + 1;
            isolatedEpisodeFound = true;
        }
        // Don't systematically increment i at the end of the loop to avoid
        // jumping over the next token
        else
            ++i;
    }
    if ( isolatedEpisodeFound == true && seasonFound == false )
        return std::make_tuple( true, 0u, isolatedEpisode );
    // Otherwise assume we matched an episode id as isolated but it wasn't isolated.
    // This can happen with some patterns like Season XX/YY; sXX-YY-Something else
    if ( isolatedEpisodeFound == true )
        return std::make_tuple( true, seasonId, isolatedEpisode );
    return std::make_tuple( seasonFound && episodeFound, seasonId, episodeId );
}

}
}
}
/*****************************************************************************
* Media Library
*****************************************************************************
* Copyright (C) 2019 Hugo Beauzée-Luyssen, Videolabs, VideoLAN
*
* Authors: Hugo Beauzée-Luyssen<hugo@beauzee.fr>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#pragma once
#include <tuple>
#include <string>
namespace medialibrary
{
namespace utils
{
namespace title
{
/**
 * @brief analyze Tries to analyze a title and extract season/episode information
 * from it
 *
 * The title is expected to carry the numbers in one of the usual release
 * naming forms, such as S01E02, "Season 2/01", "Book01E01", 1x02, or an
 * isolated number enclosed by separators ("Show - 10 [720p]").
 *
 * @param title The episode title
 * @return A tuple containing:
 * - A success boolean. The two numbers below should only be relied upon when
 *   it is true
 * - The season number, which is 0 when only an isolated episode number was
 *   found
 * - The episode number
 */
std::tuple<bool, uint32_t, uint32_t> analyze( const std::string& title );
}
}
}
/*****************************************************************************
* Media Library
*****************************************************************************
* Copyright (C) 2019 Hugo Beauzée-Luyssen, Videolabs, VideoLAN
*
* Authors: Hugo Beauzée-Luyssen<hugo@beauzee.fr>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#if HAVE_CONFIG_H
# include "config.h"
#endif
#include "gtest/gtest.h"
#include "utils/TitleAnalyzer.h"
using namespace medialibrary;
/*
 * Runs the title analyzer on `t` and asserts that the success flag equals
 * `exp`. When a match is expected, also asserts that the season is `s` and the
 * episode is `e`.
 *
 * There is intentionally no semicolon after `while( 0 )`: the caller provides
 * it, so that `check( ... );` expands to exactly one statement and remains
 * usable as the body of an unbraced if/else.
 */
#define check( s, e, exp, t ) \
    do \
    { \
        auto res = utils::title::analyze( t ); \
        ASSERT_EQ( exp, std::get<0>( res ) ); \
        if ( ( exp ) == false ) break; \
        ASSERT_EQ( s, std::get<1>( res ) ); \
        ASSERT_EQ( e, std::get<2>( res ) ); \
    } \
    while( 0 )
// Titles carrying (or deliberately lacking) both a season and an episode number
TEST( TitleAnalyzer, Simple )
{
    struct Sample
    {
        uint32_t season;
        uint32_t episode;
        bool matches;
        const char* title;
    };
    // Samples are verified in declaration order; the first mismatch aborts the test
    static const Sample samples[] = {
        { 0u, 0u, false, "not exactly a correct title" },
        { 7u, 1u, true, "GoT S07e01.mkv" },
        { 7u, 1u, true, "GoT S 07 e 01.mkv" },
        { 7u, 1u, true, "GoT S 07 e 01.mkv" },
        { 3u, 4u, true, "Futurama 3x4.avi" },
        { 3u, 4u, true, "Futurama 03x04.avi" },
        { 3u, 4u, true, "Futurama 03x 04.avi" },
        { 3u, 4u, true, "Futurama 03 x 04.avi" },
        { 3u, 4u, true, "Futurama 03 x 04" },
        { 3u, 4u, true, "Futurama 03 x 04.avi" },
        // A leading 0x must be read as season 0, not as an hexadecimal prefix
        { 0u, 4u, true, "Futurama 0x4.avi" },
        { 0u, 0u, false, "Super show S" },
        { 1u, 2u, true, "Fake03x02Season Number s01e00000002.mkv" },
        { 0u, 0u, false, "1230 929 12312878768567x test" },
        { 0u, 0u, false, "Note02 ane04 episode91 id" },
        { 0u, 0u, false, "s02 Note02 ane04 episode91 id" },
        { 3u, 9u, true, "This show starts with sssses s03e09" },
        { 1u, 2u, true, "Super tvshow.s01.e02" },
        { 1u, 2u, true, "Super tvshow.01x02" },
        { 6u, 7u, true, "e07 S6 Reverse order" },
        { 6u, 7u, true, "> e07 S6 Reverse order" },
        { 0u, 0u, false, "" },
        { 0u, 0u, false, " \t\t" },
        { 0u, 0u, false, "s" },
        { 0u, 0u, false, "se" },
        { 0u, 0u, false, "e" },
        { 0u, 0u, false, "s01e" },
        { 0u, 0u, false, "The Overflow s01111111111111111111111e2222222222222222222222222222222222222222222222222" },
        { 1u, 1u, true, "Scrubs [1X01].avi" },
        { 1u, 1u, true, "Stranger.Things.S01E01.Chapter.One.The.Vanishing.Of.Will.Byers.720p.WebRip.x264-[MULVAcoded].mkv" },
        { 1u, 1u, true, "Sherlock.1x01.A.Study.In.Pink.720p.HDTV.x264-FoV.mkv" },
        { 2u, 1u, true, "The IT CRow/Season 2/01-The Work Outing.avi" },
        { 1u, 8u, true, "Orange.Is.The.New.Black.S01E08.720p.WEBRip.AAC2.0-Abjex.mkv" },
        { 10u, 1u, true, "Two.and.a.Half.Men.S10E01.HDTV.x264-LOL.mp4" },
        { 1u, 1u, true, "Avatar_The_Last_Airbender_B01E01_The_Boy_in_the_Iceberg.mkv" },
        { 1u, 1u, true, "Avatar_The_Last_Airbender_Book01E01_The_Boy_in_the_Iceberg.mkv" },
        { 12u, 34u, true, "Annoying patterns, season12/34" },
        { 56u, 78u, true, "Annoying patterns, s56-78-grmpf" },
        { 0u, 0u, false, "Babylon 5" },
        { 0u, 0u, false, "Blake's 7" },
        { 0u, 0u, false, "Car 54, Where are you?" },
        { 0u, 0u, false, "Don't Trust the B---- in Apartment 23" },
    };
    for ( const auto& sample : samples )
    {
        check( sample.season, sample.episode, sample.matches, sample.title );
    }
}
// Titles which only provide an isolated episode number, or no usable number at all
TEST( TitleAnalyzer, NoSeason )
{
    struct Sample
    {
        uint32_t season;
        uint32_t episode;
        bool matches;
        const char* title;
    };
    // Samples are verified in declaration order; the first mismatch aborts the test
    static const Sample samples[] = {
        { 0u, 10u, true, "[PuyaSubs!] Uchuu Patrol Luluco - 10 [720p][967D0521]" },
        { 0u, 15u, true, "[Eclipse] Code Geass - Lelouch of the Rebellion R2 - 15 (1280x720 h264) [DCA806F7]" },
        { 0u, 0u, false, "007 Is actually a movie" },
        { 0u, 0u, false, "James Bond 007 is still just a movie" },
        { 0u, 150u, true, "Episode at the end - 150" },
        { 0u, 6u, true, "DBZ.-.006.-.Pour.une.victoire.définitive.avi" },
        { 0u, 390u, true, "[Ruffy] Detective Conan 424 [GER] - 390 [JAP][DVDRip][H.265][800x592].mkv" },
    };
    for ( const auto& sample : samples )
    {
        check( sample.season, sample.episode, sample.matches, sample.title );
    }
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment