Skip to content
Snippets Groups Projects
Commit 86f9e53b authored by Lyndon Brown's avatar Lyndon Brown Committed by Jean-Baptiste Kempf
Browse files

cmdline: add jaro-winkler string measuring implementation

to be used with suggestion matching.

this implementation is based upon the implementation from the `strsim` Rust
crate, authored by Danny Guo, available at [1]; more specifically the (as
yet un-merged) optimised copy authored by myself, available at [2]. the
code is available under the MIT license.

one implementation difference is that we use floats rather than doubles
since we only intend to use this for suggestion matching in unknown option
error messages and we don't need the extra precision.

[1]: https://github.com/dguo/strsim-rs
[2]: https://github.com/dguo/strsim-rs/pull/31
parent 8fb6f8d0
No related branches found
No related tags found
No related merge requests found
......@@ -213,6 +213,8 @@ libvlccore_la_SOURCES = \
config/cmdline.c \
config/getopt.c \
config/vlc_getopt.h \
config/jaro_winkler.c \
config/vlc_jaro_winkler.h \
extras/libc.c \
media_source/media_source.c \
media_source/media_source.h \
......@@ -596,6 +598,7 @@ check_PROGRAMS = \
test_executor \
test_i18n_atof \
test_interrupt \
test_jaro_winkler \
test_list \
test_md5 \
test_picture_pool \
......@@ -626,6 +629,7 @@ test_executor_SOURCES = test/executor.c
test_i18n_atof_SOURCES = test/i18n_atof.c
test_interrupt_SOURCES = test/interrupt.c
test_interrupt_LDADD = $(LDADD) $(LIBS_libvlccore)
test_jaro_winkler_SOURCES = test/jaro_winkler.c config/jaro_winkler.c
test_list_SOURCES = test/list.c
test_md5_SOURCES = test/md5.c
test_picture_pool_SOURCES = test/picture_pool.c
......
/*****************************************************************************
* jaro_winkler.c: jaro winkler string similarity algorithm implementation
*****************************************************************************
* Copyright 2015 Danny Guo
* Copyright 2018 Lyndon Brown
*
* Authors: Danny Guo <dguo@users.noreply.github.com>
* Lyndon Brown <jnqnfe@gmail.com>
*
* Licensed under the MIT license. You may not copy, modify, or distribute this
* file except in compliance with said license. You can find a copy of this
* license either in the LICENSE file, or alternatively at
* <http://opensource.org/licenses/MIT>.
*****************************************************************************
* This file is based upon the Jaro Winkler implementation of the `strsim`
* Rust crate, authored by Danny Guo, available at
* <https://github.com/dguo/strsim-rs>; more specifically the (as yet un-merged)
* optimised copy authored by myself (Lyndon Brown), available at
* <https://github.com/dguo/strsim-rs/pull/31>. The code is available under the
* MIT license.
*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <sys/types.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include "vlc_jaro_winkler.h"
/* Maximum / minimum of two values. These are plain macros: whichever argument
 * is selected is evaluated a second time, so arguments must not have side
 * effects (no `i++`, no function calls with observable effects). */
#define MAX(a, b) ( ((a) > (b)) ? (a) : (b) )
#define MIN(a, b) ( ((a) < (b)) ? (a) : (b) )
/**
 * Counts the leading bytes that are identical in both strings.
 *
 * \param a first NUL-terminated string
 * \param b second NUL-terminated string
 * \return the length, in bytes, of the prefix common to a and b
 */
static inline size_t split_on_common_prefix(const char *a, const char *b) {
    size_t len = 0;
    /* a[len] != '\0' together with a[len] == b[len] also implies that
     * b[len] != '\0', so neither string is read past its terminator. */
    while (a[len] != '\0' && a[len] == b[len]) {
        len++;
    }
    return len;
}
/**
 * Inner Jaro algorithm.
 *
 * The common prefix of both strings is counted up front (every prefix byte is
 * a match by definition) and only the remaining suffixes are scanned. The
 * prefix length is passed back so that the Jaro-Winkler wrapper does not have
 * to compute it a second time.
 *
 * \param a             string A (must not be NULL)
 * \param b             string B (must not be NULL)
 * \param ret_prefix_cc [OUT] receives the common prefix length in bytes; it is
 *                      set to 0 when the result is decided before any
 *                      matching takes place (one string empty, or both
 *                      strings identical)
 * \param res           [OUT] receives the Jaro similarity (0.0 on failure)
 * \return -1 on memory allocation failure, otherwise 0
 */
static inline int jaro_inner(const char *a, const char *b, size_t *ret_prefix_cc, float* res) {
    assert(a && b && ret_prefix_cc && res);

    /* Give the out-parameter a defined value on every return path. */
    *ret_prefix_cc = 0;

    /* Exactly one of the two strings is empty: nothing in common. */
    if ((a[0] == '\0') != (b[0] == '\0')) {
        *res = 0.0f;
        return 0;
    }

    size_t prefix_char_count = split_on_common_prefix(a, b);
    const char *a_suffix = a + prefix_char_count;
    const char *b_suffix = b + prefix_char_count;

    /* Nothing left after the common prefix: the strings are identical
     * (this also covers the case where both are empty). */
    if (a_suffix[0] == '\0' && b_suffix[0] == '\0') {
        *res = 1.0f;
        return 0;
    }

    *ret_prefix_cc = prefix_char_count;

    size_t a_suffix_len = strlen(a_suffix);
    size_t b_suffix_len = strlen(b_suffix);
    size_t a_numchars = a_suffix_len + prefix_char_count;
    size_t b_numchars = b_suffix_len + prefix_char_count;

    /* Two different one-byte strings. Handling this here also keeps the
     * search range computation below from wrapping around (1 / 2 - 1). */
    if (a_numchars == 1 && b_numchars == 1) {
        *res = 0.0f;
        return 0;
    }

    /* From here on the longer string has at least two bytes. */
    size_t longest = (a_numchars > b_numchars) ? a_numchars : b_numchars;
    size_t search_range = (longest / 2) - 1;

    /* One flag per byte of b; indices are relative to b_suffix. */
    bool *b_consumed = calloc(b_numchars, sizeof(*b_consumed));
    if (!b_consumed) {
        *res = 0.0f;
        return -1;
    }

    size_t matches = prefix_char_count;
    size_t transpositions = 0;
    size_t b_match_index = 0;

    for (size_t i = 0; i < a_suffix_len; i++) {
        /* Window of b_suffix positions that a_suffix[i] may match. The upper
         * bound is clamped to the length of the *suffix*: clamping to the
         * full length of b would let the scan start beyond b's terminator
         * whenever the strings share a prefix and a is much longer. */
        size_t bound_start = (i > search_range) ? i - search_range : 0;
        size_t window_end = i + search_range + 1;
        size_t bound_end = (window_end < b_suffix_len) ? window_end : b_suffix_len;

        for (size_t j = bound_start; j < bound_end; j++) {
            if (a_suffix[i] == b_suffix[j] && !b_consumed[j]) {
                b_consumed[j] = true;
                matches++;
                /* A match located before the previous one is out of order. */
                if (j < b_match_index) {
                    transpositions++;
                }
                b_match_index = j;
                break;
            }
        }
    }

    /* The scratch buffer is not needed for the final computation. */
    free(b_consumed);

    if (matches == 0) {
        *res = 0.0f;
        return 0;
    }

    *res = (1.0 / 3.0) *
           (((float)matches / (float)a_numchars) +
            ((float)matches / (float)b_numchars) +
            (((float)matches - (float)transpositions) / (float)matches));
    return 0;
}
/**
 * Computes the “Jaro Winkler” similarity of two strings.
 *
 * Algorithm: <http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
 *
 * The plain “Jaro” score is raised in proportion to the length of the prefix
 * shared by both strings; the final value is clamped so that it never
 * exceeds 1.0.
 *
 * \note: The length of the common prefix taken into account is not capped by
 * this implementation.
 *
 * \param a string A
 * \param b string B
 * \param res [OUT] a pointer to a float to receive the result
 * \return -1 on memory allocation failure, otherwise 0
 */
int vlc_jaro_winkler(const char *a, const char *b, float* res) {
    size_t shared_prefix = 0;
    float jaro;

    if (jaro_inner(a, b, &shared_prefix, &jaro) != 0) {
        return -1;
    }

    /* Winkler boost: 0.1 per shared prefix byte, applied to whatever
     * separates the Jaro score from a perfect match. */
    float boosted = jaro + (0.1 * (float)shared_prefix * (1.0 - jaro));

    if (boosted <= 1.0) {
        *res = boosted;
    } else {
        *res = 1.0;
    }
    return 0;
}
/*****************************************************************************
* vlc_jaro_winkler.h: jaro winkler string similarity algorithm interface
*****************************************************************************
* Copyright 2015 Danny Guo
* Copyright 2018, 2019 Lyndon Brown
*
* Authors: Danny Guo <dguo@users.noreply.github.com>
* Lyndon Brown <jnqnfe@gmail.com>
*
* Licensed under the MIT license. You may not copy, modify, or distribute this
* file except in compliance with said license. You can find a copy of this
* license either in the LICENSE file, or alternatively at
* <http://opensource.org/licenses/MIT>.
*****************************************************************************
* This file is based upon the Jaro Winkler implementation of the `strsim`
* Rust crate, authored by Danny Guo, available at
* <https://github.com/dguo/strsim-rs>; more specifically the (as yet un-merged)
* optimised copy authored by myself (Lyndon Brown), available at
* <https://github.com/dguo/strsim-rs/pull/31>. The code is available under the
* MIT license.
*****************************************************************************/
#ifndef VLC_JARO_WINKLER_H
#define VLC_JARO_WINKLER_H 1
/**
* Calculate a “Jaro Winkler” metric.
*
* Algorithm: <http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
*
* Like “Jaro” but gives a boost to strings that have a common prefix.
*
* \note: This implementation does not place a limit on the common prefix
* length adjusted for.
*
* \param a string A
* \param b string B
* \param res [OUT] a pointer to a float to receive the result
* \return -1 on memory allocation failure, otherwise 0
*/
int vlc_jaro_winkler(const char *a, const char *b, float *res);
#endif
/*****************************************************************************
* jaro_winkler.c: Tests for our Jaro Winkler algorithm
*****************************************************************************
* Copyright 2015 Danny Guo
* Copyright 2018 Lyndon Brown
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h> /* fabs() */
#include <vlc_common.h>
#include <vlc_strings.h>
#include "../config/vlc_jaro_winkler.h"
const char vlc_module_name[] = "test_jarowinkler";
/*
 * Exact-match check: runs vlc_jaro_winkler(a, b) and requires the result to be
 * exactly `expected`.
 *
 * Uses the variables `actual` (float), `failed` and `problems` (bool) of the
 * calling scope; `problems` accumulates failures across all checks.
 *
 * The call under test is deliberately kept outside of assert() so that it is
 * still executed when NDEBUG is defined, and the body is wrapped in
 * do { } while (0) so that the macro expands to a single statement.
 */
# define test1( expected, a, b ) \
    do { \
        if (vlc_jaro_winkler((a), (b), &actual) != 0) { \
            fprintf(stderr, "[TEST] vlc_jaro_winkler() reported a failure, (a: %s), (b: %s)\n", \
                    (a), (b)); \
            abort(); \
        } \
        failed = (actual != (expected)); \
        problems |= failed; \
        printf("[TEST] expected: %f, actual: %f, accuracy: n/a, result: %s, (a: %s), (b: %s)\n", \
               (expected), actual, (failed) ? "FAIL" : "pass", (a), (b)); \
    } while (0)
/*
 * Approximate check: runs vlc_jaro_winkler(a, b) and requires the result to
 * differ from `expected` by less than `accuracy`.
 *
 * Uses the variables `actual` (float), `failed` and `problems` (bool) of the
 * calling scope; `problems` accumulates failures across all checks.
 *
 * The call under test is deliberately kept outside of assert() so that it is
 * still executed when NDEBUG is defined, and the body is wrapped in
 * do { } while (0) so that the macro expands to a single statement.
 */
# define test2( expected, a, b, accuracy ) \
    do { \
        if (vlc_jaro_winkler((a), (b), &actual) != 0) { \
            fprintf(stderr, "[TEST] vlc_jaro_winkler() reported a failure, (a: %s), (b: %s)\n", \
                    (a), (b)); \
            abort(); \
        } \
        failed = (fabs((expected) - actual) >= (accuracy)); \
        problems |= failed; \
        printf("[TEST] expected: %f, actual: %f, accuracy: %f, result: %s, (a: %s), (b: %s)\n", \
               (expected), actual, (accuracy), (failed) ? "FAIL": "pass", (a), (b)); \
    } while (0)
/*
 * Test driver: walks a table of string pairs, checks each one with test1
 * (exact comparison) or test2 (comparison within a tolerance) and reports
 * every result on stdout. The exit status is non-zero if any check failed.
 */
int main( void )
{
    static const struct
    {
        bool exact;         /* true: use test1, false: use test2 */
        double want;        /* expected similarity */
        const char *lhs;    /* string A */
        const char *rhs;    /* string B */
        double tolerance;   /* permitted deviation (test2 only) */
    } cases[] = {
        { true,  1.0,   "", "", 0.0 },                                  /* both_empty */
        { true,  0.0,   "", "jaro-winkler", 0.0 },                      /* first_empty */
        { true,  0.0,   "distance", "", 0.0 },                          /* second_empty */
        { true,  1.0,   "Jaro-Winkler", "Jaro-Winkler", 0.0 },          /* same */
        { false, 0.813, "dixon", "dicksonx", 0.001 },                   /* diff_short */
        { false, 0.813, "dicksonx", "dixon", 0.001 },
        { true,  1.0,   "a", "a", 0.0 },                                /* same_one_character */
        { true,  0.0,   "a", "b", 0.0 },                                /* diff_one_character */
        { false, 0.840, "dwayne", "duane", 0.001 },                     /* diff_no_transposition */
        { false, 0.961, "martha", "marhta", 0.001 },                    /* diff_with_transposition */
        { false, 0.562, "Friedrich Nietzsche", "Fran-Paul Sartre", 0.001 }, /* names */
        { false, 0.911, "cheeseburger", "cheese fries", 0.001 },        /* long_prefix */
        { false, 0.868, "Thorkel", "Thorgier", 0.001 },                 /* more_names */
        { false, 0.738, "Dinsdale", "D", 0.001 },                       /* length_of_one */
        { false, 1.0,   "thequickbrownfoxjumpedoverx",
                        "thequickbrownfoxjumpedovery", 0.001 },         /* very_long_prefix */
    };

    /* State shared with the test1/test2 macros. */
    bool problems = false, failed = false;
    float actual;

    for (size_t n = 0; n < sizeof(cases) / sizeof(cases[0]); n++)
    {
        if (cases[n].exact)
        {
            test1(cases[n].want, cases[n].lhs, cases[n].rhs);
        }
        else
        {
            test2(cases[n].want, cases[n].lhs, cases[n].rhs, cases[n].tolerance);
        }
    }

    if (problems)
    {
        return -1;
    }
    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment