diff options
author | Phil Jones <philj56@gmail.com> | 2023-04-17 23:43:05 +0100 |
---|---|---|
committer | Phil Jones <philj56@gmail.com> | 2023-04-17 23:43:05 +0100 |
commit | 574eff0df1aff9bdc6d32939a03312cc08803de3 (patch) | |
tree | 5aeca72f70314bee3bf95db99f10d89f0a7b4032 /src/fuzzy_match.c | |
parent | 71a4801d20d8904cfcfa5e92c96d53ee06a2c69f (diff) |
Add --matching-algorithm option.
This replaces the --fuzzy-match algorithm. Available choices are normal,
prefix and fuzzy. Levenshtein distance was investigated, but it seems
pretty rubbish for tofi's use case, where you normally want a good match when
you've only typed a small portion of the target string.
Diffstat (limited to 'src/fuzzy_match.c')
-rw-r--r-- | src/fuzzy_match.c | 238 |
1 files changed, 0 insertions, 238 deletions
diff --git a/src/fuzzy_match.c b/src/fuzzy_match.c deleted file mode 100644 index e9f1dcb..0000000 --- a/src/fuzzy_match.c +++ /dev/null @@ -1,238 +0,0 @@ -#include <ctype.h> -#include <stdbool.h> -#include <stddef.h> -#include <stdint.h> -#include <string.h> - -#include "fuzzy_match.h" -#include "unicode.h" -#include "xmalloc.h" - -#undef MAX -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - -static int32_t compute_score( - int32_t jump, - bool first_char, - const char *restrict match); - -static int32_t fuzzy_match_recurse( - const char *restrict pattern, - const char *restrict str, - int32_t score, - bool first_match_only, - bool first_char); - -/* - * Split patterns into words, and perform simple matching against str for each. - * Returns the sum of substring distances from the start of str. - * If a word is not found, returns INT32_MIN. - */ -int32_t fuzzy_match_simple_words(const char *restrict patterns, const char *restrict str) -{ - int32_t score = 0; - char *saveptr = NULL; - char *tmp = utf8_normalize(patterns); - char *pattern = strtok_r(tmp, " ", &saveptr); - while (pattern != NULL) { - char *c = utf8_strcasestr(str, pattern); - if (c == NULL) { - score = INT32_MIN; - break; - } else { - score += str - c; - } - pattern = strtok_r(NULL, " ", &saveptr); - } - free(tmp); - return score; -} - - -/* - * Split patterns into words, and return the sum of fuzzy_match(word, str). - * If a word is not found, returns INT32_MIN. - */ -int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict str) -{ - int32_t score = 0; - char *saveptr = NULL; - char *tmp = utf8_normalize(patterns); - char *pattern = strtok_r(tmp, " ", &saveptr); - while (pattern != NULL) { - int32_t word_score = fuzzy_match(pattern, str); - if (word_score == INT32_MIN) { - score = INT32_MIN; - break; - } else { - score += word_score; - } - pattern = strtok_r(NULL, " ", &saveptr); - } - free(tmp); - return score; -} - -/* - * Returns score if each character in pattern is found sequentially within str. - * Returns INT32_MIN otherwise. - */ -int32_t fuzzy_match(const char *restrict pattern, const char *restrict str) -{ - const int unmatched_letter_penalty = -1; - const size_t slen = utf8_strlen(str); - const size_t plen = utf8_strlen(pattern); - int32_t score = 0; - - if (*pattern == '\0') { - return score; - } - if (slen < plen) { - return INT32_MIN; - } - - /* We can already penalise any unused letters. */ - score += unmatched_letter_penalty * (int32_t)(slen - plen); - - /* - * If the string is more than 100 characters, just find the first fuzzy - * match rather than the best. - * - * This is required as the number of possible matches (for patterns and - * strings all consisting of one letter) scales something like: - * - * slen! / (plen! (slen - plen)!) ~ slen^plen for plen << slen - * - * This quickly grinds everything to a halt. 100 is chosen fairly - * arbitrarily from the following logic: - * - * - e is the most common character in English, at around 13% of - * letters. Depending on the context, let's say this be up to 20%. - * - 100 * 0.20 = 20 repeats of the same character. - * - In the worst case here, 20! / (10! 10!) ~200,000 possible matches, - * which is "slow but not frozen" for my machine. - * - * In reality, this worst case shouldn't be hit, and finding the "best" - * fuzzy match in lines of text > 100 characters isn't really in scope - * for a dmenu clone. - */ - bool first_match_only = slen > 100; - - /* Perform the match. */ - score = fuzzy_match_recurse(pattern, str, score, first_match_only, true); - - return score; -} - -/* - * Recursively match the whole of pattern against str. - * The score parameter is the score of the previously matched character. - * - * This reaches a maximum recursion depth of strlen(pattern) + 1. However, the - * stack usage is small (the maximum I've seen on x86_64 is 144 bytes with - * gcc -O3), so this shouldn't matter unless pattern contains thousands of - * characters. - */ -int32_t fuzzy_match_recurse( - const char *restrict pattern, - const char *restrict str, - int32_t score, - bool first_match_only, - bool first_char) -{ - if (*pattern == '\0') { - /* We've matched the full pattern. */ - return score; - } - - const char *match = str; - uint32_t search = utf8_to_utf32(pattern); - - int32_t best_score = INT32_MIN; - - /* - * Find all occurrences of the next pattern character in str, and - * recurse on them. - */ - while ((match = utf8_strcasechr(match, search)) != NULL) { - int32_t jump = 0; - for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) { - jump++; - } - int32_t subscore = fuzzy_match_recurse( - utf8_next_char(pattern), - utf8_next_char(match), - compute_score(jump, first_char, match), - first_match_only, - false); - best_score = MAX(best_score, subscore); - match = utf8_next_char(match); - - if (first_match_only) { - break; - } - } - - if (best_score == INT32_MIN) { - /* We couldn't match the rest of the pattern. */ - return INT32_MIN; - } else { - return score + best_score; - } -} - -/* - * Calculate the score for a single matching letter. - * The scoring system is taken from fts_fuzzy_match v0.2.0 by Forrest Smith, - * which is licensed to the public domain. - * - * The factors affecting score are: - * - Bonuses: - * - If there are multiple adjacent matches. - * - If a match occurs after a separator character. - * - If a match is uppercase, and the previous character is lowercase. - * - * - Penalties: - * - If there are letters before the first match. - * - If there are superfluous characters in str (already accounted for). - */ -int32_t compute_score(int32_t jump, bool first_char, const char *restrict match) -{ - const int adjacency_bonus = 15; - const int separator_bonus = 30; - const int camel_bonus = 30; - const int first_letter_bonus = 15; - - const int leading_letter_penalty = -5; - const int max_leading_letter_penalty = -15; - - int32_t score = 0; - - const uint32_t cur = utf8_to_utf32(match); - - /* Apply bonuses. */ - if (!first_char && jump == 0) { - score += adjacency_bonus; - } - if (!first_char || jump > 0) { - const uint32_t prev = utf8_to_utf32(utf8_prev_char(match)); - if (utf32_isupper(cur) && utf32_islower(prev)) { - score += camel_bonus; - } - if (utf32_isalnum(cur) && !utf32_isalnum(prev)) { - score += separator_bonus; - } - } - if (first_char && jump == 0) { - /* Match at start of string gets separator bonus. */ - score += first_letter_bonus; - } - - /* Apply penalties. */ - if (first_char) { - score += MAX(leading_letter_penalty * jump, - max_leading_letter_penalty); - } - - return score; -} |