diff options
Diffstat (limited to 'src/matching.c')
-rw-r--r-- | src/matching.c | 301 |
1 files changed, 301 insertions, 0 deletions
diff --git a/src/matching.c b/src/matching.c new file mode 100644 index 0000000..b0d184f --- /dev/null +++ b/src/matching.c @@ -0,0 +1,301 @@ +#include <ctype.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "matching.h" +#include "unicode.h" +#include "xmalloc.h" + +#undef MAX +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +static int32_t simple_match_words( + const char *restrict patterns, + const char *restrict str); + +static int32_t prefix_match_words( + const char *restrict patterns, + const char *restrict str); + +static int32_t fuzzy_match_words( + const char *restrict patterns, + const char *restrict str); + +static int32_t fuzzy_match( + const char *restrict pattern, + const char *restrict str); + +static int32_t fuzzy_match_recurse( + const char *restrict pattern, + const char *restrict str, + int32_t score, + bool first_match_only, + bool first_char); + +static int32_t compute_score( + int32_t jump, + bool first_char, + const char *restrict match); + +/* + * Select the appropriate algorithm, and return its score. + * Each algorithm returns larger scores for better matches, + * and returns INT32_MIN if a word is not found. + */ +int32_t match_words( + enum matching_algorithm algorithm, + const char *restrict patterns, + const char *restrict str) +{ + switch (algorithm) { + case MATCHING_ALGORITHM_NORMAL: + return simple_match_words(patterns, str); + case MATCHING_ALGORITHM_PREFIX: + return prefix_match_words(patterns, str); + case MATCHING_ALGORITHM_FUZZY: + return fuzzy_match_words(patterns, str); + default: + return INT32_MIN; + } +} + +/* + * Split patterns into words, and perform simple matching against str for each. + * Returns the negative sum of substring distances from the start of str. + * If a word is not found, returns INT32_MIN. + */ +int32_t simple_match_words(const char *restrict patterns, const char *restrict str) +{ + int32_t score = 0; + char *saveptr = NULL; + char *tmp = utf8_normalize(patterns); + char *pattern = strtok_r(tmp, " ", &saveptr); + while (pattern != NULL) { + char *c = utf8_strcasestr(str, pattern); + if (c == NULL) { + score = INT32_MIN; + break; + } else { + score -= c - str; + } + pattern = strtok_r(NULL, " ", &saveptr); + } + free(tmp); + return score; +} + +/* + * Split patterns into words, and perform prefix matching against str for each. + * Returns the negative sum of remaining string suffix lengths. + * If a word is not found, returns INT32_MIN. + */ +int32_t prefix_match_words(const char *restrict patterns, const char *restrict str) +{ + int32_t score = 0; + char *saveptr = NULL; + char *tmp = utf8_normalize(patterns); + char *pattern = strtok_r(tmp, " ", &saveptr); + while (pattern != NULL) { + char *c = utf8_strcasestr(str, pattern); + if (c != str) { + score = INT32_MIN; + break; + } else { + score -= utf8_strlen(str) - utf8_strlen(pattern); + } + pattern = strtok_r(NULL, " ", &saveptr); + } + free(tmp); + return score; +} + + +/* + * Split patterns into words, and return the sum of fuzzy_match(word, str). + * If a word is not found, returns INT32_MIN. + */ +int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict str) +{ + int32_t score = 0; + char *saveptr = NULL; + char *tmp = utf8_normalize(patterns); + char *pattern = strtok_r(tmp, " ", &saveptr); + while (pattern != NULL) { + int32_t word_score = fuzzy_match(pattern, str); + if (word_score == INT32_MIN) { + score = INT32_MIN; + break; + } else { + score += word_score; + } + pattern = strtok_r(NULL, " ", &saveptr); + } + free(tmp); + return score; +} + +/* + * Returns score if each character in pattern is found sequentially within str. + * Returns INT32_MIN otherwise. + */ +int32_t fuzzy_match(const char *restrict pattern, const char *restrict str) +{ + const int unmatched_letter_penalty = -1; + const size_t slen = utf8_strlen(str); + const size_t plen = utf8_strlen(pattern); + int32_t score = 0; + + if (*pattern == '\0') { + return score; + } + if (slen < plen) { + return INT32_MIN; + } + + /* We can already penalise any unused letters. */ + score += unmatched_letter_penalty * (int32_t)(slen - plen); + + /* + * If the string is more than 100 characters, just find the first fuzzy + * match rather than the best. + * + * This is required as the number of possible matches (for patterns and + * strings all consisting of one letter) scales something like: + * + * slen! / (plen! (slen - plen)!) ~ slen^plen for plen << slen + * + * This quickly grinds everything to a halt. 100 is chosen fairly + * arbitrarily from the following logic: + * + * - e is the most common character in English, at around 13% of + * letters. Depending on the context, let's say this be up to 20%. + * - 100 * 0.20 = 20 repeats of the same character. + * - In the worst case here, 20! / (10! 10!) ~200,000 possible matches, + * which is "slow but not frozen" for my machine. + * + * In reality, this worst case shouldn't be hit, and finding the "best" + * fuzzy match in lines of text > 100 characters isn't really in scope + * for a dmenu clone. + */ + bool first_match_only = slen > 100; + + /* Perform the match. */ + score = fuzzy_match_recurse(pattern, str, score, first_match_only, true); + + return score; +} + +/* + * Recursively match the whole of pattern against str. + * The score parameter is the score of the previously matched character. + * + * This reaches a maximum recursion depth of strlen(pattern) + 1. However, the + * stack usage is small (the maximum I've seen on x86_64 is 144 bytes with + * gcc -O3), so this shouldn't matter unless pattern contains thousands of + * characters. + */ +int32_t fuzzy_match_recurse( + const char *restrict pattern, + const char *restrict str, + int32_t score, + bool first_match_only, + bool first_char) +{ + if (*pattern == '\0') { + /* We've matched the full pattern. */ + return score; + } + + const char *match = str; + uint32_t search = utf8_to_utf32(pattern); + + int32_t best_score = INT32_MIN; + + /* + * Find all occurrences of the next pattern character in str, and + * recurse on them. + */ + while ((match = utf8_strcasechr(match, search)) != NULL) { + int32_t jump = 0; + for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) { + jump++; + } + int32_t subscore = fuzzy_match_recurse( + utf8_next_char(pattern), + utf8_next_char(match), + compute_score(jump, first_char, match), + first_match_only, + false); + best_score = MAX(best_score, subscore); + match = utf8_next_char(match); + + if (first_match_only) { + break; + } + } + + if (best_score == INT32_MIN) { + /* We couldn't match the rest of the pattern. */ + return INT32_MIN; + } else { + return score + best_score; + } +} + +/* + * Calculate the score for a single matching letter. + * The scoring system is taken from fts_fuzzy_match v0.2.0 by Forrest Smith, + * which is licensed to the public domain. + * + * The factors affecting score are: + * - Bonuses: + * - If there are multiple adjacent matches. + * - If a match occurs after a separator character. + * - If a match is uppercase, and the previous character is lowercase. + * + * - Penalties: + * - If there are letters before the first match. + * - If there are superfluous characters in str (already accounted for). + */ +int32_t compute_score(int32_t jump, bool first_char, const char *restrict match) +{ + const int adjacency_bonus = 15; + const int separator_bonus = 30; + const int camel_bonus = 30; + const int first_letter_bonus = 15; + + const int leading_letter_penalty = -5; + const int max_leading_letter_penalty = -15; + + int32_t score = 0; + + const uint32_t cur = utf8_to_utf32(match); + + /* Apply bonuses. */ + if (!first_char && jump == 0) { + score += adjacency_bonus; + } + if (!first_char || jump > 0) { + const uint32_t prev = utf8_to_utf32(utf8_prev_char(match)); + if (utf32_isupper(cur) && utf32_islower(prev)) { + score += camel_bonus; + } + if (utf32_isalnum(cur) && !utf32_isalnum(prev)) { + score += separator_bonus; + } + } + if (first_char && jump == 0) { + /* Match at start of string gets separator bonus. */ + score += first_letter_bonus; + } + + /* Apply penalties. */ + if (first_char) { + score += MAX(leading_letter_penalty * jump, + max_leading_letter_penalty); + } + + return score; +} |