#include #include #include #include #include #include "matching.h" #include "unicode.h" #include "xmalloc.h" #undef MAX #define MAX(a, b) ((a) > (b) ? (a) : (b)) static int32_t simple_match_words( const char *restrict patterns, const char *restrict str); static int32_t prefix_match_words( const char *restrict patterns, const char *restrict str); static int32_t fuzzy_match_words( const char *restrict patterns, const char *restrict str); static int32_t fuzzy_match( const char *restrict pattern, const char *restrict str); static int32_t fuzzy_match_recurse( const char *restrict pattern, const char *restrict str, int32_t score, bool first_match_only, bool first_char); static int32_t compute_score( int32_t jump, bool first_char, const char *restrict match); /* * Select the appropriate algorithm, and return its score. * Each algorithm returns larger scores for better matches, * and returns INT32_MIN if a word is not found. */ int32_t match_words( enum matching_algorithm algorithm, const char *restrict patterns, const char *restrict str) { switch (algorithm) { case MATCHING_ALGORITHM_NORMAL: return simple_match_words(patterns, str); case MATCHING_ALGORITHM_PREFIX: return prefix_match_words(patterns, str); case MATCHING_ALGORITHM_FUZZY: return fuzzy_match_words(patterns, str); default: return INT32_MIN; } } /* * Split patterns into words, and perform simple matching against str for each. * Returns the negative sum of substring distances from the start of str. * If a word is not found, returns INT32_MIN. */ int32_t simple_match_words(const char *restrict patterns, const char *restrict str) { int32_t score = 0; char *saveptr = NULL; char *tmp = utf8_normalize(patterns); char *pattern = strtok_r(tmp, " ", &saveptr); while (pattern != NULL) { char *c = utf8_strcasestr(str, pattern); if (c == NULL) { score = INT32_MIN; break; } else { score -= c - str; } pattern = strtok_r(NULL, " ", &saveptr); } free(tmp); return score; } /* * Split patterns into words, and perform prefix matching against str for each. * Returns the negative sum of remaining string suffix lengths. * If a word is not found, returns INT32_MIN. */ int32_t prefix_match_words(const char *restrict patterns, const char *restrict str) { int32_t score = 0; char *saveptr = NULL; char *tmp = utf8_normalize(patterns); char *pattern = strtok_r(tmp, " ", &saveptr); while (pattern != NULL) { char *c = utf8_strcasestr(str, pattern); if (c != str) { score = INT32_MIN; break; } else { score -= utf8_strlen(str) - utf8_strlen(pattern); } pattern = strtok_r(NULL, " ", &saveptr); } free(tmp); return score; } /* * Split patterns into words, and return the sum of fuzzy_match(word, str). * If a word is not found, returns INT32_MIN. */ int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict str) { int32_t score = 0; char *saveptr = NULL; char *tmp = utf8_normalize(patterns); char *pattern = strtok_r(tmp, " ", &saveptr); while (pattern != NULL) { int32_t word_score = fuzzy_match(pattern, str); if (word_score == INT32_MIN) { score = INT32_MIN; break; } else { score += word_score; } pattern = strtok_r(NULL, " ", &saveptr); } free(tmp); return score; } /* * Returns score if each character in pattern is found sequentially within str. * Returns INT32_MIN otherwise. */ int32_t fuzzy_match(const char *restrict pattern, const char *restrict str) { const int unmatched_letter_penalty = -1; const size_t slen = utf8_strlen(str); const size_t plen = utf8_strlen(pattern); int32_t score = 0; if (*pattern == '\0') { return score; } if (slen < plen) { return INT32_MIN; } /* We can already penalise any unused letters. */ score += unmatched_letter_penalty * (int32_t)(slen - plen); /* * If the string is more than 100 characters, just find the first fuzzy * match rather than the best. * * This is required as the number of possible matches (for patterns and * strings all consisting of one letter) scales something like: * * slen! / (plen! (slen - plen)!) ~ slen^plen for plen << slen * * This quickly grinds everything to a halt. 100 is chosen fairly * arbitrarily from the following logic: * * - e is the most common character in English, at around 13% of * letters. Depending on the context, let's say this be up to 20%. * - 100 * 0.20 = 20 repeats of the same character. * - In the worst case here, 20! / (10! 10!) ~200,000 possible matches, * which is "slow but not frozen" for my machine. * * In reality, this worst case shouldn't be hit, and finding the "best" * fuzzy match in lines of text > 100 characters isn't really in scope * for a dmenu clone. */ bool first_match_only = slen > 100; /* Perform the match. */ score = fuzzy_match_recurse(pattern, str, score, first_match_only, true); return score; } /* * Recursively match the whole of pattern against str. * The score parameter is the score of the previously matched character. * * This reaches a maximum recursion depth of strlen(pattern) + 1. However, the * stack usage is small (the maximum I've seen on x86_64 is 144 bytes with * gcc -O3), so this shouldn't matter unless pattern contains thousands of * characters. */ int32_t fuzzy_match_recurse( const char *restrict pattern, const char *restrict str, int32_t score, bool first_match_only, bool first_char) { if (*pattern == '\0') { /* We've matched the full pattern. */ return score; } const char *match = str; uint32_t search = utf8_to_utf32(pattern); int32_t best_score = INT32_MIN; /* * Find all occurrences of the next pattern character in str, and * recurse on them. */ while ((match = utf8_strcasechr(match, search)) != NULL) { int32_t jump = 0; for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) { jump++; } int32_t subscore = fuzzy_match_recurse( utf8_next_char(pattern), utf8_next_char(match), compute_score(jump, first_char, match), first_match_only, false); best_score = MAX(best_score, subscore); match = utf8_next_char(match); if (first_match_only) { break; } } if (best_score == INT32_MIN) { /* We couldn't match the rest of the pattern. */ return INT32_MIN; } else { return score + best_score; } } /* * Calculate the score for a single matching letter. * The scoring system is taken from fts_fuzzy_match v0.2.0 by Forrest Smith, * which is licensed to the public domain. * * The factors affecting score are: * - Bonuses: * - If there are multiple adjacent matches. * - If a match occurs after a separator character. * - If a match is uppercase, and the previous character is lowercase. * * - Penalties: * - If there are letters before the first match. * - If there are superfluous characters in str (already accounted for). */ int32_t compute_score(int32_t jump, bool first_char, const char *restrict match) { const int adjacency_bonus = 15; const int separator_bonus = 30; const int camel_bonus = 30; const int first_letter_bonus = 15; const int leading_letter_penalty = -5; const int max_leading_letter_penalty = -15; int32_t score = 0; const uint32_t cur = utf8_to_utf32(match); /* Apply bonuses. */ if (!first_char && jump == 0) { score += adjacency_bonus; } if (!first_char || jump > 0) { const uint32_t prev = utf8_to_utf32(utf8_prev_char(match)); if (utf32_isupper(cur) && utf32_islower(prev)) { score += camel_bonus; } if (utf32_isalnum(cur) && !utf32_isalnum(prev)) { score += separator_bonus; } } if (first_char && jump == 0) { /* Match at start of string gets separator bonus. */ score += first_letter_bonus; } /* Apply penalties. */ if (first_char) { score += MAX(leading_letter_penalty * jump, max_leading_letter_penalty); } return score; }