summaryrefslogtreecommitdiff
path: root/src/matching.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/matching.c')
-rw-r--r--src/matching.c301
1 files changed, 301 insertions, 0 deletions
diff --git a/src/matching.c b/src/matching.c
new file mode 100644
index 0000000..b0d184f
--- /dev/null
+++ b/src/matching.c
@@ -0,0 +1,301 @@
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "matching.h"
+#include "unicode.h"
+#include "xmalloc.h"
+
+#undef MAX
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+static int32_t simple_match_words(
+ const char *restrict patterns,
+ const char *restrict str);
+
+static int32_t prefix_match_words(
+ const char *restrict patterns,
+ const char *restrict str);
+
+static int32_t fuzzy_match_words(
+ const char *restrict patterns,
+ const char *restrict str);
+
+static int32_t fuzzy_match(
+ const char *restrict pattern,
+ const char *restrict str);
+
+static int32_t fuzzy_match_recurse(
+ const char *restrict pattern,
+ const char *restrict str,
+ int32_t score,
+ bool first_match_only,
+ bool first_char);
+
+static int32_t compute_score(
+ int32_t jump,
+ bool first_char,
+ const char *restrict match);
+
+/*
+ * Select the appropriate algorithm, and return its score.
+ * Each algorithm returns larger scores for better matches,
+ * and returns INT32_MIN if a word is not found.
+ */
+int32_t match_words(
+ enum matching_algorithm algorithm,
+ const char *restrict patterns,
+ const char *restrict str)
+{
+ switch (algorithm) {
+ case MATCHING_ALGORITHM_NORMAL:
+ return simple_match_words(patterns, str);
+ case MATCHING_ALGORITHM_PREFIX:
+ return prefix_match_words(patterns, str);
+ case MATCHING_ALGORITHM_FUZZY:
+ return fuzzy_match_words(patterns, str);
+ default:
+ return INT32_MIN;
+ }
+}
+
+/*
+ * Split patterns into words, and perform simple matching against str for each.
+ * Returns the negative sum of substring distances from the start of str.
+ * If a word is not found, returns INT32_MIN.
+ */
+int32_t simple_match_words(const char *restrict patterns, const char *restrict str)
+{
+ int32_t score = 0;
+ char *saveptr = NULL;
+ char *tmp = utf8_normalize(patterns);
+ char *pattern = strtok_r(tmp, " ", &saveptr);
+ while (pattern != NULL) {
+ char *c = utf8_strcasestr(str, pattern);
+ if (c == NULL) {
+ score = INT32_MIN;
+ break;
+ } else {
+ score -= c - str;
+ }
+ pattern = strtok_r(NULL, " ", &saveptr);
+ }
+ free(tmp);
+ return score;
+}
+
+/*
+ * Split patterns into words, and perform prefix matching against str for each.
+ * Returns the negative sum of remaining string suffix lengths.
+ * If a word is not found, returns INT32_MIN.
+ */
+int32_t prefix_match_words(const char *restrict patterns, const char *restrict str)
+{
+ int32_t score = 0;
+ char *saveptr = NULL;
+ char *tmp = utf8_normalize(patterns);
+ char *pattern = strtok_r(tmp, " ", &saveptr);
+ while (pattern != NULL) {
+ char *c = utf8_strcasestr(str, pattern);
+ if (c != str) {
+ score = INT32_MIN;
+ break;
+ } else {
+ score -= utf8_strlen(str) - utf8_strlen(pattern);
+ }
+ pattern = strtok_r(NULL, " ", &saveptr);
+ }
+ free(tmp);
+ return score;
+}
+
+
+/*
+ * Split patterns into words, and return the sum of fuzzy_match(word, str).
+ * If a word is not found, returns INT32_MIN.
+ */
+int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict str)
+{
+ int32_t score = 0;
+ char *saveptr = NULL;
+ char *tmp = utf8_normalize(patterns);
+ char *pattern = strtok_r(tmp, " ", &saveptr);
+ while (pattern != NULL) {
+ int32_t word_score = fuzzy_match(pattern, str);
+ if (word_score == INT32_MIN) {
+ score = INT32_MIN;
+ break;
+ } else {
+ score += word_score;
+ }
+ pattern = strtok_r(NULL, " ", &saveptr);
+ }
+ free(tmp);
+ return score;
+}
+
+/*
+ * Returns score if each character in pattern is found sequentially within str.
+ * Returns INT32_MIN otherwise.
+ */
+int32_t fuzzy_match(const char *restrict pattern, const char *restrict str)
+{
+ const int unmatched_letter_penalty = -1;
+ const size_t slen = utf8_strlen(str);
+ const size_t plen = utf8_strlen(pattern);
+ int32_t score = 0;
+
+ if (*pattern == '\0') {
+ return score;
+ }
+ if (slen < plen) {
+ return INT32_MIN;
+ }
+
+ /* We can already penalise any unused letters. */
+ score += unmatched_letter_penalty * (int32_t)(slen - plen);
+
+ /*
+ * If the string is more than 100 characters, just find the first fuzzy
+ * match rather than the best.
+ *
+ * This is required as the number of possible matches (for patterns and
+ * strings all consisting of one letter) scales something like:
+ *
+ * slen! / (plen! (slen - plen)!) ~ slen^plen for plen << slen
+ *
+ * This quickly grinds everything to a halt. 100 is chosen fairly
+ * arbitrarily from the following logic:
+ *
+ * - e is the most common character in English, at around 13% of
+ * letters. Depending on the context, let's say this be up to 20%.
+ * - 100 * 0.20 = 20 repeats of the same character.
+ * - In the worst case here, 20! / (10! 10!) ~200,000 possible matches,
+ * which is "slow but not frozen" for my machine.
+ *
+ * In reality, this worst case shouldn't be hit, and finding the "best"
+ * fuzzy match in lines of text > 100 characters isn't really in scope
+ * for a dmenu clone.
+ */
+ bool first_match_only = slen > 100;
+
+ /* Perform the match. */
+ score = fuzzy_match_recurse(pattern, str, score, first_match_only, true);
+
+ return score;
+}
+
+/*
+ * Recursively match the whole of pattern against str.
+ * The score parameter is the score of the previously matched character.
+ *
+ * This reaches a maximum recursion depth of strlen(pattern) + 1. However, the
+ * stack usage is small (the maximum I've seen on x86_64 is 144 bytes with
+ * gcc -O3), so this shouldn't matter unless pattern contains thousands of
+ * characters.
+ */
+int32_t fuzzy_match_recurse(
+ const char *restrict pattern,
+ const char *restrict str,
+ int32_t score,
+ bool first_match_only,
+ bool first_char)
+{
+ if (*pattern == '\0') {
+ /* We've matched the full pattern. */
+ return score;
+ }
+
+ const char *match = str;
+ uint32_t search = utf8_to_utf32(pattern);
+
+ int32_t best_score = INT32_MIN;
+
+ /*
+ * Find all occurrences of the next pattern character in str, and
+ * recurse on them.
+ */
+ while ((match = utf8_strcasechr(match, search)) != NULL) {
+ int32_t jump = 0;
+ for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) {
+ jump++;
+ }
+ int32_t subscore = fuzzy_match_recurse(
+ utf8_next_char(pattern),
+ utf8_next_char(match),
+ compute_score(jump, first_char, match),
+ first_match_only,
+ false);
+ best_score = MAX(best_score, subscore);
+ match = utf8_next_char(match);
+
+ if (first_match_only) {
+ break;
+ }
+ }
+
+ if (best_score == INT32_MIN) {
+ /* We couldn't match the rest of the pattern. */
+ return INT32_MIN;
+ } else {
+ return score + best_score;
+ }
+}
+
+/*
+ * Calculate the score for a single matching letter.
+ * The scoring system is taken from fts_fuzzy_match v0.2.0 by Forrest Smith,
+ * which is licensed to the public domain.
+ *
+ * The factors affecting score are:
+ * - Bonuses:
+ * - If there are multiple adjacent matches.
+ * - If a match occurs after a separator character.
+ * - If a match is uppercase, and the previous character is lowercase.
+ *
+ * - Penalties:
+ * - If there are letters before the first match.
+ * - If there are superfluous characters in str (already accounted for).
+ */
+int32_t compute_score(int32_t jump, bool first_char, const char *restrict match)
+{
+ const int adjacency_bonus = 15;
+ const int separator_bonus = 30;
+ const int camel_bonus = 30;
+ const int first_letter_bonus = 15;
+
+ const int leading_letter_penalty = -5;
+ const int max_leading_letter_penalty = -15;
+
+ int32_t score = 0;
+
+ const uint32_t cur = utf8_to_utf32(match);
+
+ /* Apply bonuses. */
+ if (!first_char && jump == 0) {
+ score += adjacency_bonus;
+ }
+ if (!first_char || jump > 0) {
+ const uint32_t prev = utf8_to_utf32(utf8_prev_char(match));
+ if (utf32_isupper(cur) && utf32_islower(prev)) {
+ score += camel_bonus;
+ }
+ if (utf32_isalnum(cur) && !utf32_isalnum(prev)) {
+ score += separator_bonus;
+ }
+ }
+ if (first_char && jump == 0) {
+ /* Match at start of string gets separator bonus. */
+ score += first_letter_bonus;
+ }
+
+ /* Apply penalties. */
+ if (first_char) {
+ score += MAX(leading_letter_penalty * jump,
+ max_leading_letter_penalty);
+ }
+
+ return score;
+}