summaryrefslogtreecommitdiff
path: root/src/matching.c
blob: b0d184f5f7a7e89be8a23756b5452bd7ef9356ec (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "matching.h"
#include "unicode.h"
#include "xmalloc.h"

#undef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int32_t simple_match_words(
		const char *restrict patterns,
		const char *restrict str);

static int32_t prefix_match_words(
		const char *restrict patterns,
		const char *restrict str);

static int32_t fuzzy_match_words(
		const char *restrict patterns,
		const char *restrict str);

static int32_t fuzzy_match(
		const char *restrict pattern,
		const char *restrict str);

static int32_t fuzzy_match_recurse(
		const char *restrict pattern,
		const char *restrict str,
		int32_t score,
		bool first_match_only,
		bool first_char);

static int32_t compute_score(
		int32_t jump,
		bool first_char,
		const char *restrict match);

/*
 * Select the appropriate algorithm, and return its score.
 * Each algorithm returns larger scores for better matches,
 * and returns INT32_MIN if a word is not found.
 */
int32_t match_words(
		enum matching_algorithm algorithm,
		const char *restrict patterns,
		const char *restrict str)
{
	switch (algorithm) {
		case MATCHING_ALGORITHM_NORMAL:
			return simple_match_words(patterns, str);
		case MATCHING_ALGORITHM_PREFIX:
			return prefix_match_words(patterns, str);
		case MATCHING_ALGORITHM_FUZZY:
			return fuzzy_match_words(patterns, str);
		default:
			return INT32_MIN;
	}
}

/*
 * Split patterns into words, and perform simple matching against str for each.
 * Returns the negative sum of substring distances from the start of str.
 * If a word is not found, returns INT32_MIN.
 */
int32_t simple_match_words(const char *restrict patterns, const char *restrict str)
{
	int32_t score = 0;
	char *saveptr = NULL;
	char *tmp = utf8_normalize(patterns);
	char *pattern = strtok_r(tmp, " ", &saveptr);
	while (pattern != NULL) {
		char *c = utf8_strcasestr(str, pattern);
		if (c == NULL) {
			score = INT32_MIN;
			break;
		} else {
			score -= c - str;
		}
		pattern = strtok_r(NULL, " ", &saveptr);
	}
	free(tmp);
	return score;
}

/*
 * Split patterns into words, and perform prefix matching against str for each.
 * Returns the negative sum of remaining string suffix lengths.
 * If a word is not found, returns INT32_MIN.
 */
int32_t prefix_match_words(const char *restrict patterns, const char *restrict str)
{
	int32_t score = 0;
	char *saveptr = NULL;
	char *tmp = utf8_normalize(patterns);
	char *pattern = strtok_r(tmp, " ", &saveptr);
	while (pattern != NULL) {
		char *c = utf8_strcasestr(str, pattern);
		if (c != str) {
			score = INT32_MIN;
			break;
		} else {
			score -= utf8_strlen(str) - utf8_strlen(pattern);
		}
		pattern = strtok_r(NULL, " ", &saveptr);
	}
	free(tmp);
	return score;
}


/*
 * Split patterns into words, and return the sum of fuzzy_match(word, str).
 * If a word is not found, returns INT32_MIN.
 */
int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict str)
{
	int32_t score = 0;
	char *saveptr = NULL;
	char *tmp = utf8_normalize(patterns);
	char *pattern = strtok_r(tmp, " ", &saveptr);
	while (pattern != NULL) {
		int32_t word_score = fuzzy_match(pattern, str);
		if (word_score == INT32_MIN) {
			score = INT32_MIN;
			break;
		} else {
			score += word_score;
		}
		pattern = strtok_r(NULL, " ", &saveptr);
	}
	free(tmp);
	return score;
}

/*
 * Returns score if each character in pattern is found sequentially within str.
 * Returns INT32_MIN otherwise.
 */
int32_t fuzzy_match(const char *restrict pattern, const char *restrict str)
{
	const int unmatched_letter_penalty = -1;
	const size_t slen = utf8_strlen(str);
	const size_t plen = utf8_strlen(pattern);
	int32_t score = 0;

	if (*pattern == '\0') {
		return score;
	}
	if (slen < plen) {
		return INT32_MIN;
	}

	/* We can already penalise any unused letters. */
	score += unmatched_letter_penalty * (int32_t)(slen - plen);

        /*
         * If the string is more than 100 characters, just find the first fuzzy
         * match rather than the best.
         *
         * This is required as the number of possible matches (for patterns and
         * strings all consisting of one letter) scales something like:
         *
         * slen! / (plen! (slen - plen)!)  ~  slen^plen for plen << slen
         *
         * This quickly grinds everything to a halt. 100 is chosen fairly
         * arbitrarily from the following logic:
         *
         * - e is the most common character in English, at around 13% of
         *   letters. Depending on the context, let's say this be up to 20%.
         * - 100 * 0.20 = 20 repeats of the same character.
         * - In the worst case here, 20! / (10! 10!) ~200,000 possible matches,
         *   which is "slow but not frozen" for my machine.
         *
         * In reality, this worst case shouldn't be hit, and finding the "best"
         * fuzzy match in lines of text > 100 characters isn't really in scope
         * for a dmenu clone.
         */
        bool first_match_only = slen > 100;

	/* Perform the match. */
	score = fuzzy_match_recurse(pattern, str, score, first_match_only, true);

	return score;
}

/*
 * Recursively match the whole of pattern against str.
 * The score parameter is the score of the previously matched character.
 *
 * This reaches a maximum recursion depth of strlen(pattern) + 1. However, the
 * stack usage is small (the maximum I've seen on x86_64 is 144 bytes with
 * gcc -O3), so this shouldn't matter unless pattern contains thousands of
 * characters.
 */
int32_t fuzzy_match_recurse(
		const char *restrict pattern,
		const char *restrict str,
		int32_t score,
		bool first_match_only,
		bool first_char)
{
	if (*pattern == '\0') {
		/* We've matched the full pattern. */
		return score;
	}

	const char *match = str;
	uint32_t search = utf8_to_utf32(pattern);

	int32_t best_score = INT32_MIN;

	/*
	 * Find all occurrences of the next pattern character in str, and
	 * recurse on them.
	 */
	while ((match = utf8_strcasechr(match, search)) != NULL) {
		int32_t jump = 0;
		for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) {
			jump++;
		}
		int32_t subscore = fuzzy_match_recurse(
				utf8_next_char(pattern),
				utf8_next_char(match),
				compute_score(jump, first_char, match),
				first_match_only,
				false);
		best_score = MAX(best_score, subscore);
		match = utf8_next_char(match);

		if (first_match_only) {
			break;
		}
	}

	if (best_score == INT32_MIN) {
		/* We couldn't match the rest of the pattern. */
		return INT32_MIN;
	} else {
		return score + best_score;
	}
}

/*
 * Calculate the score for a single matching letter.
 * The scoring system is taken from fts_fuzzy_match v0.2.0 by Forrest Smith,
 * which is licensed to the public domain.
 *
 * The factors affecting score are:
 *   - Bonuses:
 *     - If there are multiple adjacent matches.
 *     - If a match occurs after a separator character.
 *     - If a match is uppercase, and the previous character is lowercase.
 *
 *   - Penalties:
 *     - If there are letters before the first match.
 *     - If there are superfluous characters in str (already accounted for).
 */
int32_t compute_score(int32_t jump, bool first_char, const char *restrict match)
{
	const int adjacency_bonus = 15;
	const int separator_bonus = 30;
	const int camel_bonus = 30;
	const int first_letter_bonus = 15;

	const int leading_letter_penalty = -5;
	const int max_leading_letter_penalty = -15;

	int32_t score = 0;

	const uint32_t cur = utf8_to_utf32(match);

	/* Apply bonuses. */
	if (!first_char && jump == 0) {
		score += adjacency_bonus;
	}
	if (!first_char || jump > 0) {
		const uint32_t prev = utf8_to_utf32(utf8_prev_char(match));
		if (utf32_isupper(cur) && utf32_islower(prev)) {
			score += camel_bonus;
		}
		if (utf32_isalnum(cur) && !utf32_isalnum(prev)) {
			score += separator_bonus;
		}
	}
	if (first_char && jump == 0) {
		/* Match at start of string gets separator bonus. */
		score += first_letter_bonus;
	}

	/* Apply penalties. */
	if (first_char) {
		score += MAX(leading_letter_penalty * jump,
				max_leading_letter_penalty);
	}

	return score;
}