Improve UTF-8 handling.

This should allow case-insensitive matching for non-Latin characters, and fix matching for characters with diacritics.
author: Phil Jones <philj56@gmail.com> 2022-10-18 19:33:41 +0100
committer: Phil Jones <philj56@gmail.com> 2022-10-18 19:33:41 +0100
commit: 5482f0be746a98bdd6b2c54183b54dd2ff2a0192 (patch)
tree: dc58c6a1c486432f853bd3a5f9f7c78767c7292a
parent: 8872f664671711b97e02fe97f31746b5e158e627 (diff)
9 files changed, 152 insertions, 16 deletions
diff --git a/meson.build b/meson.build
index 9f2dcec..97f8eae 100644
--- a/meson.build
+++ b/meson.build
@@ -108,6 +108,7 @@ tofi_sources = files(
   'src/shm.c',
   'src/string_vec.c',
   'src/surface.c',
+  'src/utf8.c',
   'src/wlr-layer-shell-unstable-v1.c',
   'src/xmalloc.c',
 )
@@ -119,6 +120,7 @@ compgen_sources = files(
   'src/log.c',
   'src/mkdirp.c',
   'src/string_vec.c',
+  'src/utf8.c',
   'src/xmalloc.c'
 )
 
@@ -183,6 +185,7 @@ executable(
 executable(
   'tofi-compgen',
   compgen_sources,
+  dependencies: [glib],
   install: false
 )
 
diff --git a/src/desktop_vec.c b/src/desktop_vec.c
index df4218b..f559551 100644
--- a/src/desktop_vec.c
+++ b/src/desktop_vec.c
@@ -4,6 +4,7 @@
 #include "fuzzy_match.h"
 #include "log.h"
 #include "string_vec.h"
+#include "utf8.h"
 #include "xmalloc.h"
 
 static bool match_current_desktop(char * const *desktop_list, gsize length);
@@ -42,7 +43,10 @@ void desktop_vec_add(
 		vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0]));
 	}
 	vec->buf[vec->count].id = xstrdup(id);
-	vec->buf[vec->count].name = xstrdup(name);
+	vec->buf[vec->count].name = utf8_normalize(name);
+	if (vec->buf[vec->count].name == NULL) {
+		vec->buf[vec->count].name = xstrdup(name);
+	}
 	vec->buf[vec->count].path = xstrdup(path);
 	vec->buf[vec->count].keywords = xstrdup(keywords);
 	vec->buf[vec->count].search_score = 0;
diff --git a/src/entry_backend/harfbuzz.c b/src/entry_backend/harfbuzz.c
index c5fc07e..734d305 100644
--- a/src/entry_backend/harfbuzz.c
+++ b/src/entry_backend/harfbuzz.c
@@ -5,6 +5,7 @@
 #include "../entry.h"
 #include "../log.h"
 #include "../nelem.h"
+#include "../utf8.h"
 #include "../xmalloc.h"
 
 /*
@@ -368,7 +369,7 @@ void entry_backend_harfbuzz_update(struct entry *entry)
 			char *postmatch = NULL;
 			cairo_text_extents_t subextents;
 			if (entry->input_mb_length > 0 && entry->selection_highlight_color.a != 0) {
-				char *match_pos = strcasestr(prematch, entry->input_mb);
+				char *match_pos = utf8_strcasestr(prematch, entry->input_mb);
 				if (match_pos != NULL) {
 					match = xstrdup(result);
 					prematch_len = (match_pos - prematch);
diff --git a/src/entry_backend/pango.c b/src/entry_backend/pango.c
index 1f19bce..1cc7628 100644
--- a/src/entry_backend/pango.c
+++ b/src/entry_backend/pango.c
@@ -4,6 +4,7 @@
 #include "../entry.h"
 #include "../log.h"
 #include "../nelem.h"
+#include "../utf8.h"
 #include "../xmalloc.h"
 
 #undef MAX
@@ -181,7 +182,7 @@ void entry_backend_pango_update(struct entry *entry)
 			PangoRectangle ink_subrect;
 			PangoRectangle logical_subrect;
 			if (entry->input_mb_length > 0 && entry->selection_highlight_color.a != 0) {
-				char *match_pos = strcasestr(str, entry->input_mb);
+				char *match_pos = utf8_strcasestr(str, entry->input_mb);
 				if (match_pos != NULL) {
 					prematch_len = (match_pos - str);
 					postmatch_len = strlen(str) - prematch_len - match_len;
diff --git a/src/fuzzy_match.c b/src/fuzzy_match.c
index 94c60e8..b52aa13 100644
--- a/src/fuzzy_match.c
+++ b/src/fuzzy_match.c
@@ -5,6 +5,7 @@
 #include <string.h>
 
 #include "fuzzy_match.h"
+#include "utf8.h"
 #include "xmalloc.h"
 
 #undef MAX
@@ -30,10 +31,10 @@ int32_t fuzzy_match_simple_words(const char *restrict patterns, const char *rest
 {
 	int32_t score = 0;
 	char *saveptr = NULL;
-	char *tmp = xstrdup(patterns);
+	char *tmp = utf8_normalize(patterns);
 	char *pattern = strtok_r(tmp, " ", &saveptr);
 	while (pattern != NULL) {
-		char *c = strcasestr(str, pattern);
+		char *c = utf8_strcasestr(str, pattern);
 		if (c == NULL) {
 			score = INT32_MIN;
 			break;
@@ -55,7 +56,7 @@ int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict st
 {
 	int32_t score = 0;
 	char *saveptr = NULL;
-	char *tmp = xstrdup(patterns);
+	char *tmp = utf8_normalize(patterns);
 	char *pattern = strtok_r(tmp, " ", &saveptr);
 	while (pattern != NULL) {
 		int32_t word_score = fuzzy_match(pattern, str);
@@ -78,8 +79,8 @@ int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict st
 int32_t fuzzy_match(const char *restrict pattern, const char *restrict str)
 {
 	const int unmatched_letter_penalty = -1;
-	const size_t slen = strlen(str);
-	const size_t plen = strlen(pattern);
+	const size_t slen = utf8_strlen(str);
+	const size_t plen = utf8_strlen(pattern);
 	int32_t score = 0;
 
 	if (*pattern == '\0') {
@@ -119,7 +120,7 @@ int32_t fuzzy_match_recurse(
 	}
 
 	const char *match = str;
-	const char search[2] = { *pattern, '\0' };
+	uint32_t search = utf8_get_char(pattern);
 
 	int32_t best_score = INT32_MIN;
 
@@ -127,11 +128,15 @@ int32_t fuzzy_match_recurse(
 	 * Find all occurrences of the next pattern character in str, and
 	 * recurse on them.
 	 */
-	while ((match = strcasestr(match, search)) != NULL) {
+	while ((match = utf8_strcasechr(match, search)) != NULL) {
+		int32_t jump = 0;
+		for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) {
+			jump++;
+		}
 		int32_t subscore = fuzzy_match_recurse(
-				pattern + 1,
-				match + 1,
-				compute_score(match - str, first_char, match),
+				utf8_next_char(pattern),
+				utf8_next_char(match),
+				compute_score(jump, first_char, match),
 				false);
 		best_score = MAX(best_score, subscore);
 		match++;
@@ -172,15 +177,18 @@ int32_t compute_score(int32_t jump, bool first_char, const char *restrict match)
 
 	int32_t score = 0;
 
+	const uint32_t cur = utf8_get_char(match);
+
 	/* Apply bonuses. */
 	if (!first_char && jump == 0) {
 		score += adjacency_bonus;
 	}
 	if (!first_char || jump > 0) {
-		if (isupper(*match) && islower(*(match - 1))) {
+		const uint32_t prev = utf8_get_char(utf8_prev_char(match));
+		if (utf8_isupper(cur) && utf8_islower(prev)) {
 			score += camel_bonus;
 		}
-		if (isalnum(*match) && !isalnum(*(match - 1))) {
+		if (utf8_isalnum(cur) && !utf8_isalnum(prev)) {
 			score += separator_bonus;
 		}
 	}
diff --git a/src/main.c b/src/main.c
index 212211d..043f6a8 100644
--- a/src/main.c
+++ b/src/main.c
@@ -25,6 +25,7 @@
 #include "nelem.h"
 #include "shm.h"
 #include "string_vec.h"
+#include "string_vec.h"
 #include "xmalloc.h"
 
 #undef MAX
diff --git a/src/string_vec.c b/src/string_vec.c
index 50dd813..2ef40a2 100644
--- a/src/string_vec.c
+++ b/src/string_vec.c
@@ -6,6 +6,7 @@
 #include <sys/mman.h>
 #include "fuzzy_match.h"
 #include "string_vec.h"
+#include "utf8.h"
 #include "xmalloc.h"
 
 static int cmpstringp(const void *restrict a, const void *restrict b)
@@ -80,7 +81,10 @@ void string_vec_add(struct string_vec *restrict vec, const char *restrict str)
 		vec->size *= 2;
 		vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0]));
 	}
-	vec->buf[vec->count].string = xstrdup(str);
+	vec->buf[vec->count].string = utf8_normalize(str);
+	if (vec->buf[vec->count].string == NULL) {
+		vec->buf[vec->count].string = xstrdup(str);
+	}
 	vec->buf[vec->count].search_score = 0;
 	vec->buf[vec->count].history_score = 0;
 	vec->count++;
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..7ed6046
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,92 @@
+#include <string.h>
+
+#include "utf8.h"
+
+/*
+ * Thin wrapper around GLib's g_unichar_isupper(): the result is nonzero when
+ * GLib classifies the Unicode code point c as uppercase.
+ */
+uint32_t utf8_isupper(uint32_t c)
+{
+	return g_unichar_isupper(c);
+}
+
+/*
+ * Thin wrapper around GLib's g_unichar_islower(): the result is nonzero when
+ * GLib classifies the Unicode code point c as lowercase.
+ */
+uint32_t utf8_islower(uint32_t c)
+{
+	return g_unichar_islower(c);
+}
+
+/*
+ * Thin wrapper around GLib's g_unichar_isalnum(): the result is nonzero when
+ * GLib classifies the Unicode code point c as a letter or a digit.
+ */
+uint32_t utf8_isalnum(uint32_t c)
+{
+	return g_unichar_isalnum(c);
+}
+
+/*
+ * Thin wrapper around GLib's g_unichar_toupper(): returns the uppercase
+ * mapping of the Unicode code point c as determined by GLib.
+ */
+uint32_t utf8_toupper(uint32_t c)
+{
+	return g_unichar_toupper(c);
+}
+
+/*
+ * Thin wrapper around GLib's g_unichar_tolower(): returns the lowercase
+ * mapping of the Unicode code point c as determined by GLib.
+ */
+uint32_t utf8_tolower(uint32_t c)
+{
+	return g_unichar_tolower(c);
+}
+
+/*
+ * Decode the Unicode code point encoded at the start of s, using
+ * g_utf8_get_char(). GLib performs no validation here, so s is expected to
+ * point at the first byte of a valid UTF-8 sequence.
+ */
+uint32_t utf8_get_char(const char *s)
+{
+	return g_utf8_get_char(s);
+}
+
+/*
+ * Return a pointer to the character following the one that starts at s,
+ * using g_utf8_next_char(). GLib does no error checking here: s is expected
+ * to be valid UTF-8, and the caller is responsible for not stepping past the
+ * terminating NUL.
+ */
+char *utf8_next_char(const char *s)
+{
+	return g_utf8_next_char(s);
+}
+
+/*
+ * Return a pointer to the character preceding s, using g_utf8_prev_char().
+ *
+ * NOTE: per the GLib documentation, g_utf8_prev_char() does not know where
+ * the string begins, so s must not be the first character of its string.
+ */
+char *utf8_prev_char(const char *s)
+{
+	return g_utf8_prev_char(s);
+}
+
+/*
+ * Find the first occurrence of the code point c in the NUL-terminated UTF-8
+ * string s, using g_utf8_strchr() (the -1 length means "up to the NUL").
+ * Returns a pointer into s, or NULL if c does not occur.
+ */
+char *utf8_strchr(const char *s, uint32_t c)
+{
+	return g_utf8_strchr(s, -1, c);
+}
+
+/*
+ * Case-insensitive counterpart of utf8_strchr().
+ *
+ * Scans the NUL-terminated UTF-8 string s one character at a time and
+ * compares the lowercase form of each character (g_unichar_tolower()) with
+ * the lowercase form of c. Returns a pointer to the first matching character
+ * in s, or NULL if the end of the string is reached without a match.
+ */
+char *utf8_strcasechr(const char *s, uint32_t c)
+{
+	const uint32_t wanted = g_unichar_tolower(c);
+
+	for (const char *cur = s; *cur != '\0'; cur = g_utf8_next_char(cur)) {
+		if (g_unichar_tolower(g_utf8_get_char(cur)) == wanted) {
+			return (char *)cur;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the length of the NUL-terminated UTF-8 string s in characters (not
+ * bytes), as computed by g_utf8_strlen().
+ */
+size_t utf8_strlen(const char *s)
+{
+	return g_utf8_strlen(s, -1);
+}
+
+/*
+ * Case-insensitive substring search for UTF-8 strings.
+ *
+ * Both strings are case-folded with g_utf8_casefold() and the folded needle
+ * is searched for in the folded haystack with strstr().
+ *
+ * Returns a pointer into haystack at the character where the first match
+ * begins, or NULL if needle does not occur. An empty needle matches at the
+ * start of haystack.
+ *
+ * Both arguments are expected to be valid, NUL-terminated UTF-8.
+ */
+char *utf8_strcasestr(const char * restrict haystack, const char * restrict needle)
+{
+	char *h = g_utf8_casefold(haystack, -1);
+	char *n = g_utf8_casefold(needle, -1);
+	char *ret = NULL;
+
+	const char *cmp = (h != NULL && n != NULL) ? strstr(h, n) : NULL;
+	if (cmp != NULL) {
+		/*
+		 * Case folding can change a character's encoded length (e.g.
+		 * U+00DF folds to "ss"), so a byte offset into the folded copy
+		 * is not a valid byte offset into haystack. Walk haystack one
+		 * character at a time, counting how many folded bytes each
+		 * character produces, until we reach the character that
+		 * covers the offset of the match.
+		 */
+		const size_t target = (size_t)(cmp - h);
+		size_t folded = 0;
+		const char *p = haystack;
+		while (*p != '\0') {
+			const char *next = g_utf8_next_char(p);
+			size_t len = 1;
+			/* ASCII always folds to a single byte; skip the allocation. */
+			if ((unsigned char)*p >= 0x80) {
+				char *f = g_utf8_casefold(p, next - p);
+				len = strlen(f);
+				g_free(f);
+			}
+			if (folded + len > target) {
+				break;
+			}
+			folded += len;
+			p = next;
+		}
+		ret = (char *)p;
+	}
+
+	/* Memory returned by GLib is released with g_free(), not free(). */
+	g_free(h);
+	g_free(n);
+
+	return ret;
+}
+
+/*
+ * Return a normalized copy of the NUL-terminated UTF-8 string s, produced by
+ * g_utf8_normalize() with G_NORMALIZE_DEFAULT.
+ *
+ * The result is allocated by GLib and owned by the caller. Per the GLib
+ * documentation the result is NULL when s is not valid UTF-8, so callers
+ * must check for NULL before using it.
+ */
+char *utf8_normalize(const char *s)
+{
+	return g_utf8_normalize(s, -1, G_NORMALIZE_DEFAULT);
+}
diff --git a/src/utf8.h b/src/utf8.h
new file mode 100644
index 0000000..b6ee986
--- /dev/null
+++ b/src/utf8.h
@@ -0,0 +1,22 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+/*
+ * Wrappers around GLib's Unicode / UTF-8 functions (implemented in utf8.c).
+ * All strings are NUL-terminated UTF-8; code points are passed as uint32_t.
+ */
+
+#include <glib.h>
+#include <stdint.h>
+
+/* Classification and case mapping of a single Unicode code point. */
+uint32_t utf8_isupper(uint32_t c);
+uint32_t utf8_islower(uint32_t c);
+uint32_t utf8_isalnum(uint32_t c);
+uint32_t utf8_toupper(uint32_t c);
+uint32_t utf8_tolower(uint32_t c);
+
+/* Decode the code point at s. */
+uint32_t utf8_get_char(const char *s);
+/* Step forwards / backwards by one character. */
+char *utf8_next_char(const char *s);
+char *utf8_prev_char(const char *s);
+/* Find code point c in s, exactly or ignoring case; NULL if not found. */
+char *utf8_strchr(const char *s, uint32_t c);
+char *utf8_strcasechr(const char *s, uint32_t c);
+/* Length of s in characters rather than bytes. */
+size_t utf8_strlen(const char *s);
+/* Case-insensitive substring search; pointer into haystack, or NULL. */
+char *utf8_strcasestr(const char * restrict haystack, const char * restrict needle);
+/* Normalized copy of s from g_utf8_normalize(); may be NULL. */
+char *utf8_normalize(const char *s);
+
+#endif /* UTF8_H */
author	Phil Jones <philj56@gmail.com>	2022-10-18 19:33:41 +0100
committer	Phil Jones <philj56@gmail.com>	2022-10-18 19:33:41 +0100
commit	5482f0be746a98bdd6b2c54183b54dd2ff2a0192 (patch)
tree	dc58c6a1c486432f853bd3a5f9f7c78767c7292a
parent	8872f664671711b97e02fe97f31746b5e158e627 (diff)