summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Phil Jones <philj56@gmail.com> 2022-10-18 19:33:41 +0100
committer Phil Jones <philj56@gmail.com> 2022-10-18 19:33:41 +0100
commit 5482f0be746a98bdd6b2c54183b54dd2ff2a0192 (patch)
tree dc58c6a1c486432f853bd3a5f9f7c78767c7292a
parent 8872f664671711b97e02fe97f31746b5e158e627 (diff)
Improve UTF-8 handling.
This should allow case-insensitive matching for non-Latin characters, and fix matching for characters with diacritics.
-rw-r--r-- meson.build 3
-rw-r--r-- src/desktop_vec.c 6
-rw-r--r-- src/entry_backend/harfbuzz.c 3
-rw-r--r-- src/entry_backend/pango.c 3
-rw-r--r-- src/fuzzy_match.c 32
-rw-r--r-- src/main.c 1
-rw-r--r-- src/string_vec.c 6
-rw-r--r-- src/utf8.c 92
-rw-r--r-- src/utf8.h 22
9 files changed, 152 insertions, 16 deletions
diff --git a/meson.build b/meson.build
index 9f2dcec..97f8eae 100644
--- a/meson.build
+++ b/meson.build
@@ -108,6 +108,7 @@ tofi_sources = files(
'src/shm.c',
'src/string_vec.c',
'src/surface.c',
+ 'src/utf8.c',
'src/wlr-layer-shell-unstable-v1.c',
'src/xmalloc.c',
)
@@ -119,6 +120,7 @@ compgen_sources = files(
'src/log.c',
'src/mkdirp.c',
'src/string_vec.c',
+ 'src/utf8.c',
'src/xmalloc.c'
)
@@ -183,6 +185,7 @@ executable(
executable(
'tofi-compgen',
compgen_sources,
+ dependencies: [glib],
install: false
)
diff --git a/src/desktop_vec.c b/src/desktop_vec.c
index df4218b..f559551 100644
--- a/src/desktop_vec.c
+++ b/src/desktop_vec.c
@@ -4,6 +4,7 @@
#include "fuzzy_match.h"
#include "log.h"
#include "string_vec.h"
+#include "utf8.h"
#include "xmalloc.h"
static bool match_current_desktop(char * const *desktop_list, gsize length);
@@ -42,7 +43,10 @@ void desktop_vec_add(
vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0]));
}
vec->buf[vec->count].id = xstrdup(id);
- vec->buf[vec->count].name = xstrdup(name);
+ vec->buf[vec->count].name = utf8_normalize(name);
+ if (vec->buf[vec->count].name == NULL) {
+ vec->buf[vec->count].name = xstrdup(name);
+ }
vec->buf[vec->count].path = xstrdup(path);
vec->buf[vec->count].keywords = xstrdup(keywords);
vec->buf[vec->count].search_score = 0;
diff --git a/src/entry_backend/harfbuzz.c b/src/entry_backend/harfbuzz.c
index c5fc07e..734d305 100644
--- a/src/entry_backend/harfbuzz.c
+++ b/src/entry_backend/harfbuzz.c
@@ -5,6 +5,7 @@
#include "../entry.h"
#include "../log.h"
#include "../nelem.h"
+#include "../utf8.h"
#include "../xmalloc.h"
/*
@@ -368,7 +369,7 @@ void entry_backend_harfbuzz_update(struct entry *entry)
char *postmatch = NULL;
cairo_text_extents_t subextents;
if (entry->input_mb_length > 0 && entry->selection_highlight_color.a != 0) {
- char *match_pos = strcasestr(prematch, entry->input_mb);
+ char *match_pos = utf8_strcasestr(prematch, entry->input_mb);
if (match_pos != NULL) {
match = xstrdup(result);
prematch_len = (match_pos - prematch);
diff --git a/src/entry_backend/pango.c b/src/entry_backend/pango.c
index 1f19bce..1cc7628 100644
--- a/src/entry_backend/pango.c
+++ b/src/entry_backend/pango.c
@@ -4,6 +4,7 @@
#include "../entry.h"
#include "../log.h"
#include "../nelem.h"
+#include "../utf8.h"
#include "../xmalloc.h"
#undef MAX
@@ -181,7 +182,7 @@ void entry_backend_pango_update(struct entry *entry)
PangoRectangle ink_subrect;
PangoRectangle logical_subrect;
if (entry->input_mb_length > 0 && entry->selection_highlight_color.a != 0) {
- char *match_pos = strcasestr(str, entry->input_mb);
+ char *match_pos = utf8_strcasestr(str, entry->input_mb);
if (match_pos != NULL) {
prematch_len = (match_pos - str);
postmatch_len = strlen(str) - prematch_len - match_len;
diff --git a/src/fuzzy_match.c b/src/fuzzy_match.c
index 94c60e8..b52aa13 100644
--- a/src/fuzzy_match.c
+++ b/src/fuzzy_match.c
@@ -5,6 +5,7 @@
#include <string.h>
#include "fuzzy_match.h"
+#include "utf8.h"
#include "xmalloc.h"
#undef MAX
@@ -30,10 +31,10 @@ int32_t fuzzy_match_simple_words(const char *restrict patterns, const char *rest
{
int32_t score = 0;
char *saveptr = NULL;
- char *tmp = xstrdup(patterns);
+ char *tmp = utf8_normalize(patterns);
char *pattern = strtok_r(tmp, " ", &saveptr);
while (pattern != NULL) {
- char *c = strcasestr(str, pattern);
+ char *c = utf8_strcasestr(str, pattern);
if (c == NULL) {
score = INT32_MIN;
break;
@@ -55,7 +56,7 @@ int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict st
{
int32_t score = 0;
char *saveptr = NULL;
- char *tmp = xstrdup(patterns);
+ char *tmp = utf8_normalize(patterns);
char *pattern = strtok_r(tmp, " ", &saveptr);
while (pattern != NULL) {
int32_t word_score = fuzzy_match(pattern, str);
@@ -78,8 +79,8 @@ int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict st
int32_t fuzzy_match(const char *restrict pattern, const char *restrict str)
{
const int unmatched_letter_penalty = -1;
- const size_t slen = strlen(str);
- const size_t plen = strlen(pattern);
+ const size_t slen = utf8_strlen(str);
+ const size_t plen = utf8_strlen(pattern);
int32_t score = 0;
if (*pattern == '\0') {
@@ -119,7 +120,7 @@ int32_t fuzzy_match_recurse(
}
const char *match = str;
- const char search[2] = { *pattern, '\0' };
+ uint32_t search = utf8_get_char(pattern);
int32_t best_score = INT32_MIN;
@@ -127,11 +128,15 @@ int32_t fuzzy_match_recurse(
* Find all occurrences of the next pattern character in str, and
* recurse on them.
*/
- while ((match = strcasestr(match, search)) != NULL) {
+ while ((match = utf8_strcasechr(match, search)) != NULL) {
+ int32_t jump = 0;
+ for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) {
+ jump++;
+ }
int32_t subscore = fuzzy_match_recurse(
- pattern + 1,
- match + 1,
- compute_score(match - str, first_char, match),
+ utf8_next_char(pattern),
+ utf8_next_char(match),
+ compute_score(jump, first_char, match),
false);
best_score = MAX(best_score, subscore);
match++;
@@ -172,15 +177,18 @@ int32_t compute_score(int32_t jump, bool first_char, const char *restrict match)
int32_t score = 0;
+ const uint32_t cur = utf8_get_char(match);
+
/* Apply bonuses. */
if (!first_char && jump == 0) {
score += adjacency_bonus;
}
if (!first_char || jump > 0) {
- if (isupper(*match) && islower(*(match - 1))) {
+ const uint32_t prev = utf8_get_char(utf8_prev_char(match));
+ if (utf8_isupper(cur) && utf8_islower(prev)) {
score += camel_bonus;
}
- if (isalnum(*match) && !isalnum(*(match - 1))) {
+ if (utf8_isalnum(cur) && !utf8_isalnum(prev)) {
score += separator_bonus;
}
}
diff --git a/src/main.c b/src/main.c
index 212211d..043f6a8 100644
--- a/src/main.c
+++ b/src/main.c
@@ -25,6 +25,7 @@
#include "nelem.h"
#include "shm.h"
#include "string_vec.h"
+#include "string_vec.h"
#include "xmalloc.h"
#undef MAX
diff --git a/src/string_vec.c b/src/string_vec.c
index 50dd813..2ef40a2 100644
--- a/src/string_vec.c
+++ b/src/string_vec.c
@@ -6,6 +6,7 @@
#include <sys/mman.h>
#include "fuzzy_match.h"
#include "string_vec.h"
+#include "utf8.h"
#include "xmalloc.h"
static int cmpstringp(const void *restrict a, const void *restrict b)
@@ -80,7 +81,10 @@ void string_vec_add(struct string_vec *restrict vec, const char *restrict str)
vec->size *= 2;
vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0]));
}
- vec->buf[vec->count].string = xstrdup(str);
+ vec->buf[vec->count].string = utf8_normalize(str);
+ if (vec->buf[vec->count].string == NULL) {
+ vec->buf[vec->count].string = xstrdup(str);
+ }
vec->buf[vec->count].search_score = 0;
vec->buf[vec->count].history_score = 0;
vec->count++;
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..7ed6046
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,92 @@
+#include <string.h>
+
+#include "utf8.h"
+
+/* Thin wrapper: returns the result of g_unichar_isupper() for code point c. */
+uint32_t utf8_isupper(uint32_t c)
+{
+	const gboolean is_upper = g_unichar_isupper(c);
+
+	return is_upper;
+}
+
+/* Thin wrapper: returns the result of g_unichar_islower() for code point c. */
+uint32_t utf8_islower(uint32_t c)
+{
+	const gboolean is_lower = g_unichar_islower(c);
+
+	return is_lower;
+}
+
+/* Thin wrapper: returns the result of g_unichar_isalnum() for code point c. */
+uint32_t utf8_isalnum(uint32_t c)
+{
+	const gboolean is_alnum = g_unichar_isalnum(c);
+
+	return is_alnum;
+}
+
+/* Thin wrapper: maps code point c through g_unichar_toupper(). */
+uint32_t utf8_toupper(uint32_t c)
+{
+	const gunichar upper = g_unichar_toupper(c);
+
+	return upper;
+}
+
+/* Thin wrapper: maps code point c through g_unichar_tolower(). */
+uint32_t utf8_tolower(uint32_t c)
+{
+	const gunichar lower = g_unichar_tolower(c);
+
+	return lower;
+}
+
+/* Decode the code point that starts at s, using g_utf8_get_char(). */
+uint32_t utf8_get_char(const char *s)
+{
+	const gunichar code_point = g_utf8_get_char(s);
+
+	return code_point;
+}
+
+/* Return a pointer to the character after the one at s (g_utf8_next_char()). */
+char *utf8_next_char(const char *s)
+{
+	char *following = g_utf8_next_char(s);
+
+	return following;
+}
+
+/*
+ * Return a pointer to the character before the one at s (g_utf8_prev_char()).
+ * No start-of-string bound is passed, so s must not be the first character.
+ */
+char *utf8_prev_char(const char *s)
+{
+	char *preceding = g_utf8_prev_char(s);
+
+	return preceding;
+}
+
+/*
+ * Find code point c in the NUL-terminated UTF-8 string s by delegating to
+ * g_utf8_strchr() and returning its result.
+ */
+char *utf8_strchr(const char *s, uint32_t c)
+{
+	/* A length of -1 tells GLib that s is NUL-terminated. */
+	const gssize nul_terminated = -1;
+
+	return g_utf8_strchr(s, nul_terminated, c);
+}
+
+/*
+ * Case-insensitive search for a single code point.
+ *
+ * Both c and each character of s are passed through g_unichar_tolower()
+ * before being compared. Returns a pointer to the first matching character
+ * in s, or NULL when the terminating NUL is reached without a match.
+ */
+char *utf8_strcasechr(const char *s, uint32_t c)
+{
+	const gunichar wanted = g_unichar_tolower(c);
+
+	for (const char *cursor = s; *cursor != '\0'; cursor = g_utf8_next_char(cursor)) {
+		if (g_unichar_tolower(g_utf8_get_char(cursor)) == wanted) {
+			return (char *)cursor;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Length of the NUL-terminated UTF-8 string s as reported by g_utf8_strlen(),
+ * i.e. counted in characters rather than bytes.
+ */
+size_t utf8_strlen(const char *s)
+{
+	const glong char_count = g_utf8_strlen(s, -1);
+
+	return char_count;
+}
+
+/*
+ * Case-insensitive substring search for UTF-8 strings.
+ *
+ * Both strings are casefolded with g_utf8_casefold() and the folded needle is
+ * searched for in the folded haystack with strstr().
+ *
+ * Returns a pointer into the original haystack at the character where the
+ * match begins, or NULL if needle does not occur. If the match begins part
+ * way through the folded expansion of a single character, the pointer refers
+ * to that character.
+ */
+char *utf8_strcasestr(const char * restrict haystack, const char * restrict needle)
+{
+	char *h = g_utf8_casefold(haystack, -1);
+	char *n = g_utf8_casefold(needle, -1);
+	char *ret = NULL;
+
+	const char *cmp = strstr(h, n);
+	if (cmp != NULL) {
+		/*
+		 * Folding can change a character's encoded length (e.g. U+00DF
+		 * folds to "ss"), so the byte offset found in the folded copy
+		 * cannot be applied to haystack directly. Instead, walk
+		 * haystack one character at a time, adding up the folded size
+		 * of each character, until we reach the one covering the match.
+		 * This relies on g_utf8_casefold() folding each character
+		 * independently of its neighbours.
+		 */
+		const size_t target = (size_t)(cmp - h);
+		size_t folded = 0;
+		const char *p = haystack;
+
+		while (*p != '\0') {
+			const char *next = g_utf8_next_char(p);
+			size_t len = 1;
+
+			/* ASCII always folds to one byte; skip the allocation. */
+			if ((unsigned char)*p >= 0x80) {
+				char *f = g_utf8_casefold(p, next - p);
+				len = strlen(f);
+				g_free(f);
+			}
+			if (folded + len > target) {
+				break;
+			}
+			folded += len;
+			p = next;
+		}
+		ret = (char *)p;
+	}
+
+	/* GLib allocations are released with g_free(), not free(). */
+	g_free(h);
+	g_free(n);
+
+	return ret;
+}
+
+/*
+ * Normalize the NUL-terminated string s with g_utf8_normalize() in
+ * G_NORMALIZE_DEFAULT mode and return GLib's result unchanged. The result
+ * may be NULL, so callers should check before using it.
+ */
+char *utf8_normalize(const char *s)
+{
+	const GNormalizeMode mode = G_NORMALIZE_DEFAULT;
+
+	return g_utf8_normalize(s, -1, mode);
+}
diff --git a/src/utf8.h b/src/utf8.h
new file mode 100644
index 0000000..b6ee986
--- /dev/null
+++ b/src/utf8.h
@@ -0,0 +1,22 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+#include <glib.h>
+#include <stdint.h>
+
+uint32_t utf8_isupper(uint32_t c); /* result of g_unichar_isupper(c) */
+uint32_t utf8_islower(uint32_t c); /* result of g_unichar_islower(c) */
+uint32_t utf8_isalnum(uint32_t c); /* result of g_unichar_isalnum(c) */
+uint32_t utf8_toupper(uint32_t c); /* result of g_unichar_toupper(c) */
+uint32_t utf8_tolower(uint32_t c); /* result of g_unichar_tolower(c) */
+
+uint32_t utf8_get_char(const char *s); /* code point starting at s, via g_utf8_get_char() */
+char *utf8_next_char(const char *s); /* wraps g_utf8_next_char() */
+char *utf8_prev_char(const char *s); /* wraps g_utf8_prev_char() */
+char *utf8_strchr(const char *s, uint32_t c); /* g_utf8_strchr() over the whole NUL-terminated string */
+char *utf8_strcasechr(const char *s, uint32_t c); /* first character equal to c after g_unichar_tolower() on both, or NULL */
+size_t utf8_strlen(const char *s); /* g_utf8_strlen() of the whole NUL-terminated string */
+char *utf8_strcasestr(const char * restrict haystack, const char * restrict needle); /* searches casefolded copies of both strings; NULL if needle is absent */
+char *utf8_normalize(const char *s); /* g_utf8_normalize() with G_NORMALIZE_DEFAULT */
+
+#endif /* UTF8_H */