diff options
author | Phil Jones <philj56@gmail.com> | 2022-10-18 19:33:41 +0100 |
---|---|---|
committer | Phil Jones <philj56@gmail.com> | 2022-10-18 19:33:41 +0100 |
commit | 5482f0be746a98bdd6b2c54183b54dd2ff2a0192 (patch) | |
tree | dc58c6a1c486432f853bd3a5f9f7c78767c7292a | |
parent | 8872f664671711b97e02fe97f31746b5e158e627 (diff) |
Improve UTF-8 handling.
This should allow case-insensitive matching for non-Latin characters,
and fix matching for characters with diacritics.
-rw-r--r-- | meson.build | 3 |
-rw-r--r-- | src/desktop_vec.c | 6 |
-rw-r--r-- | src/entry_backend/harfbuzz.c | 3 |
-rw-r--r-- | src/entry_backend/pango.c | 3 |
-rw-r--r-- | src/fuzzy_match.c | 32 |
-rw-r--r-- | src/main.c | 1 |
-rw-r--r-- | src/string_vec.c | 6 |
-rw-r--r-- | src/utf8.c | 92 |
-rw-r--r-- | src/utf8.h | 22 |
9 files changed, 152 insertions, 16 deletions
diff --git a/meson.build b/meson.build index 9f2dcec..97f8eae 100644 --- a/meson.build +++ b/meson.build @@ -108,6 +108,7 @@ tofi_sources = files( 'src/shm.c', 'src/string_vec.c', 'src/surface.c', + 'src/utf8.c', 'src/wlr-layer-shell-unstable-v1.c', 'src/xmalloc.c', ) @@ -119,6 +120,7 @@ compgen_sources = files( 'src/log.c', 'src/mkdirp.c', 'src/string_vec.c', + 'src/utf8.c', 'src/xmalloc.c' ) @@ -183,6 +185,7 @@ executable( executable( 'tofi-compgen', compgen_sources, + dependencies: [glib], install: false ) diff --git a/src/desktop_vec.c b/src/desktop_vec.c index df4218b..f559551 100644 --- a/src/desktop_vec.c +++ b/src/desktop_vec.c @@ -4,6 +4,7 @@ #include "fuzzy_match.h" #include "log.h" #include "string_vec.h" +#include "utf8.h" #include "xmalloc.h" static bool match_current_desktop(char * const *desktop_list, gsize length); @@ -42,7 +43,10 @@ void desktop_vec_add( vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0])); } vec->buf[vec->count].id = xstrdup(id); - vec->buf[vec->count].name = xstrdup(name); + vec->buf[vec->count].name = utf8_normalize(name); + if (vec->buf[vec->count].name == NULL) { + vec->buf[vec->count].name = xstrdup(name); + } vec->buf[vec->count].path = xstrdup(path); vec->buf[vec->count].keywords = xstrdup(keywords); vec->buf[vec->count].search_score = 0; diff --git a/src/entry_backend/harfbuzz.c b/src/entry_backend/harfbuzz.c index c5fc07e..734d305 100644 --- a/src/entry_backend/harfbuzz.c +++ b/src/entry_backend/harfbuzz.c @@ -5,6 +5,7 @@ #include "../entry.h" #include "../log.h" #include "../nelem.h" +#include "../utf8.h" #include "../xmalloc.h" /* @@ -368,7 +369,7 @@ void entry_backend_harfbuzz_update(struct entry *entry) char *postmatch = NULL; cairo_text_extents_t subextents; if (entry->input_mb_length > 0 && entry->selection_highlight_color.a != 0) { - char *match_pos = strcasestr(prematch, entry->input_mb); + char *match_pos = utf8_strcasestr(prematch, entry->input_mb); if (match_pos != NULL) { match = xstrdup(result); 
prematch_len = (match_pos - prematch); diff --git a/src/entry_backend/pango.c b/src/entry_backend/pango.c index 1f19bce..1cc7628 100644 --- a/src/entry_backend/pango.c +++ b/src/entry_backend/pango.c @@ -4,6 +4,7 @@ #include "../entry.h" #include "../log.h" #include "../nelem.h" +#include "../utf8.h" #include "../xmalloc.h" #undef MAX @@ -181,7 +182,7 @@ void entry_backend_pango_update(struct entry *entry) PangoRectangle ink_subrect; PangoRectangle logical_subrect; if (entry->input_mb_length > 0 && entry->selection_highlight_color.a != 0) { - char *match_pos = strcasestr(str, entry->input_mb); + char *match_pos = utf8_strcasestr(str, entry->input_mb); if (match_pos != NULL) { prematch_len = (match_pos - str); postmatch_len = strlen(str) - prematch_len - match_len; diff --git a/src/fuzzy_match.c b/src/fuzzy_match.c index 94c60e8..b52aa13 100644 --- a/src/fuzzy_match.c +++ b/src/fuzzy_match.c @@ -5,6 +5,7 @@ #include <string.h> #include "fuzzy_match.h" +#include "utf8.h" #include "xmalloc.h" #undef MAX @@ -30,10 +31,10 @@ int32_t fuzzy_match_simple_words(const char *restrict patterns, const char *rest { int32_t score = 0; char *saveptr = NULL; - char *tmp = xstrdup(patterns); + char *tmp = utf8_normalize(patterns); char *pattern = strtok_r(tmp, " ", &saveptr); while (pattern != NULL) { - char *c = strcasestr(str, pattern); + char *c = utf8_strcasestr(str, pattern); if (c == NULL) { score = INT32_MIN; break; @@ -55,7 +56,7 @@ int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict st { int32_t score = 0; char *saveptr = NULL; - char *tmp = xstrdup(patterns); + char *tmp = utf8_normalize(patterns); char *pattern = strtok_r(tmp, " ", &saveptr); while (pattern != NULL) { int32_t word_score = fuzzy_match(pattern, str); @@ -78,8 +79,8 @@ int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict st int32_t fuzzy_match(const char *restrict pattern, const char *restrict str) { const int unmatched_letter_penalty = -1; - const size_t 
slen = strlen(str); - const size_t plen = strlen(pattern); + const size_t slen = utf8_strlen(str); + const size_t plen = utf8_strlen(pattern); int32_t score = 0; if (*pattern == '\0') { @@ -119,7 +120,7 @@ int32_t fuzzy_match_recurse( } const char *match = str; - const char search[2] = { *pattern, '\0' }; + uint32_t search = utf8_get_char(pattern); int32_t best_score = INT32_MIN; @@ -127,11 +128,15 @@ int32_t fuzzy_match_recurse( * Find all occurrences of the next pattern character in str, and * recurse on them. */ - while ((match = strcasestr(match, search)) != NULL) { + while ((match = utf8_strcasechr(match, search)) != NULL) { + int32_t jump = 0; + for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) { + jump++; + } int32_t subscore = fuzzy_match_recurse( - pattern + 1, - match + 1, - compute_score(match - str, first_char, match), + utf8_next_char(pattern), + utf8_next_char(match), + compute_score(jump, first_char, match), false); best_score = MAX(best_score, subscore); match++; @@ -172,15 +177,18 @@ int32_t compute_score(int32_t jump, bool first_char, const char *restrict match) int32_t score = 0; + const uint32_t cur = utf8_get_char(match); + /* Apply bonuses. 
*/ if (!first_char && jump == 0) { score += adjacency_bonus; } if (!first_char || jump > 0) { - if (isupper(*match) && islower(*(match - 1))) { + const uint32_t prev = utf8_get_char(utf8_prev_char(match)); + if (utf8_isupper(cur) && utf8_islower(prev)) { score += camel_bonus; } - if (isalnum(*match) && !isalnum(*(match - 1))) { + if (utf8_isalnum(cur) && !utf8_isalnum(prev)) { score += separator_bonus; } } @@ -25,6 +25,7 @@ #include "nelem.h" #include "shm.h" #include "string_vec.h" +#include "string_vec.h" #include "xmalloc.h" #undef MAX diff --git a/src/string_vec.c b/src/string_vec.c index 50dd813..2ef40a2 100644 --- a/src/string_vec.c +++ b/src/string_vec.c @@ -6,6 +6,7 @@ #include <sys/mman.h> #include "fuzzy_match.h" #include "string_vec.h" +#include "utf8.h" #include "xmalloc.h" static int cmpstringp(const void *restrict a, const void *restrict b) @@ -80,7 +81,10 @@ void string_vec_add(struct string_vec *restrict vec, const char *restrict str) vec->size *= 2; vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0])); } - vec->buf[vec->count].string = xstrdup(str); + vec->buf[vec->count].string = utf8_normalize(str); + if (vec->buf[vec->count].string == NULL) { + vec->buf[vec->count].string = xstrdup(str); + } vec->buf[vec->count].search_score = 0; vec->buf[vec->count].history_score = 0; vec->count++; diff --git a/src/utf8.c b/src/utf8.c new file mode 100644 index 0000000..7ed6046 --- /dev/null +++ b/src/utf8.c @@ -0,0 +1,92 @@ +#include <string.h> + +#include "utf8.h" + +uint32_t utf8_isupper(uint32_t c) +{ + return g_unichar_isupper(c); +} + +uint32_t utf8_islower(uint32_t c) +{ + return g_unichar_islower(c); +} + +uint32_t utf8_isalnum(uint32_t c) +{ + return g_unichar_isalnum(c); +} + +uint32_t utf8_toupper(uint32_t c) +{ + return g_unichar_toupper(c); +} + +uint32_t utf8_tolower(uint32_t c) +{ + return g_unichar_tolower(c); +} + +uint32_t utf8_get_char(const char *s) +{ + return g_utf8_get_char(s); +} + +char *utf8_next_char(const char *s) +{ + 
return g_utf8_next_char(s); +} + +char *utf8_prev_char(const char *s) +{ + return g_utf8_prev_char(s); +} + +char *utf8_strchr(const char *s, uint32_t c) +{ + return g_utf8_strchr(s, -1, c); +} + +char *utf8_strcasechr(const char *s, uint32_t c) +{ + c = g_unichar_tolower(c); + + const char *p = s; + while (*p != '\0' && g_unichar_tolower(g_utf8_get_char(p)) != c) { + p = g_utf8_next_char(p); + } + if (*p == '\0') { + return NULL; + } + return (char *)p; +} + +size_t utf8_strlen(const char *s) +{ + return g_utf8_strlen(s, -1); +} + +char *utf8_strcasestr(const char * restrict haystack, const char * restrict needle) +{ + char *h = g_utf8_casefold(haystack, -1); + char *n = g_utf8_casefold(needle, -1); + + char *cmp = strstr(h, n); + char *ret; + + if (cmp == NULL) { + ret = NULL; + } else { + ret = (char *)haystack + (cmp - h); + } + + free(h); + free(n); + + return ret; +} + +char *utf8_normalize(const char *s) +{ + return g_utf8_normalize(s, -1, G_NORMALIZE_DEFAULT); +} diff --git a/src/utf8.h b/src/utf8.h new file mode 100644 index 0000000..b6ee986 --- /dev/null +++ b/src/utf8.h @@ -0,0 +1,22 @@ +#ifndef UTF8_H +#define UTF8_H + +#include <glib.h> +#include <stdint.h> + +uint32_t utf8_isupper(uint32_t c); +uint32_t utf8_islower(uint32_t c); +uint32_t utf8_isalnum(uint32_t c); +uint32_t utf8_toupper(uint32_t c); +uint32_t utf8_tolower(uint32_t c); + +uint32_t utf8_get_char(const char *s); +char *utf8_next_char(const char *s); +char *utf8_prev_char(const char *s); +char *utf8_strchr(const char *s, uint32_t c); +char *utf8_strcasechr(const char *s, uint32_t c); +size_t utf8_strlen(const char *s); +char *utf8_strcasestr(const char * restrict haystack, const char * restrict needle); +char *utf8_normalize(const char *s); + +#endif /* UTF8_H */ |