Speed up string vector filtering and loading.

We can assume that string vectors generated by tofi are already normalised, so don't waste time normalising them again when filtering or loading them. Also make string_vec_add() check that strings are valid UTF-8 before adding them, since those may come from the user.
author: Phil Jones <philj56@gmail.com> 2022-11-24 12:48:39 +0000
committer: Phil Jones <philj56@gmail.com> 2022-11-24 12:48:39 +0000
commit: 03fc67cc74cc08fff5eb61e855b54174683614ff (patch)
tree: dc8376aa977a0cff31ebc0d4283cb77cce0e934e
parent: e19d821a7ec5d8b8d528be93626bbc2effd15320 (diff)
3 files changed, 33 insertions, 2 deletions
diff --git a/src/string_vec.c b/src/string_vec.c
index 8c59607..342fd9f 100644
--- a/src/string_vec.c
+++ b/src/string_vec.c
@@ -83,6 +83,9 @@ struct string_vec string_vec_copy(const struct string_vec *restrict vec)
 
 void string_vec_add(struct string_vec *restrict vec, const char *restrict str)
 {
+	if (!utf8_validate(str)) {
+		return;
+	}
 	if (vec->count == vec->size) {
 		vec->size *= 2;
 		vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0]));
@@ -96,6 +99,19 @@ void string_vec_add(struct string_vec *restrict vec, const char *restrict str)
 	vec->count++;
 }
 
+/* Like string_vec_add(), but skips UTF-8 validation and stores a copy of str as-is, assuming it is already normalized. */
+static void string_vec_add_normalized(struct string_vec *restrict vec, const char *restrict str)
+{
+	if (vec->count == vec->size) { /* buffer is full: double its capacity */
+		vec->size *= 2;
+		vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0]));
+	}
+	vec->buf[vec->count].string = xstrdup(str);
+	vec->buf[vec->count].search_score = 0;
+	vec->buf[vec->count].history_score = 0;
+	vec->count++;
+}
+
 void string_vec_sort(struct string_vec *restrict vec)
 {
 	qsort(vec->buf, vec->count, sizeof(vec->buf[0]), cmpstringp);
@@ -160,7 +176,11 @@ struct string_vec string_vec_filter(
 			search_score = fuzzy_match_simple_words(substr, vec->buf[i].string);
 		}
 		if (search_score != INT32_MIN) {
-			string_vec_add(&filt, vec->buf[i].string);
+			/*
+			 * Assume that the vector we're filtering is already
+			 * normalized.
+			 */
+			string_vec_add_normalized(&filt, vec->buf[i].string);
 			/*
 			 * Store the position of the match in the string as
 			 * its search_score, for later sorting.
@@ -191,7 +211,10 @@ struct string_vec string_vec_load(FILE *file)
 		if (line[bytes_read - 1] == '\n') {
 			line[bytes_read - 1] = '\0';
 		}
-		string_vec_add(&vec, line);
+		/*
+		 * Assume that the vector we're loading is already normalized.
+		 */
+		string_vec_add_normalized(&vec, line);
 	}
 	free(line);
 
diff --git a/src/unicode.c b/src/unicode.c
index 3833fb6..7ddc0d5 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -1,3 +1,4 @@
+#include <stdbool.h>
 #include <string.h>
 
 #include "unicode.h"
@@ -115,3 +116,8 @@ char *utf8_compose(const char *s)
 {
 	return g_utf8_normalize(s, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 }
+
+bool utf8_validate(const char *s)
+{
+	return g_utf8_validate(s, -1, NULL); /* GLib: -1 = scan up to the NUL terminator, NULL = end-of-valid-data pointer not wanted */
+}
diff --git a/src/unicode.h b/src/unicode.h
index e198231..d32303d 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -2,6 +2,7 @@
 #define UNICODE_H
 
 #include <glib.h>
+#include <stdbool.h>
 #include <stdint.h>
 
 uint8_t utf32_to_utf8(uint32_t c, char *buf);
@@ -24,5 +25,6 @@ size_t utf8_strlen(const char *s);
 char *utf8_strcasestr(const char * restrict haystack, const char * restrict needle);
 char *utf8_normalize(const char *s);
 char *utf8_compose(const char *s);
+bool utf8_validate(const char *s);
 
 #endif /* UNICODE_H */
author	Phil Jones <philj56@gmail.com>	2022-11-24 12:48:39 +0000
committer	Phil Jones <philj56@gmail.com>	2022-11-24 12:48:39 +0000
commit	03fc67cc74cc08fff5eb61e855b54174683614ff (patch)
tree	dc8376aa977a0cff31ebc0d4283cb77cce0e934e
parent	e19d821a7ec5d8b8d528be93626bbc2effd15320 (diff)