Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
d5cfd08
utf8-utils.cpp: constexpr std::array for constants
Nov 19, 2025
03786a3
implemented core of f$prepare_search_query
Nov 21, 2025
8a3942a
clean up a bit
Nov 21, 2025
b3dd2f1
RuntimeContext::get().static_SB used for prepare_search_query
Nov 24, 2025
232a5e8
fixes
Nov 24, 2025
5c60f22
std::addressof
Nov 24, 2025
7f85777
removed 2025
Nov 24, 2025
7204dfa
f$prepare_search_query: split string_buf into 4 spans
Nov 25, 2025
9b9070b
small clean up
Nov 25, 2025
ac810b1
removed alignment of span sizes
Nov 25, 2025
2433a71
removed attributes "@kphp-extern-func-info stub generation-required" …
Nov 25, 2025
82d5b64
added test_prepare_search_query.py and one test case
Nov 26, 2025
270ff25
#include "auto/common/unicode-utils-auto.h" moved to .cpp
Nov 26, 2025
fd49228
added test_prepare_search_query.py test
Nov 27, 2025
877b96c
json data replaced with raw binary data
Nov 27, 2025
a2ef9cf
added utf-8 examples for prepare_search_query test
Nov 27, 2025
7122494
added newline to test files
Nov 27, 2025
37aefd3
relative path for example files
Nov 28, 2025
e9b49ef
removed args from component-config.yaml
Nov 28, 2025
2296f42
common prepare_search_query_impl
Mar 12, 2026
892dca1
fixed
Mar 12, 2026
68c095f
fmt
Mar 12, 2026
04d8d5e
brace init
Mar 12, 2026
2eb7524
std::function removed with function pointer
Mar 12, 2026
d221bd8
fmt
Mar 12, 2026
463cda2
removed unused asserts
Mar 13, 2026
7a10bbb
assertf passed to unicode_toupper() and unicode_tolower()
Mar 13, 2026
61bf6a0
fmt
Mar 13, 2026
001012b
minor fix
apolyakov Mar 16, 2026
9fc61b8
add more test cases
apolyakov Mar 16, 2026
52302be
check newline for c++ files only
apolyakov Mar 16, 2026
df55a37
revert change for check new lines and add new lines to tests
apolyakov Mar 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions builtin-functions/kphp-light/stdlib/server-functions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ function setlocale ($category ::: int, $locale ::: string) ::: string | false;

function memory_get_detailed_stats() ::: int[];

function prepare_search_query ($query ::: string) ::: string;

function memory_get_total_usage() ::: int;

function inet_pton ($address ::: string) ::: string | false;
Expand Down Expand Up @@ -131,7 +133,3 @@ function flush() ::: void;
define('PHP_QUERY_RFC1738', 1);
define('PHP_QUERY_RFC3986', 2);


/** @kphp-extern-func-info stub generation-required */
function prepare_search_query ($query ::: string) ::: string;

201 changes: 105 additions & 96 deletions common/unicode/unicode-utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

#include "common/unicode/unicode-utils.h"

#include <algorithm>
#include <assert.h>
#include <cstddef>
#include <iterator>
#include <stdlib.h>
#include <string.h>

Expand All @@ -13,7 +16,7 @@
#include "common/unicode/utf8-utils.h"

/* Search generated ranges for specified character */
static int binary_search_ranges(const int* ranges, int r, int code) {
static int binary_search_ranges(const int* ranges, int r, int code, void (*assertf)(bool)) {
if ((unsigned int)code > 0x10ffff) {
return 0;
}
Expand Down Expand Up @@ -43,152 +46,158 @@ static int binary_search_ranges(const int* ranges, int r, int code) {
case 2:
return ((code - 1) | 1);
default:
assert(0);
exit(1);
if (assertf != nullptr) {
assertf(false);
}
}
return 0;
}

/* Convert character to upper case.
Code points below TABLE_SIZE are looked up directly in to_upper_table;
every other value is resolved by binary_search_ranges() over to_upper_table_ranges.
The unsigned comparison sends negative codes to the range search as well.
assertf is only forwarded to binary_search_ranges(), which invokes it (when it is
not null) from its default switch branch. */
int unicode_toupper(int code) {
int unicode_toupper(int code, void (*assertf)(bool)) {
if ((unsigned int)code < (unsigned int)TABLE_SIZE) {
return to_upper_table[code];
} else {
return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code);
return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code, assertf);
}
}

/* Convert character to lower case.
Code points below TABLE_SIZE are looked up directly in to_lower_table;
every other value is resolved by binary_search_ranges() over to_lower_table_ranges.
The unsigned comparison sends negative codes to the range search as well.
assertf is only forwarded to binary_search_ranges(), which invokes it (when it is
not null) from its default switch branch. */
int unicode_tolower(int code) {
int unicode_tolower(int code, void (*assertf)(bool)) {
if ((unsigned int)code < (unsigned int)TABLE_SIZE) {
return to_lower_table[code];
} else {
return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code);
return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code, assertf);
}
}

inline constexpr int32_t WHITESPACE_CODE_POINT{static_cast<int32_t>(' ')};
inline constexpr int32_t PLUS_CODE_POINT{static_cast<int32_t>('+')};

/* Prepares unicode 0-terminated string input for search,
leaving only digits and letters with diacritics.
Every code point is mapped through prepare_table (or, outside the table,
through binary_search_ranges() over prepare_table_ranges); code points that
map to 0 are dropped. Leading, trailing and repeated whitespaces are removed.
The string is rewritten in place, so its length can decrease.
assertf is only forwarded to binary_search_ranges().
Returns length of result (the terminating 0 is written but not counted). */
int prepare_search_string(int* input) {
int i;
int* output = input;
for (i = 0; input[i]; i++) {
int c = input[i], new_c;
if ((unsigned int)c < (unsigned int)TABLE_SIZE) {
new_c = prepare_table[c];
size_t prepare_search_string(int32_t* code_points, void (*assertf)(bool)) noexcept {
// number of code points kept so far; the output is written over the start of code_points
size_t output_size{};
for (size_t i{}; code_points[i] != 0; ++i) {
int32_t c{code_points[i]};
int32_t new_c{};
// the unsigned cast makes negative values fail this test and take the range-search branch
if (static_cast<size_t>(c) < static_cast<size_t>(TABLE_SIZE)) {
new_c = static_cast<int32_t>(prepare_table[c]);
} else {
new_c = binary_search_ranges(prepare_table_ranges, prepare_table_ranges_size, c);
new_c = binary_search_ranges(prepare_table_ranges, prepare_table_ranges_size, c, assertf);
}
if (new_c) {
if (new_c != 0x20 || (output > input && output[-1] != 0x20)) {
*output++ = new_c;
// a mapping of 0 means the code point is removed from the string
if (new_c != 0) {
// we forbid 2 whitespaces after each other and starting whitespace
if (new_c != WHITESPACE_CODE_POINT || (output_size > 0 && code_points[output_size - 1] != WHITESPACE_CODE_POINT)) {
code_points[output_size++] = new_c;
}
}
}
if (output > input && output[-1] == 0x20) {
output--;
if (output_size > 0 && code_points[output_size - 1] == WHITESPACE_CODE_POINT) {
// throw out terminating whitespace
--output_size;
}
*output = 0;
return output - input;
}

#define MAX_NAME_SIZE 65536
static char prep_buf[4 * MAX_NAME_SIZE + 4];
int prep_ibuf[MAX_NAME_SIZE + 4];
static int prep_ibuf_res[MAX_NAME_SIZE + 4];
static int* words_ibuf[MAX_NAME_SIZE + 4];

/* qsort()-style comparator over pointers to words: compares two words code point
by code point, treating ' ' as the end of a word. */
int stricmp_void(const void* x, const void* y) {
const int* s1 = *(const int**)x;
const int* s2 = *(const int**)y;
while (*s1 == *s2 && *s1 != ' ')
s1++, s2++;
return *s1 - *s2;
// re-terminate the shortened string
code_points[output_size] = 0;
return output_size;
}

int* prepare_str_unicode(const int* x) {
int* v = prep_ibuf;

int n;
if (v != x) {
for (n = 0; x[n]; n++) {
v[n] = x[n];
/* Normalizes code_points with prepare_search_string(), splits the result into
whitespace-separated words, sorts the words, drops duplicates and writes them to
prepared_code_points, each word followed by '+', with a terminating 0 at the end.
code_points is modified in place: its terminating 0 is replaced by a whitespace so
that every word, including the last one, ends with a whitespace.
word_start_indices is scratch space that receives the start index of every word.
assertf is forwarded to prepare_search_string() and is called with the result of the
final size check; it is called unconditionally, so it must not be null here.
Returns the number of code points written to prepared_code_points, including the
terminating 0. */
inline size_t prepare_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, void (*assertf)(bool)) noexcept {
size_t code_points_length = prepare_search_string(code_points, assertf);
// sentinel: lets the loops below stop on whitespace without checking the length
code_points[code_points_length] = WHITESPACE_CODE_POINT;

size_t words_count{};
size_t i{};
// looking for the beginnings of the words
while (i < code_points_length) {
word_start_indices[words_count++] = i;
while (i < code_points_length && code_points[i] != WHITESPACE_CODE_POINT) {
++i;
}
v[n] = 0;
// step over the whitespace that ended the word
++i;
}

n = prepare_search_string(v);
v[n] = ' ';

int i = 0, k = 0;
while (i < n) {
words_ibuf[k++] = v + i;
while (v[i] && v[i] != ' ') {
i++;
// "less than" over word start indices: code-point-wise comparison where a word
// that is a proper prefix of another one sorts first and equal words are not less
auto word_less_cmp{[&code_points](size_t x, size_t y) noexcept -> bool {
while (code_points[x] != WHITESPACE_CODE_POINT && code_points[x] == code_points[y]) {
++x;
++y;
}
i++;
}
if (code_points[x] == WHITESPACE_CODE_POINT) {
return code_points[y] != WHITESPACE_CODE_POINT;
}
if (code_points[y] == WHITESPACE_CODE_POINT) {
return false;
}
return code_points[x] < code_points[y];
}};

qsort(words_ibuf, (size_t)k, sizeof(int*), stricmp_void);
std::sort(word_start_indices, std::next(word_start_indices, words_count), word_less_cmp);

int j = 0;
for (i = 0; i < k; i++) {
if (j == 0 || stricmp_void(&words_ibuf[j - 1], &words_ibuf[i])) {
words_ibuf[j++] = words_ibuf[i];
size_t uniq_words_count{};
for (i = 0; i < words_count; ++i) {
// drop duplicates
// (after sorting, "previous is less than current" means the two words differ;
// for equal words the later start index replaces the stored one)
if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) {
word_start_indices[uniq_words_count++] = word_start_indices[i];
} else {
words_ibuf[j - 1] = words_ibuf[i];
word_start_indices[uniq_words_count - 1] = word_start_indices[i];
}
}
k = j;

int* res = prep_ibuf_res;
for (i = 0; i < k; i++) {
int* tmp = words_ibuf[i];
while (*tmp != ' ') {
*res++ = *tmp++;
size_t result_size{};
// output words with '+' separator
for (i = 0; i < uniq_words_count; ++i) {
size_t ind{word_start_indices[i]};
while (code_points[ind] != WHITESPACE_CODE_POINT) {
prepared_code_points[result_size++] = code_points[ind++];
}
*res++ = '+';
prepared_code_points[result_size++] = PLUS_CODE_POINT;
}
*res++ = 0;
prepared_code_points[result_size++] = 0;

assert(res - prep_ibuf_res < MAX_NAME_SIZE);
return prep_ibuf_res;
// NOTE(review): this size check runs after prepared_code_points has already been written
assertf(result_size < MAX_NAME_SIZE);
return result_size;
}

const char* clean_str_unicode(const int* xx) {
assert(xx != NULL);

int* v = prepare_str_unicode(xx);
int l = put_string_utf8(v, prep_buf);
assert(l < sizeof(prep_buf));

char *s = prep_buf, *x = prep_buf;
int skip;

while (*x != 0) {
skip = !strncmp(x, "amp+", 4) || !strncmp(x, "gt+", 3) || !strncmp(x, "lt+", 3) || !strncmp(x, "quot+", 5) || !strncmp(x, "ft+", 3) ||
!strncmp(x, "feat+", 5) ||
(((x[0] == '1' && x[1] == '9') || (x[0] == '2' && x[1] == '0')) && ('0' <= x[2] && x[2] <= '9') && ('0' <= x[3] && x[3] <= '9') && x[4] == '+') ||
!strncmp(x, "092+", 4) || !strncmp(x, "33+", 3) || !strncmp(x, "34+", 3) || !strncmp(x, "36+", 3) || !strncmp(x, "39+", 3) ||
!strncmp(x, "60+", 3) || !strncmp(x, "62+", 3) || !strncmp(x, "8232+", 5) || !strncmp(x, "8233+", 5);
/* Builds the '+'-terminated word list with prepare_str_unicode(), encodes it into
utf8_result with put_string_utf8() and then removes every word matched by the skip
condition below, compacting utf8_result in place and terminating it with 0.
code_points, word_start_indices and prepared_code_points are working buffers that
are handed to prepare_str_unicode().
assertf is called unconditionally, so it must not be null here.
Returns the number of bytes kept in utf8_result, not counting the terminating 0. */
inline size_t clean_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result,
void (*assertf)(bool)) noexcept {
prepare_str_unicode(code_points, word_start_indices, prepared_code_points, assertf);

// the value returned by put_string_utf8() is used as the number of encoded bytes
auto length{static_cast<size_t>(put_string_utf8(prepared_code_points, reinterpret_cast<char*>(utf8_result)))};
// NOTE(review): this size check runs after utf8_result has already been written
assertf(length < MAX_NAME_BYTES_SIZE);

// i is the read position, result_size the write position; both walk utf8_result
size_t i{};
size_t result_size{};
while (i < length) {
// c points at the first byte of the current word
char* c{reinterpret_cast<char*>(std::addressof(utf8_result[i]))};
// a word is skipped when it is exactly one of the listed words or a four-digit
// number starting with 19 or 20 (the '+' in each pattern is the word terminator)
// NOTE(review): the list looks like leftovers of HTML entities (amp, gt, lt, quot
// and numeric character codes) plus "ft"/"feat" and year-like numbers — confirm
bool skip{!strncmp(c, "amp+", 4) || !strncmp(c, "gt+", 3) || !strncmp(c, "lt+", 3) || !strncmp(c, "quot+", 5) || !strncmp(c, "ft+", 3) ||
!strncmp(c, "feat+", 5) ||
(((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') ||
!strncmp(c, "092+", 4) || !strncmp(c, "33+", 3) || !strncmp(c, "34+", 3) || !strncmp(c, "36+", 3) || !strncmp(c, "39+", 3) ||
!strncmp(c, "60+", 3) || !strncmp(c, "62+", 3) || !strncmp(c, "8232+", 5) || !strncmp(c, "8233+", 5)};
// consume the word up to and including its '+', copying it only when it is kept
do {
*s = *x;
if (!skip) {
s++;
utf8_result[result_size] = utf8_result[i];
++result_size;
}
} while (*x++ != '+');
} while (utf8_result[i++] != static_cast<std::byte>('+'));
}
*s = 0;
utf8_result[result_size] = static_cast<std::byte>(0);

return prep_buf;
return result_size;
}

const char* clean_str(const char* x) {
if (x == NULL || strlen(x) >= MAX_NAME_SIZE) {
return x;
size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result,
void (*assertf)(bool)) {
size_t x_len{strlen(x)};
if (assertf == nullptr || x == NULL || x_len >= MAX_NAME_SIZE) {
for (size_t i = 0; i < x_len; ++i) {
utf8_result[i] = static_cast<std::byte>(x[i]);
}
utf8_result[x_len] = static_cast<std::byte>(0);
return x_len;
}

html_string_to_utf8(x, prep_ibuf);
return clean_str_unicode(prep_ibuf);
html_string_to_utf8(x, code_points);
return clean_str_unicode(code_points, word_start_indices, prepared_code_points, utf8_result, assertf);
}
13 changes: 10 additions & 3 deletions common/unicode/unicode-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@

#pragma once

int unicode_toupper(int code);
int unicode_tolower(int code);
const char* clean_str(const char* x);
#include <cstddef>
#include <cstdint>

inline constexpr size_t MAX_NAME_SIZE = 65536;
inline constexpr size_t MAX_NAME_BYTES_SIZE = 4 * MAX_NAME_SIZE + 4;
inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4;

int unicode_toupper(int code, void (*assertf)(bool));
int unicode_tolower(int code, void (*assertf)(bool));
size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, void (*assertf)(bool));
Loading
Loading