Skip to content

Commit 01a50e4

Browse files
encukou, sethmlarson, ch4n3-yoon, StanFromIreland, and picnixz
authored and committed
[3.10] gh-149079: Fix O(n^2) canonical ordering in unicodedata.normalize() (GH-149080)
Replace the insertion sort used for canonical ordering of combining characters with a hybrid approach: insertion sort for short runs (< 20) and counting sort for longer runs, reducing worst-case complexity from O(n^2) to O(n). This prevents denial of service via crafted Unicode strings with many combining characters in alternating CCC order. (cherry picked from commit 991224b) Co-authored-by: Seth Larson <seth@python.org> Co-authored-by: ch4n3-yoon <ch4n3.yoon@gmail.com> Co-authored-by: Seokchan Yoon <13852925+ch4n3-yoon@users.noreply.github.com> Co-authored-by: Stan Ulbrych <stan@python.org> Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Maurycy Pawłowski-Wieroński <maurycy@maurycy.com>
1 parent b286d98 commit 01a50e4

3 files changed

Lines changed: 150 additions & 26 deletions

File tree

Lib/test/test_unicodedata.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,34 @@ def test_issue10254(self):
202202
b = 'C\u0338' * 20 + '\xC7'
203203
self.assertEqual(self.db.normalize('NFC', a), b)
204204

205+
def test_long_combining_mark_run(self):
206+
# gh-149079: avoid quadratic canonical ordering.
207+
payload = "a" + ("\u0300\u0327" * 32)
208+
nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32)
209+
nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31)
210+
211+
self.assertEqual(self.db.normalize("NFD", payload), nfd)
212+
self.assertEqual(self.db.normalize("NFKD", payload), nfd)
213+
self.assertEqual(self.db.normalize("NFC", payload), nfc)
214+
self.assertEqual(self.db.normalize("NFKC", payload), nfc)
215+
216+
def test_combining_mark_run_fast_paths(self):
217+
# gh-149079: cover short runs and already-sorted long runs.
218+
short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300"
219+
short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10)
220+
short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9)
221+
long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30)
222+
long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29)
223+
224+
self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd)
225+
self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd)
226+
self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc)
227+
self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc)
228+
self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted)
229+
self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted)
230+
self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc)
231+
self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc)
232+
205233
def test_issue29456(self):
206234
# Fix #29456
207235
u1176_str_a = '\u1100\u1176\u11a8'
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix a potential denial of service in :func:`unicodedata.normalize`. The
2+
canonical ordering step of Unicode normalization used a quadratic-time insertion
3+
sort for reordering combining characters, which could be exploited with
4+
crafted input containing many combining characters in non-canonical order.
5+
Replaced with a linear-time counting sort for long runs.

Modules/unicodedata.c

Lines changed: 117 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,64 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
485485
(*index)++;
486486
}
487487

488+
/* Run-length cutoff for choosing the canonical-ordering sort: combining
   runs shorter than this many code points are sorted with insertion sort,
   runs of this length or longer with counting sort.
   Small combining runs are usually cheaper with insertion sort. */
489+
#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20
490+
491+
static void
492+
canonical_ordering_sort_insertion(int kind, void *data,
493+
Py_ssize_t start, Py_ssize_t end)
494+
{
495+
for (Py_ssize_t i = start + 1; i < end; i++) {
496+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
497+
unsigned char combining = _getrecord_ex(code)->combining;
498+
Py_ssize_t j = i;
499+
500+
while (j > start) {
501+
Py_UCS4 previous = PyUnicode_READ(kind, data, j - 1);
502+
if (_getrecord_ex(previous)->combining <= combining) {
503+
break;
504+
}
505+
PyUnicode_WRITE(kind, data, j, previous);
506+
j--;
507+
}
508+
if (j != i) {
509+
PyUnicode_WRITE(kind, data, j, code);
510+
}
511+
}
512+
}
513+
514+
static void
515+
canonical_ordering_sort_counting(int kind, void *data,
516+
Py_ssize_t start, Py_ssize_t end,
517+
Py_UCS4 *sortbuf)
518+
{
519+
Py_ssize_t counts[256] = {0};
520+
Py_ssize_t run_length = end - start;
521+
Py_ssize_t total = 0;
522+
523+
for (Py_ssize_t i = start; i < end; i++) {
524+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
525+
unsigned char combining = _getrecord_ex(code)->combining;
526+
counts[combining]++;
527+
}
528+
529+
for (size_t i = 0; i < Py_ARRAY_LENGTH(counts); i++) {
530+
Py_ssize_t count = counts[i];
531+
counts[i] = total;
532+
total += count;
533+
}
534+
535+
/* Reuse counts[] as the next output slot for each CCC. */
536+
for (Py_ssize_t i = start; i < end; i++) {
537+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
538+
unsigned char combining = _getrecord_ex(code)->combining;
539+
sortbuf[counts[combining]++] = code;
540+
}
541+
for (Py_ssize_t i = 0; i < run_length; i++) {
542+
PyUnicode_WRITE(kind, data, start + i, sortbuf[i]);
543+
}
544+
}
545+
488546
#define SBase 0xAC00
489547
#define LBase 0x1100
490548
#define VBase 0x1161
@@ -501,13 +559,16 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
501559
PyObject *result;
502560
Py_UCS4 *output;
503561
Py_ssize_t i, o, osize;
504-
int kind;
505-
const void *data;
562+
int input_kind, result_kind;
563+
const void *input_data;
564+
void *result_data;
506565
/* Longest decomposition in Unicode 3.2: U+FDFA */
507566
Py_UCS4 stack[20];
508567
Py_ssize_t space, isize;
509568
int index, prefix, count, stackptr;
510569
unsigned char prev, cur;
570+
Py_UCS4 *sortbuf = NULL;
571+
Py_ssize_t sortbuflen = 0;
511572

512573
stackptr = 0;
513574
isize = PyUnicode_GET_LENGTH(input);
@@ -527,11 +588,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
527588
return NULL;
528589
}
529590
i = o = 0;
530-
kind = PyUnicode_KIND(input);
531-
data = PyUnicode_DATA(input);
591+
input_kind = PyUnicode_KIND(input);
592+
input_data = PyUnicode_DATA(input);
532593

533594
while (i < isize) {
534-
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
595+
stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++);
535596
while(stackptr) {
536597
Py_UCS4 code = stack[--stackptr];
537598
/* Hangul Decomposition adds three characters in
@@ -597,34 +658,64 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
597658
if (!result)
598659
return NULL;
599660
/* result is guaranteed to be ready, as it is compact. */
600-
kind = PyUnicode_KIND(result);
601-
data = PyUnicode_DATA(result);
661+
result_kind = PyUnicode_KIND(result);
662+
result_data = PyUnicode_DATA(result);
602663

603-
/* Sort canonically. */
664+
/* Sort each consecutive combining-character run canonically. */
604665
i = 0;
605-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
606-
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
607-
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
608-
if (prev == 0 || cur == 0 || prev <= cur) {
609-
prev = cur;
666+
while (i < o) {
667+
Py_ssize_t run_length, run_start;
668+
int needs_sort = 0;
669+
670+
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
671+
prev = _getrecord_ex(ch)->combining;
672+
if (prev == 0) {
673+
i++;
610674
continue;
611675
}
612-
/* Non-canonical order. Need to switch *i with previous. */
613-
o = i - 1;
614-
while (1) {
615-
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
616-
PyUnicode_WRITE(kind, data, o+1,
617-
PyUnicode_READ(kind, data, o));
618-
PyUnicode_WRITE(kind, data, o, tmp);
619-
o--;
620-
if (o < 0)
621-
break;
622-
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
623-
if (prev == 0 || prev <= cur)
676+
677+
run_start = i++;
678+
while (i < o) {
679+
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
680+
cur = _getrecord_ex(ch)->combining;
681+
if (cur == 0) {
624682
break;
683+
}
684+
if (prev > cur) {
685+
needs_sort = 1;
686+
}
687+
prev = cur;
688+
i++;
689+
}
690+
if (!needs_sort) {
691+
continue;
692+
}
693+
694+
run_length = i - run_start;
695+
if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) {
696+
canonical_ordering_sort_insertion(result_kind, result_data,
697+
run_start, i);
698+
continue;
625699
}
626-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
700+
701+
if (run_length > sortbuflen) {
702+
Py_UCS4 *new_sortbuf = PyMem_Resize(sortbuf,
703+
Py_UCS4,
704+
run_length);
705+
if (new_sortbuf == NULL) {
706+
PyErr_NoMemory();
707+
PyMem_Free(sortbuf);
708+
Py_DECREF(result);
709+
return NULL;
710+
}
711+
sortbuf = new_sortbuf;
712+
sortbuflen = run_length;
713+
}
714+
715+
canonical_ordering_sort_counting(result_kind, result_data,
716+
run_start, i, sortbuf);
627717
}
718+
PyMem_Free(sortbuf);
628719
return result;
629720
}
630721

0 commit comments

Comments
 (0)