unicode-rs · Manishearth · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026
diff --git a/scripts/unicode.py b/scripts/unicode.py
diff --git a/src/gen/lookup.rs b/src/gen/lookup.rs
@@ -0,0 +1,119 @@
+// Copyright 2012-2025 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
+use crate::tables::*;
+use crate::width_info::WidthInfo;
+
+/// Looks up the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
+/// walking the three-level `WIDTH_ROOT` / `WIDTH_MIDDLE` / `WIDTH_LEAVES` tables.
+///
+/// The result is the width paired with a [`WidthInfo`] tag. Encoded table values `0`, `1`
+/// and `2` are returned unchanged with [`WidthInfo::DEFAULT`]; the encoded value `3` marks
+/// a code point that is resolved by the special-case `match` at the end of the function.
+///
+/// # Maintenance
+/// The tables themselves are autogenerated but this function is hardcoded. You should have
+/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
+/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
+/// `make_tables` function in `unicode.py`) you must ensure that this code reflects those changes.
+#[inline]
+pub(crate) fn lookup_width(c: char) -> (u8, WidthInfo) {
+    let code_point = c as usize;
+
+    // Level 1: everything above the low 13 bits selects a sub-table of WIDTH_MIDDLE.
+    let middle_table = usize::from(WIDTH_ROOT.0[code_point >> 13]);
+
+    // Level 2: bits 7..=12 of the code point (a 6-bit index, hence the 0x3F mask)
+    // select the entry naming a sub-table of WIDTH_LEAVES.
+    let middle_slot = (code_point >> 7) & 0x3F;
+    let leaf_table = usize::from(WIDTH_MIDDLE.0[middle_table][middle_slot]);
+
+    // Level 3: the low 7 bits address a 2-bit encoded width. Four of them are packed
+    // per byte, so bits 2..=6 (mask 0x1F) pick the byte and bits 0..=1 pick the
+    // position of the 2-bit field inside that byte.
+    let leaf_slot = (code_point >> 2) & 0x1F;
+    let packed = WIDTH_LEAVES.0[leaf_table][leaf_slot];
+    let shift = 2 * (code_point & 0b11);
+    let width = (packed >> shift) & 0b11;
+
+    // `width` is a 2-bit value; anything other than 3 is already the final answer.
+    if width != 3 {
+        return (width, WidthInfo::DEFAULT);
+    }
+
+    // Encoded value 3: the width and the extra info depend on the code point itself.
+    match c {
+        '\u{A}' => (1, WidthInfo::LINE_FEED),
+        '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
+        '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
+        '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
+        '\u{17D8}' => (3, WidthInfo::DEFAULT),
+        '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
+        '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
+        '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
+        '\u{FE01}' => (0, WidthInfo::VARIATION_SELECTOR_1_2_OR_3),
+        '\u{FE0E}' => (0, WidthInfo::VARIATION_SELECTOR_15),
+        '\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16),
+        '\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I),
+        '\u{16D67}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E),
+        '\u{16D68}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI),
+        '\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR),
+        '\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER),
+        _ => (2, WidthInfo::EMOJI_PRESENTATION),
+    }
+}
+/// CJK variant of the [UAX #11](https://www.unicode.org/reports/tr11/) based width lookup
+/// for `c`; only compiled when the `cjk` feature is enabled.
+///
+/// The walk starts from `WIDTH_ROOT_CJK` and then goes through the `WIDTH_MIDDLE` and
+/// `WIDTH_LEAVES` tables. Encoded table values `0`, `1` and `2` are returned unchanged with
+/// [`WidthInfo::DEFAULT`]; the encoded value `3` is resolved by the special-case `match`
+/// at the end of the function, which here also covers `'\u{338}'` and the whole
+/// `'\u{FE00}'..='\u{FE02}'` range, and has no dedicated arm for `'\u{FE0E}'`.
+///
+/// # Maintenance
+/// The tables themselves are autogenerated but this function is hardcoded. You should have
+/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
+/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
+/// `make_tables` function in `unicode.py`) you must ensure that this code reflects those changes.
+#[cfg(feature = "cjk")]
+#[inline]
+pub(crate) fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
+    let code_point = c as usize;
+
+    // Level 1 (CJK root): everything above the low 13 bits selects a sub-table
+    // of WIDTH_MIDDLE.
+    let middle_table = usize::from(WIDTH_ROOT_CJK.0[code_point >> 13]);
+
+    // Level 2: bits 7..=12 of the code point (a 6-bit index, hence the 0x3F mask)
+    // select the entry naming a sub-table of WIDTH_LEAVES.
+    let middle_slot = (code_point >> 7) & 0x3F;
+    let leaf_table = usize::from(WIDTH_MIDDLE.0[middle_table][middle_slot]);
+
+    // Level 3: the low 7 bits address a 2-bit encoded width. Four of them are packed
+    // per byte, so bits 2..=6 (mask 0x1F) pick the byte and bits 0..=1 pick the
+    // position of the 2-bit field inside that byte.
+    let leaf_slot = (code_point >> 2) & 0x1F;
+    let packed = WIDTH_LEAVES.0[leaf_table][leaf_slot];
+    let shift = 2 * (code_point & 0b11);
+    let width = (packed >> shift) & 0b11;
+
+    // `width` is a 2-bit value; anything other than 3 is already the final answer.
+    if width != 3 {
+        return (width, WidthInfo::DEFAULT);
+    }
+
+    // Encoded value 3: the width and the extra info depend on the code point itself.
+    match c {
+        '\u{A}' => (1, WidthInfo::LINE_FEED),
+        '\u{338}' => (0, WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY),
+        '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
+        '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
+        '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
+        '\u{17D8}' => (3, WidthInfo::DEFAULT),
+        '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
+        '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
+        '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
+        '\u{FE00}'..='\u{FE02}' => (0, WidthInfo::VARIATION_SELECTOR_1_2_OR_3),
+        '\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16),
+        '\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I),
+        '\u{16D67}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E),
+        '\u{16D68}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI),
+        '\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR),
+        '\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER),
+        _ => (2, WidthInfo::EMOJI_PRESENTATION),
+    }
+}
diff --git a/src/gen/props.rs b/src/gen/props.rs
@@ -0,0 +1,124 @@
+// Copyright 2012-2025 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
+use crate::tables::*;
+use core::cmp::Ordering;
+
+/// Returns `true` if `c` has `Joining_Group=Lam`.
+///
+/// The set is hardcoded: U+0644, U+06B5..=U+06B8, U+076A, U+08A6 and U+08C7.
+pub fn is_joining_group_lam(c: char) -> bool {
+    match c {
+        '\u{644}' => true,
+        '\u{6B5}'..='\u{6B8}' => true,
+        '\u{76A}' | '\u{8A6}' | '\u{8C7}' => true,
+        _ => false,
+    }
+}
+
+/// Returns `true` if `c` is a default-ignorable combining mark or ZWJ (U+200D).
+/// These characters won't interrupt non-Arabic ligatures.
+pub fn is_ligature_transparent(c: char) -> bool {
+    match c {
+        '\u{34F}' => true,
+        '\u{17B4}'..='\u{17B5}' => true,
+        '\u{180B}'..='\u{180D}' | '\u{180F}' => true,
+        '\u{200D}' => true,
+        '\u{FE00}'..='\u{FE0F}' => true,
+        '\u{E0100}'..='\u{E01EF}' => true,
+        _ => false,
+    }
+}
+
+/// Returns `true` if `c` forms an [emoji presentation sequence] when followed
+/// by `'\u{FE0F}'`.
+/// Emoji presentation sequences are considered to have width 2.
+///
+/// [emoji presentation sequence]: https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence
+#[inline]
+pub fn starts_emoji_presentation_seq(c: char) -> bool {
+    let code_point = u32::from(c);
+
+    // First level: everything above the 10 least significant bits selects a leaf.
+    // Each leaf therefore covers a 0x400-code-point block; blocks without a leaf
+    // contain no matching characters.
+    let leaf_index: usize = match code_point >> 10 {
+        0x0 => 0,  // U+0000..=U+03FF
+        0x8 => 1,  // U+2000..=U+23FF
+        0x9 => 2,  // U+2400..=U+27FF
+        0xA => 3,  // U+2800..=U+2BFF
+        0xC => 4,  // U+3000..=U+33FF
+        0x7C => 5, // U+1F000..=U+1F3FF
+        0x7D => 6, // U+1F400..=U+1F7FF
+        _ => return false,
+    };
+
+    // Second level: bits 3..=9 of the code point select an entry within the leaf.
+    // The value is masked to 7 bits first, so the cast to `usize` is lossless.
+    let byte_index = ((code_point >> 3) & 0x7F) as usize;
+    let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[leaf_index][byte_index];
+
+    // Third level: the 3 least significant bits select one bit of that entry.
+    let flag = (leaf_byte >> (code_point & 7)) & 1;
+    flag != 0
+}
+
+/// Returns `true` if `c` has default emoji presentation, but forms a
+/// [text presentation sequence] when followed by `'\u{FE0E}'`, and is not ideographic.
+/// Such sequences are considered to have width 1.
+///
+/// [text presentation sequence]: https://www.unicode.org/reports/tr51/#def_text_presentation_sequence
+#[inline]
+pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {
+    let code_point = u32::from(c);
+
+    // First level: everything above the low byte selects a leaf. Each leaf covers a
+    // 0x100-code-point block; blocks without a leaf contain no matching characters.
+    let ranges: &[(u8, u8)] = match code_point >> 8 {
+        0x23 => &TEXT_PRESENTATION_LEAF_0,
+        0x25 => &TEXT_PRESENTATION_LEAF_1,
+        0x26 => &TEXT_PRESENTATION_LEAF_2,
+        0x27 => &TEXT_PRESENTATION_LEAF_3,
+        0x2B => &TEXT_PRESENTATION_LEAF_4,
+        0x1F0 => &TEXT_PRESENTATION_LEAF_5,
+        0x1F3 => &TEXT_PRESENTATION_LEAF_6,
+        0x1F4 => &TEXT_PRESENTATION_LEAF_7,
+        0x1F5 => &TEXT_PRESENTATION_LEAF_8,
+        0x1F6 => &TEXT_PRESENTATION_LEAF_9,
+        _ => return false,
+    };
+
+    // Second level: each leaf entry is an inclusive `(first, last)` range over the low
+    // byte of the code point; `c` matches when its low byte falls inside one of them.
+    // NOTE(review): the binary search presumes the ranges in each leaf are sorted and
+    // disjoint; the leaves are defined elsewhere, so confirm against the generator.
+    let low_byte = (code_point & 0xFF) as u8;
+    ranges
+        .binary_search_by(|&(first, last)| match (low_byte < first, low_byte > last) {
+            // The range lies above the probe: continue in the lower half.
+            (true, _) => Ordering::Greater,
+            // The range lies below the probe: continue in the upper half.
+            (false, true) => Ordering::Less,
+            (false, false) => Ordering::Equal,
+        })
+        .is_ok()
+}
+
+/// Returns `true` if `c` is an `Emoji_Modifier_Base`.
+///
+/// The lookup has two levels: the bits above the low byte of the code point select a
+/// leaf, and the low byte is then searched for in that leaf's list of ranges.
+#[inline]
+pub fn is_emoji_modifier_base(c: char) -> bool {
+    let code_point = u32::from(c);
+
+    // First level: each leaf covers a 0x100-code-point block; blocks without a leaf
+    // contain no matching characters.
+    let ranges: &[(u8, u8)] = match code_point >> 8 {
+        0x26 => &EMOJI_MODIFIER_LEAF_0,
+        0x27 => &EMOJI_MODIFIER_LEAF_1,
+        0x1F3 => &EMOJI_MODIFIER_LEAF_2,
+        0x1F4 => &EMOJI_MODIFIER_LEAF_3,
+        0x1F5 => &EMOJI_MODIFIER_LEAF_4,
+        0x1F6 => &EMOJI_MODIFIER_LEAF_5,
+        0x1F9 => &EMOJI_MODIFIER_LEAF_6,
+        0x1FA => &EMOJI_MODIFIER_LEAF_7,
+        _ => return false,
+    };
+
+    // Second level: each leaf entry is an inclusive `(first, last)` range over the low
+    // byte of the code point; `c` matches when its low byte falls inside one of them.
+    // NOTE(review): the binary search presumes the ranges in each leaf are sorted and
+    // disjoint; the leaves are defined elsewhere, so confirm against the generator.
+    let low_byte = (code_point & 0xFF) as u8;
+    ranges
+        .binary_search_by(|&(first, last)| match (low_byte < first, low_byte > last) {
+            // The range lies above the probe: continue in the lower half.
+            (true, _) => Ordering::Greater,
+            // The range lies below the probe: continue in the upper half.
+            (false, true) => Ordering::Less,
+            (false, false) => Ordering::Equal,
+        })
+        .is_ok()
+}