Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,126 changes: 361 additions & 765 deletions scripts/unicode.py

Large diffs are not rendered by default.

119 changes: 119 additions & 0 deletions src/gen/lookup.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
// Copyright 2012-2025 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
use crate::tables::*;
use crate::width_info::WidthInfo;

/// Looks up the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`,
/// along with its [`WidthInfo`], by walking a three-level lookup table
/// (`WIDTH_ROOT` -> `WIDTH_MIDDLE` -> `WIDTH_LEAVES`).
///
/// The leaves store a 2-bit value per code point. Values `0..=2` are returned directly
/// with `WidthInfo::DEFAULT`; the value `3` marks a character that needs special
/// handling and is resolved by the `match` at the end of the function.
///
/// # Maintenance
/// The tables are autogenerated, whereas the body of this function is a fixed template.
/// Re-running `unicode.py` (for instance to pick up a new Unicode version) needs no
/// changes here. If the *structure* of the tables changes (for example by editing
/// `make_tables` in `unicode.py`), the shifts and masks below must be updated to match.
#[inline]
pub(crate) fn lookup_width(c: char) -> (u8, WidthInfo) {
    let code = c as usize;

    // Level 1: bits 13 and above of the code point select a `WIDTH_MIDDLE` sub-table.
    let middle_table = usize::from(WIDTH_ROOT.0[code >> 13]);

    // Level 2: bits 7..=12 (six bits, mask 0x3F) index into that sub-table; the byte
    // found there selects a `WIDTH_LEAVES` sub-table.
    let leaf_table = usize::from(WIDTH_MIDDLE.0[middle_table][(code >> 7) & 0x3F]);

    // Level 3: bits 2..=6 (five bits, mask 0x1F) index a byte inside the leaf.
    // Each leaf byte packs four 2-bit entries, one per value of the two lowest bits.
    let packed = WIDTH_LEAVES.0[leaf_table][(code >> 2) & 0x1F];

    // Shift the entry belonging to this code point down to the low two bits.
    let shift = 2 * (code & 0b11);
    let encoded = (packed >> shift) & 0b11;

    // Common case: the stored value is the width itself.
    if encoded < 3 {
        return (encoded, WidthInfo::DEFAULT);
    }

    // Stored value 3: the character is special-cased below.
    match c {
        '\u{A}' => (1, WidthInfo::LINE_FEED),
        '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
        '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
        '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
        '\u{17D8}' => (3, WidthInfo::DEFAULT),
        '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
        '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
        '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
        '\u{FE01}' => (0, WidthInfo::VARIATION_SELECTOR_1_2_OR_3),
        '\u{FE0E}' => (0, WidthInfo::VARIATION_SELECTOR_15),
        '\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16),
        '\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I),
        '\u{16D67}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E),
        '\u{16D68}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI),
        '\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR),
        '\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER),
        // Every remaining special-cased character is reported as width 2.
        _ => (2, WidthInfo::EMOJI_PRESENTATION),
    }
}
/// Looks up the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`,
/// along with its [`WidthInfo`], using the CJK root table.
///
/// The walk starts at `WIDTH_ROOT_CJK` and then continues through the same
/// `WIDTH_MIDDLE` and `WIDTH_LEAVES` tables that `lookup_width` uses. This function is
/// only compiled when the `cjk` feature is enabled.
///
/// The leaves store a 2-bit value per code point. Values `0..=2` are returned directly
/// with `WidthInfo::DEFAULT`; the value `3` marks a character that needs special
/// handling and is resolved by the `match` at the end of the function.
///
/// # Maintenance
/// The tables are autogenerated, whereas the body of this function is a fixed template.
/// Re-running `unicode.py` (for instance to pick up a new Unicode version) needs no
/// changes here. If the *structure* of the tables changes (for example by editing
/// `make_tables` in `unicode.py`), the shifts and masks below must be updated to match.
#[cfg(feature = "cjk")]
#[inline]
pub(crate) fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
    let code = c as usize;

    // Level 1: bits 13 and above of the code point select a `WIDTH_MIDDLE` sub-table,
    // this time via the CJK-specific root.
    let middle_table = usize::from(WIDTH_ROOT_CJK.0[code >> 13]);

    // Level 2: bits 7..=12 (six bits, mask 0x3F) index into that sub-table; the byte
    // found there selects a `WIDTH_LEAVES` sub-table.
    let leaf_table = usize::from(WIDTH_MIDDLE.0[middle_table][(code >> 7) & 0x3F]);

    // Level 3: bits 2..=6 (five bits, mask 0x1F) index a byte inside the leaf.
    // Each leaf byte packs four 2-bit entries, one per value of the two lowest bits.
    let packed = WIDTH_LEAVES.0[leaf_table][(code >> 2) & 0x1F];

    // Shift the entry belonging to this code point down to the low two bits.
    let shift = 2 * (code & 0b11);
    let encoded = (packed >> shift) & 0b11;

    // Common case: the stored value is the width itself.
    if encoded < 3 {
        return (encoded, WidthInfo::DEFAULT);
    }

    // Stored value 3: the character is special-cased below.
    match c {
        '\u{A}' => (1, WidthInfo::LINE_FEED),
        '\u{338}' => (0, WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY),
        '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
        '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
        '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
        '\u{17D8}' => (3, WidthInfo::DEFAULT),
        '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
        '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
        '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
        '\u{FE00}'..='\u{FE02}' => (0, WidthInfo::VARIATION_SELECTOR_1_2_OR_3),
        '\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16),
        '\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I),
        '\u{16D67}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E),
        '\u{16D68}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI),
        '\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR),
        '\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER),
        // Every remaining special-cased character is reported as width 2.
        _ => (2, WidthInfo::EMOJI_PRESENTATION),
    }
}
124 changes: 124 additions & 0 deletions src/gen/props.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
// Copyright 2012-2025 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
use crate::tables::*;
use core::cmp::Ordering;

/// Returns `true` if `c` has `Joining_Group=Lam`.
///
/// The accepted code points are U+0644, U+06B5..=U+06B8, U+076A, U+08A6 and U+08C7.
pub fn is_joining_group_lam(c: char) -> bool {
    let code = u32::from(c);
    code == 0x644
        || (0x6B5..=0x6B8).contains(&code)
        || code == 0x76A
        || code == 0x8A6
        || code == 0x8C7
}

/// Returns `true` if `c` is a default-ignorable combining mark or ZWJ.
///
/// Such characters do not interrupt non-Arabic ligatures.
pub fn is_ligature_transparent(c: char) -> bool {
    let code = u32::from(c);
    code == 0x34F
        || (0x17B4..=0x17B5).contains(&code)
        || (0x180B..=0x180D).contains(&code)
        || code == 0x180F
        // U+200D is the zero width joiner (ZWJ).
        || code == 0x200D
        || (0xFE00..=0xFE0F).contains(&code)
        || (0xE0100..=0xE01EF).contains(&code)
}

/// Returns `true` if `c` forms an [emoji presentation sequence] when followed by
/// `'\u{FE0F}'`.
///
/// Emoji presentation sequences are considered to have width 2.
///
/// The lookup is a two-level bitset: bits 10 and above of the code point choose one of
/// the rows of `EMOJI_PRESENTATION_LEAVES` (or reject the character outright), bits
/// 3..=9 choose a byte within that row, and bits 0..=2 choose a bit within that byte.
///
/// [emoji presentation sequence]: https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence
#[inline]
pub fn starts_emoji_presentation_seq(c: char) -> bool {
    let code = u32::from(c);

    // First level: everything except the 10 least significant bits selects the leaf row.
    // Any block without a row contains no matching character.
    let row: usize = match code >> 10 {
        0x0 => 0,
        0x8 => 1,
        0x9 => 2,
        0xA => 3,
        0xC => 4,
        0x7C => 5,
        0x7D => 6,
        _ => return false,
    };

    // Second level: bits 3..=9 (seven bits) give the byte position inside the row.
    let byte_pos = usize::try_from((code >> 3) & 0x7F).unwrap();
    let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[row][byte_pos];

    // Third level: the 3 least significant bits give the bit position inside the byte.
    let bit_pos = code & 7;
    let flag = (leaf_byte >> bit_pos) & 1;
    flag == 1
}

/// Returns `true` if `c` has default emoji presentation, is not ideographic, and forms
/// a [text presentation sequence] when followed by `'\u{FE0E}'`.
///
/// Such sequences are considered to have width 1.
///
/// The upper bits of the code point (everything above the low byte) pick one of the
/// `TEXT_PRESENTATION_LEAF_*` tables; the low byte is then binary-searched among that
/// table's inclusive `(lo, hi)` ranges.
///
/// [text presentation sequence]: https://www.unicode.org/reports/tr51/#def_text_presentation_sequence
#[inline]
pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {
    let code = u32::from(c);

    // First level: all but the 8 least significant bits select the leaf table.
    // Blocks without a table contain no matching character.
    let ranges: &[(u8, u8)] = match code >> 8 {
        0x23 => &TEXT_PRESENTATION_LEAF_0,
        0x25 => &TEXT_PRESENTATION_LEAF_1,
        0x26 => &TEXT_PRESENTATION_LEAF_2,
        0x27 => &TEXT_PRESENTATION_LEAF_3,
        0x2B => &TEXT_PRESENTATION_LEAF_4,
        0x1F0 => &TEXT_PRESENTATION_LEAF_5,
        0x1F3 => &TEXT_PRESENTATION_LEAF_6,
        0x1F4 => &TEXT_PRESENTATION_LEAF_7,
        0x1F5 => &TEXT_PRESENTATION_LEAF_8,
        0x1F6 => &TEXT_PRESENTATION_LEAF_9,
        _ => return false,
    };

    // Second level: search for a range containing the low byte. The comparator reports
    // how a range sits relative to the byte: `Greater` if the range starts after it,
    // `Less` if the range ends before it, `Equal` if the byte falls inside.
    let low_byte = (code & 0xFF) as u8;
    let found = ranges.binary_search_by(|&(first, last)| match low_byte {
        b if b < first => Ordering::Greater,
        b if b > last => Ordering::Less,
        _ => Ordering::Equal,
    });
    found.is_ok()
}

/// Returns `true` if `c` is an `Emoji_Modifier_Base`.
///
/// The upper bits of the code point (everything above the low byte) pick one of the
/// `EMOJI_MODIFIER_LEAF_*` tables; the low byte is then binary-searched among that
/// table's inclusive `(lo, hi)` ranges.
#[inline]
pub fn is_emoji_modifier_base(c: char) -> bool {
    let code = u32::from(c);

    // First level: all but the 8 least significant bits select the leaf table.
    // Blocks without a table contain no matching character.
    let ranges: &[(u8, u8)] = match code >> 8 {
        0x26 => &EMOJI_MODIFIER_LEAF_0,
        0x27 => &EMOJI_MODIFIER_LEAF_1,
        0x1F3 => &EMOJI_MODIFIER_LEAF_2,
        0x1F4 => &EMOJI_MODIFIER_LEAF_3,
        0x1F5 => &EMOJI_MODIFIER_LEAF_4,
        0x1F6 => &EMOJI_MODIFIER_LEAF_5,
        0x1F9 => &EMOJI_MODIFIER_LEAF_6,
        0x1FA => &EMOJI_MODIFIER_LEAF_7,
        _ => return false,
    };

    // Second level: search for a range containing the low byte. The comparator reports
    // how a range sits relative to the byte: `Greater` if the range starts after it,
    // `Less` if the range ends before it, `Equal` if the byte falls inside.
    let low_byte = (code & 0xFF) as u8;
    let found = ranges.binary_search_by(|&(first, last)| match low_byte {
        b if b < first => Ordering::Greater,
        b if b > last => Ordering::Less,
        _ => Ordering::Equal,
    });
    found.is_ok()
}
Loading